author     Stephen Hines <srhines@google.com>   2014-02-11 20:01:10 -0800
committer  Stephen Hines <srhines@google.com>   2014-02-11 20:01:10 -0800
commit     ce9904c6ea8fd669978a8eefb854b330eb9828ff (patch)
tree       2418ee2e96ea220977c8fb74959192036ab5b133 /lib
parent     c27b10b198c1d9e9b51f2303994313ec2778edd7 (diff)
parent     dbb832b83351cec97b025b61c26536ef50c3181c (diff)
Merge remote-tracking branch 'upstream/release_34' into merge-20140211
Conflicts:
    lib/Linker/LinkModules.cpp
    lib/Support/Unix/Signals.inc

Change-Id: Ia54f291fa5dc828052d2412736e8495c1282aa64
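The two files listed under Conflicts: are the ones that needed manual resolution when upstream/release_34 was merged into the merge-20140211 branch. The exact commands used are not recorded on this page; the following is only a hypothetical sketch of how such a merge is typically performed, and the Gerrit commit-msg hook detail is an assumption based on the Change-Id line:

    # Hypothetical reconstruction of the merge workflow; only the branch
    # names upstream/release_34 and merge-20140211 come from the commit itself.
    git fetch upstream
    git checkout -b merge-20140211
    git merge upstream/release_34
    # resolve the two conflicting files by hand, then stage and commit:
    git add lib/Linker/LinkModules.cpp lib/Support/Unix/Signals.inc
    git commit   # in a Gerrit setup, the commit-msg hook appends the Change-Id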
Diffstat (limited to 'lib')
-rw-r--r--lib/Analysis/AliasSetTracker.cpp2
-rw-r--r--lib/Analysis/Analysis.cpp11
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp99
-rw-r--r--lib/Analysis/BlockFrequencyInfo.cpp113
-rw-r--r--lib/Analysis/BranchProbabilityInfo.cpp22
-rw-r--r--lib/Analysis/CFG.cpp128
-rw-r--r--lib/Analysis/CMakeLists.txt11
-rw-r--r--lib/Analysis/CaptureTracking.cpp6
-rw-r--r--lib/Analysis/ConstantFolding.cpp228
-rw-r--r--lib/Analysis/CostModel.cpp271
-rw-r--r--lib/Analysis/Delinearization.cpp133
-rw-r--r--lib/Analysis/DependenceAnalysis.cpp76
-rw-r--r--lib/Analysis/IPA/CallGraph.cpp230
-rw-r--r--lib/Analysis/IPA/CallGraphSCCPass.cpp2
-rw-r--r--lib/Analysis/IPA/GlobalsModRef.cpp2
-rw-r--r--lib/Analysis/IPA/IPA.cpp3
-rw-r--r--lib/Analysis/IPA/InlineCost.cpp147
-rw-r--r--lib/Analysis/InstructionSimplify.cpp20
-rw-r--r--lib/Analysis/Lint.cpp42
-rw-r--r--lib/Analysis/LoopInfo.cpp34
-rw-r--r--lib/Analysis/MemoryBuiltins.cpp97
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp11
-rw-r--r--lib/Analysis/PathNumbering.cpp521
-rw-r--r--lib/Analysis/PathProfileInfo.cpp433
-rw-r--r--lib/Analysis/PathProfileVerifier.cpp206
-rw-r--r--lib/Analysis/ProfileDataLoader.cpp155
-rw-r--r--lib/Analysis/ProfileDataLoaderPass.cpp188
-rw-r--r--lib/Analysis/ProfileEstimatorPass.cpp426
-rw-r--r--lib/Analysis/ProfileInfo.cpp1079
-rw-r--r--lib/Analysis/ProfileInfoLoader.cpp155
-rw-r--r--lib/Analysis/ProfileInfoLoaderPass.cpp267
-rw-r--r--lib/Analysis/ProfileVerifierPass.cpp383
-rw-r--r--lib/Analysis/RegionInfo.cpp6
-rw-r--r--lib/Analysis/ScalarEvolution.cpp1051
-rw-r--r--lib/Analysis/ScalarEvolutionExpander.cpp110
-rw-r--r--lib/Analysis/ScalarEvolutionNormalization.cpp18
-rw-r--r--lib/Analysis/TargetTransformInfo.cpp46
-rw-r--r--lib/Analysis/TypeBasedAliasAnalysis.cpp116
-rw-r--r--lib/Analysis/ValueTracking.cpp43
-rw-r--r--lib/AsmParser/LLLexer.cpp7
-rw-r--r--lib/AsmParser/LLParser.cpp50
-rw-r--r--lib/AsmParser/LLParser.h2
-rw-r--r--lib/AsmParser/LLToken.h7
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp960
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.h95
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp127
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.cpp5
-rw-r--r--lib/CMakeLists.txt1
-rw-r--r--lib/CodeGen/AggressiveAntiDepBreaker.cpp2
-rw-r--r--lib/CodeGen/Analysis.cpp66
-rw-r--r--lib/CodeGen/AsmPrinter/ARMException.cpp18
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp215
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp6
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/CMakeLists.txt1
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.cpp57
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.h89
-rw-r--r--lib/CodeGen/AsmPrinter/DIEHash.cpp507
-rw-r--r--lib/CodeGen/AsmPrinter/DIEHash.h147
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp85
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfAccelTable.h122
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCFIException.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp1216
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.h304
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.cpp1343
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.h118
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfException.h3
-rw-r--r--lib/CodeGen/BasicTargetTransformInfo.cpp27
-rw-r--r--lib/CodeGen/BranchFolding.h2
-rw-r--r--lib/CodeGen/CMakeLists.txt4
-rw-r--r--lib/CodeGen/CalcSpillWeights.cpp36
-rw-r--r--lib/CodeGen/CodeGen.cpp2
-rw-r--r--lib/CodeGen/DFAPacketizer.cpp3
-rw-r--r--lib/CodeGen/ExecutionDepsFix.cpp117
-rw-r--r--lib/CodeGen/ExpandPostRAPseudos.cpp2
-rw-r--r--lib/CodeGen/IfConversion.cpp215
-rw-r--r--lib/CodeGen/InlineSpiller.cpp160
-rw-r--r--lib/CodeGen/InterferenceCache.cpp8
-rw-r--r--lib/CodeGen/InterferenceCache.h2
-rw-r--r--lib/CodeGen/IntrinsicLowering.cpp5
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp14
-rw-r--r--lib/CodeGen/LiveDebugVariables.cpp53
-rw-r--r--lib/CodeGen/LiveDebugVariables.h4
-rw-r--r--lib/CodeGen/LiveInterval.cpp391
-rw-r--r--lib/CodeGen/LiveIntervalAnalysis.cpp238
-rw-r--r--lib/CodeGen/LiveRangeCalc.cpp56
-rw-r--r--lib/CodeGen/LiveRangeCalc.h38
-rw-r--r--lib/CodeGen/LiveRangeEdit.cpp43
-rw-r--r--lib/CodeGen/LiveRegMatrix.cpp6
-rw-r--r--lib/CodeGen/LiveRegUnits.cpp111
-rw-r--r--lib/CodeGen/MachineBasicBlock.cpp6
-rw-r--r--lib/CodeGen/MachineInstr.cpp35
-rw-r--r--lib/CodeGen/MachineLICM.cpp12
-rw-r--r--lib/CodeGen/MachineRegisterInfo.cpp7
-rw-r--r--lib/CodeGen/MachineScheduler.cpp707
-rw-r--r--lib/CodeGen/MachineSink.cpp30
-rw-r--r--lib/CodeGen/MachineVerifier.cpp275
-rw-r--r--lib/CodeGen/PHIElimination.cpp18
-rw-r--r--lib/CodeGen/PHIEliminationUtils.h2
-rw-r--r--lib/CodeGen/Passes.cpp19
-rw-r--r--lib/CodeGen/PeepholeOptimizer.cpp250
-rw-r--r--lib/CodeGen/PostRASchedulerList.cpp29
-rw-r--r--lib/CodeGen/ProcessImplicitDefs.cpp2
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp220
-rw-r--r--lib/CodeGen/PrologEpilogInserter.h99
-rw-r--r--lib/CodeGen/RegAllocBase.cpp14
-rw-r--r--lib/CodeGen/RegAllocBase.h5
-rw-r--r--lib/CodeGen/RegAllocBasic.cpp15
-rw-r--r--lib/CodeGen/RegAllocFast.cpp11
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp78
-rw-r--r--lib/CodeGen/RegAllocPBQP.cpp42
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp132
-rw-r--r--lib/CodeGen/RegisterPressure.cpp431
-rw-r--r--lib/CodeGen/ScheduleDAGInstrs.cpp78
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp1111
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp40
-rw-r--r--lib/CodeGen/SelectionDAG/InstrEmitter.cpp36
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp374
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp129
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp81
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp30
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h17
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp90
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp26
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp268
-rw-r--r--lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp13
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp16
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp506
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp742
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h246
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp9
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp261
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp119
-rw-r--r--lib/CodeGen/ShrinkWrapping.cpp1152
-rw-r--r--lib/CodeGen/SjLjEHPrepare.cpp215
-rw-r--r--lib/CodeGen/Spiller.cpp33
-rw-r--r--lib/CodeGen/SplitKit.cpp57
-rw-r--r--lib/CodeGen/StackColoring.cpp10
-rw-r--r--lib/CodeGen/StackMaps.cpp314
-rw-r--r--lib/CodeGen/StackProtector.cpp446
-rw-r--r--lib/CodeGen/StrongPHIElimination.cpp825
-rw-r--r--lib/CodeGen/TailDuplication.cpp2
-rw-r--r--lib/CodeGen/TargetInstrInfo.cpp52
-rw-r--r--lib/CodeGen/TargetLoweringBase.cpp65
-rw-r--r--lib/CodeGen/TargetLoweringObjectFileImpl.cpp38
-rw-r--r--lib/CodeGen/TargetOptionsImpl.cpp6
-rw-r--r--lib/CodeGen/TargetRegisterInfo.cpp8
-rw-r--r--lib/CodeGen/TargetSchedule.cpp10
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp4
-rw-r--r--lib/CodeGen/UnreachableBlockElim.cpp4
-rw-r--r--lib/CodeGen/VirtRegMap.cpp46
-rw-r--r--lib/DebugInfo/CMakeLists.txt2
-rw-r--r--lib/DebugInfo/DWARFAbbreviationDeclaration.cpp76
-rw-r--r--lib/DebugInfo/DWARFAbbreviationDeclaration.h29
-rw-r--r--lib/DebugInfo/DWARFAttribute.h30
-rw-r--r--lib/DebugInfo/DWARFCompileUnit.cpp264
-rw-r--r--lib/DebugInfo/DWARFCompileUnit.h111
-rw-r--r--lib/DebugInfo/DWARFContext.cpp266
-rw-r--r--lib/DebugInfo/DWARFContext.h85
-rw-r--r--lib/DebugInfo/DWARFDebugArangeSet.cpp49
-rw-r--r--lib/DebugInfo/DWARFDebugArangeSet.h5
-rw-r--r--lib/DebugInfo/DWARFDebugAranges.cpp206
-rw-r--r--lib/DebugInfo/DWARFDebugAranges.h92
-rw-r--r--lib/DebugInfo/DWARFDebugInfoEntry.cpp302
-rw-r--r--lib/DebugInfo/DWARFDebugInfoEntry.h73
-rw-r--r--lib/DebugInfo/DWARFDebugLine.cpp2
-rw-r--r--lib/DebugInfo/DWARFFormValue.cpp304
-rw-r--r--lib/DebugInfo/DWARFTypeUnit.cpp39
-rw-r--r--lib/DebugInfo/DWARFTypeUnit.h35
-rw-r--r--lib/DebugInfo/DWARFUnit.cpp365
-rw-r--r--lib/DebugInfo/DWARFUnit.h168
-rw-r--r--lib/ExecutionEngine/ExecutionEngine.cpp44
-rw-r--r--lib/ExecutionEngine/ExecutionEngineBindings.cpp45
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h2
-rw-r--r--lib/ExecutionEngine/Interpreter/Execution.cpp255
-rw-r--r--lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp31
-rw-r--r--lib/ExecutionEngine/Interpreter/Interpreter.h6
-rw-r--r--lib/ExecutionEngine/JIT/JIT.cpp156
-rw-r--r--lib/ExecutionEngine/JIT/JITMemoryManager.cpp7
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJIT.cpp288
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJIT.h235
-rw-r--r--lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp16
-rw-r--r--lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp66
-rw-r--r--lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp17
-rw-r--r--lib/ExecutionEngine/RTDyldMemoryManager.cpp222
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h1
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp134
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp312
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h55
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h104
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp167
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h32
-rw-r--r--lib/ExecutionEngine/TargetSelect.cpp2
-rw-r--r--lib/IR/AsmWriter.cpp9
-rw-r--r--lib/IR/AttributeImpl.h3
-rw-r--r--lib/IR/Attributes.cpp7
-rw-r--r--lib/IR/AutoUpgrade.cpp108
-rw-r--r--lib/IR/CMakeLists.txt3
-rw-r--r--lib/IR/ConstantFold.cpp80
-rw-r--r--lib/IR/Constants.cpp27
-rw-r--r--lib/IR/Core.cpp114
-rw-r--r--lib/IR/DIBuilder.cpp358
-rw-r--r--lib/IR/DataLayout.cpp7
-rw-r--r--lib/IR/DebugInfo.cpp478
-rw-r--r--lib/IR/Function.cpp48
-rw-r--r--lib/IR/GCOV.cpp287
-rw-r--r--lib/IR/Globals.cpp16
-rw-r--r--lib/IR/Instruction.cpp25
-rw-r--r--lib/IR/Instructions.cpp328
-rw-r--r--lib/IR/LLVMContextImpl.h5
-rw-r--r--lib/IR/LegacyPassManager.cpp1920
-rw-r--r--lib/IR/Metadata.cpp4
-rw-r--r--lib/IR/Module.cpp42
-rw-r--r--lib/IR/PassManager.cpp1983
-rw-r--r--lib/IR/Type.cpp6
-rw-r--r--lib/IR/TypeFinder.cpp31
-rw-r--r--lib/IR/Value.cpp47
-rw-r--r--lib/IR/ValueTypes.cpp10
-rw-r--r--lib/IR/Verifier.cpp174
-rw-r--r--lib/IRReader/IRReader.cpp32
-rw-r--r--lib/LLVMBuild.txt2
-rw-r--r--lib/LTO/CMakeLists.txt4
-rw-r--r--lib/LTO/LLVMBuild.txt22
-rw-r--r--lib/LTO/LTOCodeGenerator.cpp521
-rw-r--r--lib/LTO/LTOModule.cpp794
-rw-r--r--lib/LTO/Makefile15
-rw-r--r--lib/Linker/LinkModules.cpp43
-rw-r--r--lib/MC/CMakeLists.txt2
-rw-r--r--lib/MC/ELFObjectWriter.cpp66
-rw-r--r--lib/MC/MCAsmInfo.cpp10
-rw-r--r--lib/MC/MCAsmInfoCOFF.cpp1
-rw-r--r--lib/MC/MCAsmInfoDarwin.cpp2
-rw-r--r--lib/MC/MCAsmInfoELF.cpp23
-rw-r--r--lib/MC/MCAsmStreamer.cpp169
-rw-r--r--lib/MC/MCAtom.cpp10
-rw-r--r--lib/MC/MCContext.cpp62
-rw-r--r--lib/MC/MCDisassembler/Disassembler.cpp142
-rw-r--r--lib/MC/MCDisassembler/Disassembler.h9
-rw-r--r--lib/MC/MCDwarf.cpp23
-rw-r--r--lib/MC/MCELF.cpp7
-rw-r--r--lib/MC/MCELFObjectTargetWriter.cpp6
-rw-r--r--lib/MC/MCELFStreamer.cpp74
-rw-r--r--lib/MC/MCExternalSymbolizer.cpp27
-rw-r--r--lib/MC/MCFunction.cpp44
-rw-r--r--lib/MC/MCInstPrinter.cpp8
-rw-r--r--lib/MC/MCMachOStreamer.cpp33
-rw-r--r--lib/MC/MCModule.cpp55
-rw-r--r--lib/MC/MCModuleYAML.cpp461
-rw-r--r--lib/MC/MCNullStreamer.cpp15
-rw-r--r--lib/MC/MCObjectDisassembler.cpp422
-rw-r--r--lib/MC/MCObjectFileInfo.cpp36
-rw-r--r--lib/MC/MCObjectStreamer.cpp12
-rw-r--r--lib/MC/MCObjectSymbolizer.cpp296
-rw-r--r--lib/MC/MCParser/AsmLexer.cpp58
-rw-r--r--lib/MC/MCParser/AsmParser.cpp1625
-rw-r--r--lib/MC/MCParser/COFFAsmParser.cpp106
-rw-r--r--lib/MC/MCParser/DarwinAsmParser.cpp41
-rw-r--r--lib/MC/MCParser/ELFAsmParser.cpp121
-rw-r--r--lib/MC/MCPureStreamer.cpp16
-rw-r--r--lib/MC/MCSectionELF.cpp57
-rw-r--r--lib/MC/MCStreamer.cpp103
-rw-r--r--lib/MC/MCSubtargetInfo.cpp9
-rw-r--r--lib/MC/MCSymbol.cpp17
-rw-r--r--lib/MC/MCWin64EH.cpp50
-rw-r--r--lib/MC/MachObjectWriter.cpp117
-rw-r--r--lib/MC/SubtargetFeature.cpp8
-rw-r--r--lib/MC/WinCOFFObjectWriter.cpp43
-rw-r--r--lib/MC/WinCOFFStreamer.cpp27
-rw-r--r--lib/Makefile7
-rw-r--r--lib/Object/Binary.cpp4
-rw-r--r--lib/Object/CMakeLists.txt1
-rw-r--r--lib/Object/COFFObjectFile.cpp308
-rw-r--r--lib/Object/ELF.cpp714
-rw-r--r--lib/Object/ELFObjectFile.cpp3
-rw-r--r--lib/Object/ELFYAML.cpp1
-rw-r--r--lib/Object/MachOObjectFile.cpp817
-rw-r--r--lib/Object/MachOUniversal.cpp58
-rw-r--r--lib/Object/ObjectFile.cpp2
-rw-r--r--lib/Object/YAML.cpp1
-rw-r--r--lib/Option/OptTable.cpp73
-rw-r--r--lib/Option/Option.cpp11
-rw-r--r--lib/Support/APFloat.cpp13
-rw-r--r--lib/Support/Allocator.cpp7
-rw-r--r--lib/Support/BlockFrequency.cpp98
-rw-r--r--lib/Support/CMakeLists.txt1
-rw-r--r--lib/Support/CommandLine.cpp3
-rw-r--r--lib/Support/Compression.cpp7
-rw-r--r--lib/Support/ConstantRange.cpp8
-rw-r--r--lib/Support/CrashRecoveryContext.cpp33
-rw-r--r--lib/Support/Dwarf.cpp61
-rw-r--r--lib/Support/DynamicLibrary.cpp41
-rw-r--r--lib/Support/Errno.cpp29
-rw-r--r--lib/Support/ErrorHandling.cpp17
-rw-r--r--lib/Support/GraphWriter.cpp3
-rw-r--r--lib/Support/Host.cpp161
-rw-r--r--lib/Support/Locale.cpp33
-rw-r--r--lib/Support/LocaleGeneric.inc382
-rw-r--r--lib/Support/LocaleWindows.inc15
-rw-r--r--lib/Support/LocaleXlocale.inc61
-rw-r--r--lib/Support/MemoryBuffer.cpp35
-rw-r--r--lib/Support/Path.cpp40
-rw-r--r--lib/Support/PrettyStackTrace.cpp58
-rw-r--r--lib/Support/Process.cpp18
-rw-r--r--lib/Support/Program.cpp36
-rw-r--r--lib/Support/Regex.cpp2
-rw-r--r--lib/Support/SmallPtrSet.cpp9
-rw-r--r--lib/Support/SourceMgr.cpp13
-rw-r--r--lib/Support/StringRef.cpp31
-rw-r--r--lib/Support/TargetRegistry.cpp6
-rw-r--r--lib/Support/ThreadLocal.cpp2
-rw-r--r--lib/Support/Triple.cpp2
-rw-r--r--lib/Support/Unicode.cpp367
-rw-r--r--lib/Support/Unix/Path.inc17
-rw-r--r--lib/Support/Unix/Process.inc116
-rw-r--r--lib/Support/Unix/Program.inc127
-rw-r--r--lib/Support/Unix/Signals.inc4
-rw-r--r--lib/Support/Unix/ThreadLocal.inc2
-rw-r--r--lib/Support/Unix/Unix.h4
-rw-r--r--lib/Support/Windows/DynamicLibrary.inc15
-rw-r--r--lib/Support/Windows/Memory.inc4
-rw-r--r--lib/Support/Windows/Path.inc217
-rw-r--r--lib/Support/Windows/Process.inc114
-rw-r--r--lib/Support/Windows/Program.inc241
-rw-r--r--lib/Support/Windows/RWMutex.inc4
-rw-r--r--lib/Support/Windows/Signals.inc2
-rw-r--r--lib/Support/Windows/TimeValue.inc10
-rw-r--r--lib/Support/Windows/Windows.h27
-rw-r--r--lib/Support/YAMLParser.cpp111
-rw-r--r--lib/Support/YAMLTraits.cpp94
-rw-r--r--lib/Support/raw_ostream.cpp3
-rw-r--r--lib/TableGen/Record.cpp31
-rw-r--r--lib/TableGen/TGParser.cpp3
-rw-r--r--lib/Target/AArch64/AArch64.td7
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp130
-rw-r--r--lib/Target/AArch64/AArch64CallingConv.td9
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp1027
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp1183
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h86
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td451
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.cpp92
-rw-r--r--lib/Target/AArch64/AArch64InstrInfo.td50
-rw-r--r--lib/Target/AArch64/AArch64InstrNEON.td7329
-rw-r--r--lib/Target/AArch64/AArch64MCInstLower.cpp2
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td164
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.cpp26
-rw-r--r--lib/Target/AArch64/AArch64Subtarget.h19
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp353
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp762
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp54
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h7
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp4
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp11
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp5
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h10
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp56
-rw-r--r--lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h5
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h59
-rw-r--r--lib/Target/ARM/A15SDOptimizer.cpp7
-rw-r--r--lib/Target/ARM/ARM.td81
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp425
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp299
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h20
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp44
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h12
-rw-r--r--lib/Target/ARM/ARMBuildAttrs.h71
-rw-r--r--lib/Target/ARM/ARMCallingConv.td22
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.cpp50
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h33
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp75
-rw-r--r--lib/Target/ARM/ARMFPUName.def32
-rw-r--r--lib/Target/ARM/ARMFPUName.h26
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp104
-rw-r--r--lib/Target/ARM/ARMFeatures.h93
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp150
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp387
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp1026
-rw-r--r--lib/Target/ARM/ARMISelLowering.h29
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td76
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp17
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td603
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td227
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td57
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td430
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td165
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp82
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp2
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h61
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td3
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td132
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td13
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp68
-rw-r--r--lib/Target/ARM/ARMSubtarget.h59
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp7
-rw-r--r--lib/Target/ARM/ARMTargetObjectFile.cpp2
-rw-r--r--lib/Target/ARM/ARMTargetTransformInfo.cpp94
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp516
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp89
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp167
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp36
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h18
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp540
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h27
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp21
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp64
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h12
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp129
-rw-r--r--lib/Target/ARM/Thumb1FrameLowering.cpp30
-rw-r--r--lib/Target/ARM/Thumb1RegisterInfo.cpp8
-rw-r--r--lib/Target/ARM/Thumb2ITBlockPass.cpp63
-rw-r--r--lib/Target/ARM/Thumb2InstrInfo.cpp17
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp25
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt1
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonHardwareLoops.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonISelDAGToDAG.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp37
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.h7
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormats.td2
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h1
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfoV4.td1
-rw-r--r--lib/Target/Hexagon/HexagonMCInstLower.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp16
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.h6
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp14
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h2
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp8
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h4
-rw-r--r--lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp24
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.h2
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp16
-rw-r--r--lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp1
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h2
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp4
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h5
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp3
-rw-r--r--lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h4
-rw-r--r--lib/Target/MSP430/MSP430AsmPrinter.cpp2
-rw-r--r--lib/Target/MSP430/MSP430CallingConv.td7
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.h4
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp1
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp128
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.cpp5
-rw-r--r--lib/Target/MSP430/MSP430InstrInfo.h1
-rw-r--r--lib/Target/MSP430/MSP430MCInstLower.cpp2
-rw-r--r--lib/Target/Mangler.cpp113
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp1314
-rw-r--r--lib/Target/Mips/CMakeLists.txt1
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp347
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp20
-rw-r--r--lib/Target/Mips/InstPrinter/MipsInstPrinter.h2
-rw-r--r--lib/Target/Mips/MCTargetDesc/CMakeLists.txt2
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp43
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp39
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp93
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h43
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h39
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h4
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp140
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp39
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h16
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp67
-rw-r--r--lib/Target/Mips/MSA.txt78
-rw-r--r--lib/Target/Mips/MicroMipsInstrFormats.td196
-rw-r--r--lib/Target/Mips/MicroMipsInstrInfo.td182
-rw-r--r--lib/Target/Mips/Mips.h1
-rw-r--r--lib/Target/Mips/Mips.td2
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.h2
-rw-r--r--lib/Target/Mips/Mips16HardFloat.cpp64
-rw-r--r--lib/Target/Mips/Mips16ISelDAGToDAG.cpp9
-rw-r--r--lib/Target/Mips/Mips16ISelLowering.cpp61
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp62
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h2
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td148
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td213
-rw-r--r--lib/Target/Mips/MipsAnalyzeImmediate.h10
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp118
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.h24
-rw-r--r--lib/Target/Mips/MipsCallingConv.td20
-rw-r--r--lib/Target/Mips/MipsCodeEmitter.cpp37
-rw-r--r--lib/Target/Mips/MipsCondMov.td81
-rw-r--r--lib/Target/Mips/MipsConstantIslandPass.cpp1469
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td468
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp29
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp78
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.h38
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp562
-rw-r--r--lib/Target/Mips/MipsISelLowering.h151
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td326
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td66
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp11
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h1
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td661
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp9
-rw-r--r--lib/Target/Mips/MipsMCInstLower.cpp5
-rw-r--r--lib/Target/Mips/MipsMCInstLower.h4
-rw-r--r--lib/Target/Mips/MipsMSAInstrFormats.td406
-rw-r--r--lib/Target/Mips/MipsMSAInstrInfo.td3694
-rw-r--r--lib/Target/Mips/MipsMachineFunction.cpp66
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h103
-rw-r--r--lib/Target/Mips/MipsOs16.cpp45
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp57
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h7
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td219
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp93
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.h2
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp429
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h44
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.cpp2269
-rw-r--r--lib/Target/Mips/MipsSEISelLowering.h48
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp266
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h14
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp78
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp24
-rw-r--r--lib/Target/Mips/MipsSubtarget.h19
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp14
-rw-r--r--lib/Target/Mips/MipsTargetStreamer.h44
-rw-r--r--lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp10
-rw-r--r--lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h3
-rw-r--r--lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp215
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h16
-rw-r--r--lib/Target/NVPTX/NVPTXGenericToNVVM.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp17
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp101
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h1
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp6
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.h1
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.td14
-rw-r--r--lib/Target/NVPTX/NVPTXSection.h4
-rw-r--r--lib/Target/NVPTX/NVPTXSplitBBatBar.cpp2
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.cpp3
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h2
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp28
-rw-r--r--lib/Target/NVPTX/NVPTXTargetObjectFile.h24
-rw-r--r--lib/Target/NVPTX/NVVMReflect.cpp4
-rw-r--r--lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp44
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp22
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp44
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp6
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h3
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp9
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp49
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h7
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp389
-rw-r--r--lib/Target/PowerPC/PPC.td51
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp58
-rw-r--r--lib/Target/PowerPC/PPCCTRLoops.cpp14
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td31
-rw-r--r--lib/Target/PowerPC/PPCFastISel.cpp1946
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp540
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp4
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp184
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h2
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td41
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td30
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td33
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp9
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.h1
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td117
-rw-r--r--lib/Target/PowerPC/PPCMCInstLower.cpp4
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp10
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h4
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td8
-rw-r--r--lib/Target/PowerPC/PPCScheduleA2.td841
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500mc.td2
-rw-r--r--lib/Target/PowerPC/PPCScheduleE5500.td1
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp54
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h15
-rw-r--r--lib/Target/PowerPC/PPCTargetStreamer.h23
-rw-r--r--lib/Target/PowerPC/PPCTargetTransformInfo.cpp9
-rw-r--r--lib/Target/R600/AMDGPU.h3
-rw-r--r--lib/Target/R600/AMDGPU.td12
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.cpp67
-rw-r--r--lib/Target/R600/AMDGPUAsmPrinter.h12
-rw-r--r--lib/Target/R600/AMDGPUCallingConv.td35
-rw-r--r--lib/Target/R600/AMDGPUISelDAGToDAG.cpp536
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.cpp360
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.h37
-rw-r--r--lib/Target/R600/AMDGPUIndirectAddressing.cpp345
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.cpp117
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.h45
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.td14
-rw-r--r--lib/Target/R600/AMDGPUInstructions.td192
-rw-r--r--lib/Target/R600/AMDGPUMCInstLower.cpp43
-rw-r--r--lib/Target/R600/AMDGPUMachineFunction.cpp3
-rw-r--r--lib/Target/R600/AMDGPUMachineFunction.h5
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.cpp32
-rw-r--r--lib/Target/R600/AMDGPURegisterInfo.h8
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.cpp15
-rw-r--r--lib/Target/R600/AMDGPUSubtarget.h11
-rw-r--r--lib/Target/R600/AMDGPUTargetMachine.cpp43
-rw-r--r--lib/Target/R600/AMDILCFGStructurizer.cpp112
-rw-r--r--lib/Target/R600/AMDILInstrInfo.td10
-rw-r--r--lib/Target/R600/CMakeLists.txt3
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp82
-rw-r--r--lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h2
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp4
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp6
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp21
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h1
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp2
-rw-r--r--lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h4
-rw-r--r--lib/Target/R600/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp1
-rw-r--r--lib/Target/R600/Processors.td7
-rw-r--r--lib/Target/R600/R600ClauseMergePass.cpp204
-rw-r--r--lib/Target/R600/R600ControlFlowFinalizer.cpp13
-rw-r--r--lib/Target/R600/R600Defines.h4
-rw-r--r--lib/Target/R600/R600EmitClauseMarkers.cpp80
-rw-r--r--lib/Target/R600/R600ExpandSpecialInstrs.cpp17
-rw-r--r--lib/Target/R600/R600ISelLowering.cpp609
-rw-r--r--lib/Target/R600/R600ISelLowering.h3
-rw-r--r--lib/Target/R600/R600InstrFormats.td6
-rw-r--r--lib/Target/R600/R600InstrInfo.cpp199
-rw-r--r--lib/Target/R600/R600InstrInfo.h47
-rw-r--r--lib/Target/R600/R600Instructions.td342
-rw-r--r--lib/Target/R600/R600Intrinsics.td7
-rw-r--r--lib/Target/R600/R600MachineFunctionInfo.cpp6
-rw-r--r--lib/Target/R600/R600MachineFunctionInfo.h1
-rw-r--r--lib/Target/R600/R600MachineScheduler.cpp69
-rw-r--r--lib/Target/R600/R600MachineScheduler.h7
-rw-r--r--lib/Target/R600/R600OptimizeVectorRegisters.cpp3
-rw-r--r--lib/Target/R600/R600Packetizer.cpp52
-rw-r--r--lib/Target/R600/R600RegisterInfo.cpp44
-rw-r--r--lib/Target/R600/R600RegisterInfo.h8
-rw-r--r--lib/Target/R600/R600RegisterInfo.td45
-rw-r--r--lib/Target/R600/R600TextureIntrinsicsReplacer.cpp24
-rw-r--r--lib/Target/R600/SIDefines.h12
-rw-r--r--lib/Target/R600/SIFixSGPRCopies.cpp153
-rw-r--r--lib/Target/R600/SIISelLowering.cpp391
-rw-r--r--lib/Target/R600/SIISelLowering.h10
-rw-r--r--lib/Target/R600/SIInsertWaits.cpp21
-rw-r--r--lib/Target/R600/SIInstrFormats.td30
-rw-r--r--lib/Target/R600/SIInstrInfo.cpp512
-rw-r--r--lib/Target/R600/SIInstrInfo.h80
-rw-r--r--lib/Target/R600/SIInstrInfo.td174
-rw-r--r--lib/Target/R600/SIInstructions.td572
-rw-r--r--lib/Target/R600/SIIntrinsics.td24
-rw-r--r--lib/Target/R600/SILowerControlFlow.cpp17
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.cpp4
-rw-r--r--lib/Target/R600/SIMachineFunctionInfo.h1
-rw-r--r--lib/Target/R600/SIRegisterInfo.cpp59
-rw-r--r--lib/Target/R600/SIRegisterInfo.h18
-rw-r--r--lib/Target/R600/SIRegisterInfo.td16
-rw-r--r--lib/Target/R600/SITypeRewriter.cpp162
-rw-r--r--lib/Target/Sparc/CMakeLists.txt3
-rw-r--r--lib/Target/Sparc/DelaySlotFiller.cpp20
-rw-r--r--lib/Target/Sparc/LLVMBuild.txt1
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h22
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp13
-rw-r--r--lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h4
-rw-r--r--lib/Target/Sparc/Makefile3
-rw-r--r--lib/Target/Sparc/Sparc.h19
-rw-r--r--lib/Target/Sparc/Sparc.td4
-rw-r--r--lib/Target/Sparc/SparcAsmPrinter.cpp78
-rw-r--r--lib/Target/Sparc/SparcCallingConv.td11
-rw-r--r--lib/Target/Sparc/SparcCodeEmitter.cpp245
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp104
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.h8
-rw-r--r--lib/Target/Sparc/SparcISelDAGToDAG.cpp10
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp994
-rw-r--r--lib/Target/Sparc/SparcISelLowering.h35
-rw-r--r--lib/Target/Sparc/SparcInstr64Bit.td66
-rw-r--r--lib/Target/Sparc/SparcInstrFormats.td85
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.cpp104
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.h1
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td451
-rw-r--r--lib/Target/Sparc/SparcJITInfo.cpp165
-rw-r--r--lib/Target/Sparc/SparcJITInfo.h67
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.cpp133
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.h7
-rw-r--r--lib/Target/Sparc/SparcRegisterInfo.td67
-rw-r--r--lib/Target/Sparc/SparcRelocations.h41
-rw-r--r--lib/Target/Sparc/SparcSubtarget.cpp3
-rw-r--r--lib/Target/Sparc/SparcSubtarget.h2
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp7
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.h6
-rw-r--r--lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp8
-rw-r--r--lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp16
-rw-r--r--lib/Target/SystemZ/CMakeLists.txt2
-rw-r--r--lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp19
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp12
-rw-r--r--lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h1
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp48
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp2
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h4
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp8
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp35
-rw-r--r--lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h25
-rw-r--r--lib/Target/SystemZ/README.txt48
-rw-r--r--lib/Target/SystemZ/SystemZ.h24
-rw-r--r--lib/Target/SystemZ/SystemZAsmPrinter.cpp134
-rw-r--r--lib/Target/SystemZ/SystemZCallingConv.td4
-rw-r--r--lib/Target/SystemZ/SystemZConstantPoolValue.cpp2
-rw-r--r--lib/Target/SystemZ/SystemZFrameLowering.cpp5
-rw-r--r--lib/Target/SystemZ/SystemZISelDAGToDAG.cpp296
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.cpp1187
-rw-r--r--lib/Target/SystemZ/SystemZISelLowering.h109
-rw-r--r--lib/Target/SystemZ/SystemZInstrFP.td93
-rw-r--r--lib/Target/SystemZ/SystemZInstrFormats.td241
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.cpp502
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.h28
-rw-r--r--lib/Target/SystemZ/SystemZInstrInfo.td721
-rw-r--r--lib/Target/SystemZ/SystemZLongBranch.cpp23
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.cpp102
-rw-r--r--lib/Target/SystemZ/SystemZMCInstLower.h13
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp17
-rw-r--r--lib/Target/SystemZ/SystemZMachineFunctionInfo.h1
-rw-r--r--lib/Target/SystemZ/SystemZOperands.td30
-rw-r--r--lib/Target/SystemZ/SystemZOperators.td155
-rw-r--r--lib/Target/SystemZ/SystemZPatterns.td93
-rw-r--r--lib/Target/SystemZ/SystemZProcessors.td16
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.cpp6
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.h8
-rw-r--r--lib/Target/SystemZ/SystemZRegisterInfo.td66
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp212
-rw-r--r--lib/Target/SystemZ/SystemZSelectionDAGInfo.h36
-rw-r--r--lib/Target/SystemZ/SystemZShortenInst.cpp163
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.cpp13
-rw-r--r--lib/Target/SystemZ/SystemZSubtarget.h8
-rw-r--r--lib/Target/SystemZ/SystemZTargetMachine.cpp9
-rw-r--r--lib/Target/Target.cpp8
-rw-r--r--lib/Target/TargetLibraryInfo.cpp37
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp14
-rw-r--r--lib/Target/TargetMachine.cpp5
-rw-r--r--lib/Target/TargetMachineC.cpp54
-rw-r--r--lib/Target/TargetSubtargetInfo.cpp19
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp410
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp31
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.c244
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h32
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h69
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp16
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h16
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp16
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h20
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp401
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h3
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp9
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp6
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h3
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp30
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp2
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h6
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp24
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp177
-rw-r--r--lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp2
-rw-r--r--lib/Target/X86/X86.td57
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp44
-rw-r--r--lib/Target/X86/X86AsmPrinter.h15
-rw-r--r--lib/Target/X86/X86CallingConv.h35
-rw-r--r--lib/Target/X86/X86CallingConv.td57
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp36
-rw-r--r--lib/Target/X86/X86FastISel.cpp311
-rw-r--r--lib/Target/X86/X86FixupLEAs.cpp9
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp268
-rw-r--r--lib/Target/X86/X86FrameLowering.h27
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp66
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp1800
-rw-r--r--lib/Target/X86/X86ISelLowering.h50
-rw-r--r--lib/Target/X86/X86InstrAVX512.td2815
-rw-r--r--lib/Target/X86/X86InstrArithmetic.td84
-rw-r--r--lib/Target/X86/X86InstrCompiler.td24
-rw-r--r--lib/Target/X86/X86InstrControl.td5
-rw-r--r--lib/Target/X86/X86InstrExtension.td12
-rw-r--r--lib/Target/X86/X86InstrFMA.td77
-rw-r--r--lib/Target/X86/X86InstrFormats.td39
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td78
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp397
-rw-r--r--lib/Target/X86/X86InstrInfo.h4
-rw-r--r--lib/Target/X86/X86InstrInfo.td344
-rw-r--r--lib/Target/X86/X86InstrMMX.td64
-rw-r--r--lib/Target/X86/X86InstrSSE.td1554
-rw-r--r--lib/Target/X86/X86InstrXOP.td146
-rw-r--r--lib/Target/X86/X86JITInfo.cpp2
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp184
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp19
-rw-r--r--lib/Target/X86/X86RegisterInfo.h4
-rw-r--r--lib/Target/X86/X86SchedHaswell.td4
-rw-r--r--lib/Target/X86/X86SchedSandyBridge.td4
-rw-r--r--lib/Target/X86/X86Schedule.td48
-rw-r--r--lib/Target/X86/X86ScheduleAtom.td24
-rw-r--r--lib/Target/X86/X86ScheduleSLM.td668
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp2
-rw-r--r--lib/Target/X86/X86Subtarget.cpp59
-rw-r--r--lib/Target/X86/X86Subtarget.h32
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp6
-rw-r--r--lib/Target/X86/X86TargetObjectFile.cpp4
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp149
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp34
-rw-r--r--lib/Target/XCore/CMakeLists.txt1
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp8
-rw-r--r--lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h4
-rw-r--r--lib/Target/XCore/XCore.h2
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp15
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp44
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp42
-rw-r--r--lib/Target/XCore/XCoreISelLowering.h10
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.cpp6
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.h1
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td9
-rw-r--r--lib/Target/XCore/XCoreLowerThreadLocal.cpp114
-rw-r--r--lib/Target/XCore/XCoreMCInstLower.cpp2
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp8
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h2
-rw-r--r--lib/Target/XCore/XCoreTargetTransformInfo.cpp83
-rw-r--r--lib/Transforms/Hello/Hello.cpp2
-rw-r--r--lib/Transforms/IPO/ArgumentPromotion.cpp10
-rw-r--r--lib/Transforms/IPO/ConstantMerge.cpp11
-rw-r--r--lib/Transforms/IPO/DeadArgumentElimination.cpp13
-rw-r--r--lib/Transforms/IPO/ExtractGV.cpp54
-rw-r--r--lib/Transforms/IPO/FunctionAttrs.cpp4
-rw-r--r--lib/Transforms/IPO/GlobalDCE.cpp3
-rw-r--r--lib/Transforms/IPO/GlobalOpt.cpp270
-rw-r--r--lib/Transforms/IPO/InlineAlways.cpp2
-rw-r--r--lib/Transforms/IPO/InlineSimple.cpp4
-rw-r--r--lib/Transforms/IPO/Internalize.cpp118
-rw-r--r--lib/Transforms/IPO/MergeFunctions.cpp77
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp92
-rw-r--r--lib/Transforms/IPO/PruneEH.cpp2
-rw-r--r--lib/Transforms/IPO/StripSymbols.cpp226
-rw-r--r--lib/Transforms/InstCombine/InstCombine.h13
-rw-r--r--lib/Transforms/InstCombine/InstCombineAndOrXor.cpp184
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp34
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp156
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp119
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp30
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp51
-rw-r--r--lib/Transforms/InstCombine/InstCombinePHI.cpp14
-rw-r--r--lib/Transforms/InstCombine/InstCombineSelect.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp26
-rw-r--r--lib/Transforms/InstCombine/InstCombineVectorOps.cpp38
-rw-r--r--lib/Transforms/InstCombine/InstCombineWorklist.h7
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp112
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp240
-rw-r--r--lib/Transforms/Instrumentation/BoundsChecking.cpp6
-rw-r--r--lib/Transforms/Instrumentation/CMakeLists.txt5
-rw-r--r--lib/Transforms/Instrumentation/DataFlowSanitizer.cpp1397
-rw-r--r--lib/Transforms/Instrumentation/DebugIR.cpp3
-rw-r--r--lib/Transforms/Instrumentation/EdgeProfiling.cpp117
-rw-r--r--lib/Transforms/Instrumentation/GCOVProfiling.cpp25
-rw-r--r--lib/Transforms/Instrumentation/Instrumentation.cpp4
-rw-r--r--lib/Transforms/Instrumentation/MemorySanitizer.cpp443
-rw-r--r--lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp225
-rw-r--r--lib/Transforms/Instrumentation/PathProfiling.cpp1424
-rw-r--r--lib/Transforms/Instrumentation/ProfilingUtils.cpp169
-rw-r--r--lib/Transforms/Instrumentation/ProfilingUtils.h36
-rw-r--r--lib/Transforms/Instrumentation/ThreadSanitizer.cpp12
-rw-r--r--lib/Transforms/ObjCARC/ObjCARCOpts.cpp242
-rw-r--r--lib/Transforms/Scalar/BasicBlockPlacement.cpp152
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt4
-rw-r--r--lib/Transforms/Scalar/CodeGenPrepare.cpp22
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp10
-rw-r--r--lib/Transforms/Scalar/GVN.cpp197
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp20
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp7
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp50
-rw-r--r--lib/Transforms/Scalar/LoopRerollPass.cpp1184
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp21
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp42
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp127
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp13
-rw-r--r--lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp (renamed from lib/Target/Mips/MipsOptimizeMathLibCalls.cpp)83
-rw-r--r--lib/Transforms/Scalar/SROA.cpp155
-rw-r--r--lib/Transforms/Scalar/SampleProfile.cpp479
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp12
-rw-r--r--lib/Transforms/Scalar/ScalarReplAggregates.cpp4
-rw-r--r--lib/Transforms/Scalar/SimplifyCFGPass.cpp165
-rw-r--r--lib/Transforms/Scalar/StructurizeCFG.cpp51
-rw-r--r--lib/Transforms/Utils/BasicBlockUtils.cpp9
-rw-r--r--lib/Transforms/Utils/BreakCriticalEdges.cpp9
-rw-r--r--lib/Transforms/Utils/CMakeLists.txt1
-rw-r--r--lib/Transforms/Utils/CodeExtractor.cpp3
-rw-r--r--lib/Transforms/Utils/FlattenCFG.cpp6
-rw-r--r--lib/Transforms/Utils/GlobalStatus.cpp183
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp3
-rw-r--r--lib/Transforms/Utils/LCSSA.cpp15
-rw-r--r--lib/Transforms/Utils/Local.cpp208
-rw-r--r--lib/Transforms/Utils/LoopUnroll.cpp8
-rw-r--r--lib/Transforms/Utils/LowerExpectIntrinsic.cpp2
-rw-r--r--lib/Transforms/Utils/LowerInvoke.cpp1
-rw-r--r--lib/Transforms/Utils/LowerSwitch.cpp62
-rw-r--r--lib/Transforms/Utils/Mem2Reg.cpp7
-rw-r--r--lib/Transforms/Utils/PromoteMemoryToRegister.cpp298
-rw-r--r--lib/Transforms/Utils/SSAUpdater.cpp6
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp134
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp271
-rw-r--r--lib/Transforms/Utils/SpecialCaseList.cpp97
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp78
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp1119
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp1097
902 files changed, 86330 insertions, 36243 deletions
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 5910526..2289c12 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -299,7 +299,6 @@ bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
bool AliasSetTracker::add(LoadInst *LI) {
if (LI->getOrdering() > Monotonic) return addUnknown(LI);
AliasSet::AccessType ATy = AliasSet::Refs;
- if (!LI->isUnordered()) ATy = AliasSet::ModRef;
bool NewPtr;
AliasSet &AS = addPointer(LI->getOperand(0),
AA.getTypeStoreSize(LI->getType()),
@@ -312,7 +311,6 @@ bool AliasSetTracker::add(LoadInst *LI) {
bool AliasSetTracker::add(StoreInst *SI) {
if (SI->getOrdering() > Monotonic) return addUnknown(SI);
AliasSet::AccessType ATy = AliasSet::Mods;
- if (!SI->isUnordered()) ATy = AliasSet::ModRef;
bool NewPtr;
Value *Val = SI->getOperand(0);
AliasSet &AS = addPointer(SI->getOperand(1),
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 349c417..98f2a55 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -34,6 +34,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeCFGOnlyViewerPass(Registry);
initializeCFGOnlyPrinterPass(Registry);
initializeDependenceAnalysisPass(Registry);
+ initializeDelinearizationPass(Registry);
initializeDominanceFrontierPass(Registry);
initializeDomViewerPass(Registry);
initializeDomPrinterPass(Registry);
@@ -54,16 +55,6 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeMemoryDependenceAnalysisPass(Registry);
initializeModuleDebugInfoPrinterPass(Registry);
initializePostDominatorTreePass(Registry);
- initializeProfileEstimatorPassPass(Registry);
- initializeNoProfileInfoPass(Registry);
- initializeNoPathProfileInfoPass(Registry);
- initializeProfileInfoAnalysisGroup(Registry);
- initializePathProfileInfoAnalysisGroup(Registry);
- initializeLoaderPassPass(Registry);
- initializePathProfileLoaderPassPass(Registry);
- initializeProfileVerifierPassPass(Registry);
- initializePathProfileVerifierPass(Registry);
- initializeProfileMetadataLoaderPassPass(Registry);
initializeRegionInfoPass(Registry);
initializeRegionViewerPass(Registry);
initializeRegionPrinterPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 9fe1362..b2c2011 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -122,7 +122,7 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
// question (in this case rewind to p), or
// - just give up. It is up to caller to make sure the pointer is pointing
// to the base address the object.
- //
+ //
// We go for 2nd option for simplicity.
if (!isIdentifiedObject(V))
return false;
@@ -130,7 +130,7 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
// This function needs to use the aligned object size because we allow
// reads a bit past the end given sufficient alignment.
uint64_t ObjectSize = getObjectSize(V, TD, TLI, /*RoundToAlign*/true);
-
+
return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize < Size;
}
@@ -163,7 +163,7 @@ namespace {
EK_SignExt,
EK_ZeroExt
};
-
+
struct VariableGEPIndex {
const Value *V;
ExtensionKind Extension;
@@ -200,7 +200,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
Offset = 0;
return V;
}
-
+
if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
switch (BOp->getOpcode()) {
@@ -231,7 +231,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
}
}
}
-
+
// Since GEP indices are sign extended anyway, we don't care about the high
// bits of a sign or zero extended value - just scales and offsets. The
// extensions have to be consistent though.
@@ -248,10 +248,10 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
TD, Depth+1);
Scale = Scale.zext(OldWidth);
Offset = Offset.zext(OldWidth);
-
+
return Result;
}
-
+
Scale = 1;
Offset = 0;
return V;
@@ -276,7 +276,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
const DataLayout *TD) {
// Limit recursion depth to limit compile time in crazy cases.
unsigned MaxLookup = 6;
-
+
BaseOffs = 0;
do {
// See if this is a bitcast or GEP.
@@ -291,7 +291,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
}
return V;
}
-
+
if (Op->getOpcode() == Instruction::BitCast) {
V = Op->getOperand(0);
continue;
@@ -308,15 +308,14 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
V = Simplified;
continue;
}
-
+
return V;
}
-
+
// Don't attempt to analyze GEPs over unsized objects.
- if (!cast<PointerType>(GEPOp->getOperand(0)->getType())
- ->getElementType()->isSized())
+ if (!GEPOp->getOperand(0)->getType()->getPointerElementType()->isSized())
return V;
-
+
// If we are lacking DataLayout information, we can't compute the offets of
// elements computed by GEPs. However, we can handle bitcast equivalent
// GEPs.
@@ -326,7 +325,8 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
V = GEPOp->getOperand(0);
continue;
}
-
+
+ unsigned AS = GEPOp->getPointerAddressSpace();
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
for (User::const_op_iterator I = GEPOp->op_begin()+1,
@@ -337,38 +337,37 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
// For a struct, add the member offset.
unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
if (FieldNo == 0) continue;
-
+
BaseOffs += TD->getStructLayout(STy)->getElementOffset(FieldNo);
continue;
}
-
+
// For an array/pointer, add the element offset, explicitly scaled.
if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
if (CIdx->isZero()) continue;
BaseOffs += TD->getTypeAllocSize(*GTI)*CIdx->getSExtValue();
continue;
}
-
+
uint64_t Scale = TD->getTypeAllocSize(*GTI);
ExtensionKind Extension = EK_NotExtended;
-
+
// If the integer type is smaller than the pointer size, it is implicitly
// sign extended to pointer size.
- unsigned Width = cast<IntegerType>(Index->getType())->getBitWidth();
- if (TD->getPointerSizeInBits() > Width)
+ unsigned Width = Index->getType()->getIntegerBitWidth();
+ if (TD->getPointerSizeInBits(AS) > Width)
Extension = EK_SignExt;
-
+
// Use GetLinearExpression to decompose the index into a C1*V+C2 form.
APInt IndexScale(Width, 0), IndexOffset(Width, 0);
Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension,
*TD, 0);
-
+
// The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
// This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
BaseOffs += IndexOffset.getSExtValue()*Scale;
Scale *= IndexScale.getSExtValue();
-
-
+
// If we already had an occurrence of this index variable, merge this
// scale into it. For example, we want to handle:
// A[x][x] -> x*16 + x*4 -> x*20
@@ -381,25 +380,25 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
break;
}
}
-
+
// Make sure that we have a scale that makes sense for this target's
// pointer size.
- if (unsigned ShiftBits = 64-TD->getPointerSizeInBits()) {
+ if (unsigned ShiftBits = 64 - TD->getPointerSizeInBits(AS)) {
Scale <<= ShiftBits;
Scale = (int64_t)Scale >> ShiftBits;
}
-
+
if (Scale) {
VariableGEPIndex Entry = {Index, Extension,
static_cast<int64_t>(Scale)};
VarIndices.push_back(Entry);
}
}
-
+
// Analyze the base pointer next.
V = GEPOp->getOperand(0);
} while (--MaxLookup);
-
+
// If the chain of expressions is too deep, just return early.
return V;
}
@@ -407,7 +406,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
/// GetIndexDifference - Dest and Src are the variable indices from two
/// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base
/// pointers. Subtract the GEP2 indices from GEP1 to find the symbolic
-/// difference between the two pointers.
+/// difference between the two pointers.
static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
const SmallVectorImpl<VariableGEPIndex> &Src) {
if (Src.empty()) return;
@@ -416,12 +415,12 @@ static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
const Value *V = Src[i].V;
ExtensionKind Extension = Src[i].Extension;
int64_t Scale = Src[i].Scale;
-
+
// Find V in Dest. This is N^2, but pointer indices almost never have more
// than a few variable indexes.
for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
if (Dest[j].V != V || Dest[j].Extension != Extension) continue;
-
+
// If we found it, subtract off Scale V's from the entry in Dest. If it
// goes to zero, remove the entry.
if (Dest[j].Scale != Scale)
@@ -431,7 +430,7 @@ static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
Scale = 0;
break;
}
-
+
// If we didn't consume this entry, add it to the end of the Dest list.
if (Scale) {
VariableGEPIndex Entry = { V, Extension, -Scale };
@@ -526,7 +525,7 @@ namespace {
return (AliasAnalysis*)this;
return this;
}
-
+
private:
// AliasCache - Track alias queries to guard against recursion.
typedef std::pair<Location, Location> LocPair;
@@ -696,7 +695,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
"AliasAnalysis query involving multiple functions!");
const Value *Object = GetUnderlyingObject(Loc.Ptr, TD);
-
+
// If this is a tail call and Loc.Ptr points to a stack location, we know that
// the tail call cannot access or modify the local stack.
// We cannot exclude byval arguments here; these belong to the caller of
@@ -706,7 +705,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction()))
if (CI->isTailCall())
return NoModRef;
-
+
// If the pointer is to a locally allocated object that does not escape,
// then the call can not mod/ref the pointer unless the call takes the pointer
// as an argument, and itself doesn't capture it.
@@ -722,7 +721,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
if (!(*CI)->getType()->isPointerTy() ||
(!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
continue;
-
+
// If this is a no-capture pointer argument, see if we can tell that it
// is impossible to alias the pointer we're checking. If not, we have to
// assume that the call could touch the pointer, even though it doesn't
@@ -732,7 +731,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
break;
}
}
-
+
if (!PassedAsArg)
return NoModRef;
}
@@ -821,7 +820,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
}
// We can bound the aliasing properties of memset_pattern16 just as we can
- // for memcpy/memset. This is particularly important because the
+ // for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
// whenever possible.
else if (TLI.has(LibFunc::memset_pattern16) &&
@@ -925,22 +924,22 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
GEP1VariableIndices.clear();
}
}
-
+
// If we get a No or May, then return it immediately, no amount of analysis
// will improve this situation.
if (BaseAlias != MustAlias) return BaseAlias;
-
+
// Otherwise, we have a MustAlias. Since the base pointers alias each other
// exactly, see if the computed offset from the common pointer tells us
// about the relation of the resulting pointer.
const Value *GEP1BasePtr =
DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
-
+
int64_t GEP2BaseOffset;
SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
const Value *GEP2BasePtr =
DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
-
+
// DecomposeGEPExpression and GetUnderlyingObject should return the
// same result except when DecomposeGEPExpression has no DataLayout.
if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
@@ -948,12 +947,12 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
"DecomposeGEPExpression and GetUnderlyingObject disagree!");
return MayAlias;
}
-
+
// Subtract the GEP2 pointer from the GEP1 pointer to find out their
// symbolic difference.
GEP1BaseOffset -= GEP2BaseOffset;
GetIndexDifference(GEP1VariableIndices, GEP2VariableIndices);
-
+
} else {
// Check to see if these two pointers are related by the getelementptr
// instruction. If one pointer is a GEP with a non-zero index of the other
@@ -975,7 +974,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
const Value *GEP1BasePtr =
DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
-
+
// DecomposeGEPExpression and GetUnderlyingObject should return the
// same result except when DecomposeGEPExpression has no DataLayout.
if (GEP1BasePtr != UnderlyingV1) {
@@ -984,7 +983,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
return MayAlias;
}
}
-
+
// In the two GEP Case, if there is no difference in the offsets of the
// computed pointers, the resultant pointers are a must alias. This
  // happens when we have two lexically identical GEP's (for example).
@@ -1226,7 +1225,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
if ((isa<ConstantPointerNull>(O2) && isKnownNonNull(O1)) ||
(isa<ConstantPointerNull>(O1) && isKnownNonNull(O2)))
return NoAlias;
-
+
// If one pointer is the result of a call/invoke or load and the other is a
// non-escaping local object within the same function, then we know the
// object couldn't escape to a point where the call could return it.
@@ -1248,7 +1247,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD, *TLI)) ||
(V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD, *TLI)))
return NoAlias;
-
+
// Check the cache before climbing up use-def chains. This also terminates
// otherwise infinitely recursive queries.
LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
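For illustration, a minimal sketch (plain C++, not part of this patch; the names are invented) of the situation the aliasGEP logic above reasons about: once the two base pointers MustAlias, the decomposed offsets decide the answer.

void gepAliasExample(int *A, long i) {
  int *P = &A[i];      // GEP1: base A, variable index i, scaled by the element size
  int *Q = &A[i];      // GEP2: identical decomposition -> zero offset difference
  int *R = &A[i + 1];  // same base, offsets differ by one element
  *P = 1;
  *Q = 2;              // zero difference => MustAlias with *P
  *R = 3;              // difference >= the access size => NoAlias with *P
}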
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 8469556..62f3ab1 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -1,4 +1,4 @@
-//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------=======//
+//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,14 +17,97 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
using namespace llvm;
-INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
- true, true)
+#ifndef NDEBUG
+enum GVDAGType {
+ GVDT_None,
+ GVDT_Fraction,
+ GVDT_Integer
+};
+
+static cl::opt<GVDAGType>
+ViewBlockFreqPropagationDAG("view-block-freq-propagation-dags", cl::Hidden,
+ cl::desc("Pop up a window to show a dag displaying how block "
+ "frequencies propagation through the CFG."),
+ cl::values(
+ clEnumValN(GVDT_None, "none",
+ "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction", "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer", "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValEnd));
+
+namespace llvm {
+
+template <>
+struct GraphTraits<BlockFrequencyInfo *> {
+ typedef const BasicBlock NodeType;
+ typedef succ_const_iterator ChildIteratorType;
+ typedef Function::const_iterator nodes_iterator;
+
+ static inline const NodeType *getEntryNode(const BlockFrequencyInfo *G) {
+ return G->getFunction()->begin();
+ }
+ static ChildIteratorType child_begin(const NodeType *N) {
+ return succ_begin(N);
+ }
+ static ChildIteratorType child_end(const NodeType *N) {
+ return succ_end(N);
+ }
+ static nodes_iterator nodes_begin(const BlockFrequencyInfo *G) {
+ return G->getFunction()->begin();
+ }
+ static nodes_iterator nodes_end(const BlockFrequencyInfo *G) {
+ return G->getFunction()->end();
+ }
+};
+
+template<>
+struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool isSimple=false) :
+ DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const BlockFrequencyInfo *G) {
+ return G->getFunction()->getName();
+ }
+
+ std::string getNodeLabel(const BasicBlock *Node,
+ const BlockFrequencyInfo *Graph) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+
+ OS << Node->getName().str() << ":";
+ switch (ViewBlockFreqPropagationDAG) {
+ case GVDT_Fraction:
+ Graph->getBlockFreq(Node).print(OS);
+ break;
+ case GVDT_Integer:
+ OS << Graph->getBlockFreq(Node).getFrequency();
+ break;
+ case GVDT_None:
+ llvm_unreachable("If we are not supposed to render a graph we should "
+ "never reach this point.");
+ }
+
+ return Result;
+ }
+};
+
+} // end namespace llvm
+#endif
+
+INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
+ "Block Frequency Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
-INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
- true, true)
+INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
+ "Block Frequency Analysis", true, true)
char BlockFrequencyInfo::ID = 0;
@@ -46,6 +129,10 @@ void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
bool BlockFrequencyInfo::runOnFunction(Function &F) {
BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
BFI->doFunction(&F, &BPI);
+#ifndef NDEBUG
+ if (ViewBlockFreqPropagationDAG != GVDT_None)
+ view();
+#endif
return false;
}
@@ -56,3 +143,19 @@ void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const {
BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
return BFI->getBlockFreq(BB);
}
+
+/// Pop up a ghostview window with the current block frequency propagation
+/// rendered using dot.
+void BlockFrequencyInfo::view() const {
+// This code is only for debugging.
+#ifndef NDEBUG
+ ViewGraph(const_cast<BlockFrequencyInfo *>(this), "BlockFrequencyDAGs");
+#else
+ errs() << "BlockFrequencyInfo::view is only available in debug builds on "
+ "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+const Function *BlockFrequencyInfo::getFunction() const {
+ return BFI->Fn;
+}
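As a usage sketch (not part of the change; the helper name is invented and the usual header location is assumed), a client can consume the frequencies through the same getBlockFreq()/getFrequency() interface exercised above:

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Count the blocks the analysis considers hotter than the function entry.
static unsigned countHotBlocks(const Function &F,
                               const BlockFrequencyInfo &BFI) {
  uint64_t EntryFreq = BFI.getBlockFreq(&F.getEntryBlock()).getFrequency();
  unsigned Hot = 0;
  for (Function::const_iterator I = F.begin(), E = F.end(); I != E; ++I)
    if (BFI.getBlockFreq(&*I).getFrequency() > EntryFreq)
      ++Hot;
  return Hot;
}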
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 7cdf828..86560ca 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -398,10 +398,24 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
// InstCombine canonicalizes X <= 0 into X < 1.
// X <= 0 -> Unlikely
isProb = false;
- } else if (CV->isAllOnesValue() && CI->getPredicate() == CmpInst::ICMP_SGT) {
- // InstCombine canonicalizes X >= 0 into X > -1.
- // X >= 0 -> Likely
- isProb = true;
+ } else if (CV->isAllOnesValue()) {
+ switch (CI->getPredicate()) {
+ case CmpInst::ICMP_EQ:
+ // X == -1 -> Unlikely
+ isProb = false;
+ break;
+ case CmpInst::ICMP_NE:
+ // X != -1 -> Likely
+ isProb = true;
+ break;
+ case CmpInst::ICMP_SGT:
+ // InstCombine canonicalizes X >= 0 into X > -1.
+ // X >= 0 -> Likely
+ isProb = true;
+ break;
+ default:
+ return false;
+ }
} else {
return false;
}
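Illustrative only (not from the patch): the sentinel-check idiom the new ICMP_EQ/ICMP_NE cases target. Comparisons against -1 are typically error checks, so the error path is treated as cold.

#include <unistd.h>

long countBytes(int fd) {
  char buf[4096];
  long total = 0;
  ssize_t n;
  while ((n = read(fd, buf, sizeof buf)) != -1 && n != 0)  // X != -1 -> Likely
    total += n;
  if (n == -1)                                             // X == -1 -> Unlikely
    return -1;                                             // cold error path
  return total;
}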
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index a5ed21a..c3f32d3 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -116,7 +116,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
// LoopInfo contains a mapping from basic block to the innermost loop. Find
// the outermost loop in the loop nest that contains BB.
-static const Loop *getOutermostLoop(LoopInfo *LI, const BasicBlock *BB) {
+static const Loop *getOutermostLoop(const LoopInfo *LI, const BasicBlock *BB) {
const Loop *L = LI->getLoopFor(BB);
if (L) {
while (const Loop *Parent = L->getParentLoop())
@@ -126,60 +126,17 @@ static const Loop *getOutermostLoop(LoopInfo *LI, const BasicBlock *BB) {
}
// True if there is a loop which contains both BB1 and BB2.
-static bool loopContainsBoth(LoopInfo *LI,
+static bool loopContainsBoth(const LoopInfo *LI,
const BasicBlock *BB1, const BasicBlock *BB2) {
const Loop *L1 = getOutermostLoop(LI, BB1);
const Loop *L2 = getOutermostLoop(LI, BB2);
return L1 != NULL && L1 == L2;
}
-static bool isPotentiallyReachableSameBlock(const Instruction *A,
- const Instruction *B,
- LoopInfo *LI) {
- // The same block case is special because it's the only time we're looking
- // within a single block to see which comes first. Once we start looking at
- // multiple blocks, the first instruction of the block is reachable, so we
- // only need to determine reachability between whole blocks.
-
- const BasicBlock *BB = A->getParent();
- // If the block is in a loop then we can reach any instruction in the block
- // from any other instruction in the block by going around the backedge.
- // Check whether we're in a loop (or aren't sure).
-
- // Can't be in a loop if it's the entry block -- the entry block may not
- // have predecessors.
- bool HasLoop = BB != &BB->getParent()->getEntryBlock();
-
- // Can't be in a loop if LoopInfo doesn't know about it.
- if (LI && HasLoop) {
- HasLoop = LI->getLoopFor(BB) != 0;
- }
- if (HasLoop)
- return true;
-
- // Linear scan, start at 'A', see whether we hit 'B' or the end first.
- for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) {
- if (&*I == B)
- return true;
- }
- return false;
-}
-
-bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
- DominatorTree *DT, LoopInfo *LI) {
- assert(A->getParent()->getParent() == B->getParent()->getParent() &&
- "This analysis is function-local!");
-
- const BasicBlock *StopBB = B->getParent();
-
- if (A->getParent() == B->getParent())
- return isPotentiallyReachableSameBlock(A, B, LI);
-
- if (A->getParent() == &A->getParent()->getParent()->getEntryBlock())
- return true;
- if (B->getParent() == &A->getParent()->getParent()->getEntryBlock())
- return false;
-
+static bool isPotentiallyReachableInner(SmallVectorImpl<BasicBlock *> &Worklist,
+ BasicBlock *StopBB,
+ const DominatorTree *DT,
+ const LoopInfo *LI) {
// When the stop block is unreachable, it's dominated from everywhere,
// regardless of whether there's a path between the two blocks.
if (DT && !DT->isReachableFromEntry(StopBB))
@@ -188,11 +145,7 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
// Limit the number of blocks we visit. The goal is to avoid run-away compile
// times on large CFGs without hampering sensible code. Arbitrarily chosen.
unsigned Limit = 32;
-
SmallSet<const BasicBlock*, 64> Visited;
- SmallVector<BasicBlock*, 32> Worklist;
- Worklist.push_back(const_cast<BasicBlock*>(A->getParent()));
-
do {
BasicBlock *BB = Worklist.pop_back_val();
if (!Visited.insert(BB))
@@ -221,7 +174,72 @@ bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
}
} while (!Worklist.empty());
- // We have exhaustived all possible paths and are certain that 'To' can not
- // be reached from 'From'.
+ // We have exhausted all possible paths and are certain that 'To' can not be
+ // reached from 'From'.
return false;
}
+
+bool llvm::isPotentiallyReachable(const BasicBlock *A, const BasicBlock *B,
+ const DominatorTree *DT, const LoopInfo *LI) {
+ assert(A->getParent() == B->getParent() &&
+ "This analysis is function-local!");
+
+ SmallVector<BasicBlock*, 32> Worklist;
+ Worklist.push_back(const_cast<BasicBlock*>(A));
+
+ return isPotentiallyReachableInner(Worklist, const_cast<BasicBlock*>(B),
+ DT, LI);
+}
+
+bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
+ const DominatorTree *DT, const LoopInfo *LI) {
+ assert(A->getParent()->getParent() == B->getParent()->getParent() &&
+ "This analysis is function-local!");
+
+ SmallVector<BasicBlock*, 32> Worklist;
+
+ if (A->getParent() == B->getParent()) {
+ // The same block case is special because it's the only time we're looking
+ // within a single block to see which instruction comes first. Once we
+ // start looking at multiple blocks, the first instruction of the block is
+ // reachable, so we only need to determine reachability between whole
+ // blocks.
+ BasicBlock *BB = const_cast<BasicBlock *>(A->getParent());
+
+ // If the block is in a loop then we can reach any instruction in the block
+ // from any other instruction in the block by going around a backedge.
+ if (LI && LI->getLoopFor(BB) != 0)
+ return true;
+
+ // Linear scan, start at 'A', see whether we hit 'B' or the end first.
+ for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) {
+ if (&*I == B)
+ return true;
+ }
+
+ // Can't be in a loop if it's the entry block -- the entry block may not
+ // have predecessors.
+ if (BB == &BB->getParent()->getEntryBlock())
+ return false;
+
+ // Otherwise, continue doing the normal per-BB CFG walk.
+ for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+ Worklist.push_back(*I);
+
+ if (Worklist.empty()) {
+ // We've proven that there's no path!
+ return false;
+ }
+ } else {
+ Worklist.push_back(const_cast<BasicBlock*>(A->getParent()));
+ }
+
+ if (A->getParent() == &A->getParent()->getParent()->getEntryBlock())
+ return true;
+ if (B->getParent() == &A->getParent()->getParent()->getEntryBlock())
+ return false;
+
+ return isPotentiallyReachableInner(Worklist,
+ const_cast<BasicBlock*>(B->getParent()),
+ DT, LI);
+}
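A minimal sketch of how the two overloads above might be called (the wrapper names are invented, and llvm/Analysis/CFG.h is assumed to be the header declaring them):

#include "llvm/Analysis/CFG.h"
using namespace llvm;

static bool instrMayReach(const Instruction *From, const Instruction *To,
                          const DominatorTree *DT, const LoopInfo *LI) {
  // Same-block queries are resolved by the linear scan added above; cross-block
  // queries fall through to the bounded walk in isPotentiallyReachableInner.
  return isPotentiallyReachable(From, To, DT, LI);
}

static bool blockMayReach(const BasicBlock *From, const BasicBlock *To) {
  // The new block-level overload; without DominatorTree and LoopInfo the
  // answer is simply more conservative.
  return isPotentiallyReachable(From, To, 0, 0);
}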
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 94ded34..3624aac 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_library(LLVMAnalysis
CostModel.cpp
CodeMetrics.cpp
ConstantFolding.cpp
+ Delinearization.cpp
DependenceAnalysis.cpp
DomPrinter.cpp
DominanceFrontier.cpp
@@ -35,17 +36,7 @@ add_llvm_library(LLVMAnalysis
ModuleDebugInfoPrinter.cpp
NoAliasAnalysis.cpp
PHITransAddr.cpp
- PathNumbering.cpp
- PathProfileInfo.cpp
- PathProfileVerifier.cpp
PostDominators.cpp
- ProfileEstimatorPass.cpp
- ProfileInfo.cpp
- ProfileInfoLoader.cpp
- ProfileInfoLoaderPass.cpp
- ProfileVerifierPass.cpp
- ProfileDataLoader.cpp
- ProfileDataLoaderPass.cpp
PtrUseVisitor.cpp
RegionInfo.cpp
RegionPass.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 9eb76a8..79fab1b 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -146,8 +146,14 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
case Instruction::PHI:
case Instruction::Select:
// The original value is not captured via this if the new value isn't.
+ Count = 0;
for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
UI != UE; ++UI) {
+ // If there are lots of uses, conservatively say that the value
+ // is captured to avoid taking too much compile time.
+ if (Count++ >= Threshold)
+ return Tracker->tooManyUses();
+
Use *U = &UI.getUse();
if (Visited.insert(U))
if (Tracker->shouldExplore(U))
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index bc0dffc..3d32232 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -224,7 +224,8 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
APInt &Offset, const DataLayout &TD) {
// Trivial case, constant is the global.
if ((GV = dyn_cast<GlobalValue>(C))) {
- Offset.clearAllBits();
+ unsigned BitWidth = TD.getPointerTypeSizeInBits(GV->getType());
+ Offset = APInt(BitWidth, 0);
return true;
}
@@ -238,16 +239,23 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
// i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
- // If the base isn't a global+constant, we aren't either.
- if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD))
- return false;
+ GEPOperator *GEP = dyn_cast<GEPOperator>(CE);
+ if (!GEP)
+ return false;
- // Otherwise, add any offset that our operands provide.
- return GEP->accumulateConstantOffset(TD, Offset);
- }
+ unsigned BitWidth = TD.getPointerTypeSizeInBits(GEP->getType());
+ APInt TmpOffset(BitWidth, 0);
- return false;
+ // If the base isn't a global+constant, we aren't either.
+ if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, TmpOffset, TD))
+ return false;
+
+ // Otherwise, add any offset that our operands provide.
+ if (!GEP->accumulateConstantOffset(TD, TmpOffset))
+ return false;
+
+ Offset = TmpOffset;
+ return true;
}
/// ReadDataFromGlobal - Recursive helper to read bits out of global. C is the
@@ -324,12 +332,12 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
// If we read all of the bytes we needed from this element we're done.
uint64_t NextEltOffset = SL->getElementOffset(Index);
- if (BytesLeft <= NextEltOffset-CurEltOffset-ByteOffset)
+ if (BytesLeft <= NextEltOffset - CurEltOffset - ByteOffset)
return true;
// Move to the next element of the struct.
- CurPtr += NextEltOffset-CurEltOffset-ByteOffset;
- BytesLeft -= NextEltOffset-CurEltOffset-ByteOffset;
+ CurPtr += NextEltOffset - CurEltOffset - ByteOffset;
+ BytesLeft -= NextEltOffset - CurEltOffset - ByteOffset;
ByteOffset = 0;
CurEltOffset = NextEltOffset;
}
@@ -338,7 +346,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
if (isa<ConstantArray>(C) || isa<ConstantVector>(C) ||
isa<ConstantDataSequential>(C)) {
- Type *EltTy = cast<SequentialType>(C->getType())->getElementType();
+ Type *EltTy = C->getType()->getSequentialElementType();
uint64_t EltSize = TD.getTypeAllocSize(EltTy);
uint64_t Index = ByteOffset / EltSize;
uint64_t Offset = ByteOffset - Index * EltSize;
@@ -346,7 +354,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
if (ArrayType *AT = dyn_cast<ArrayType>(C->getType()))
NumElts = AT->getNumElements();
else
- NumElts = cast<VectorType>(C->getType())->getNumElements();
+ NumElts = C->getType()->getVectorNumElements();
for (; Index != NumElts; ++Index) {
if (!ReadDataFromGlobal(C->getAggregateElement(Index), Offset, CurPtr,
@@ -367,9 +375,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
if (CE->getOpcode() == Instruction::IntToPtr &&
- CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext()))
+ CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getType())) {
return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
BytesLeft, TD);
+ }
}
// Otherwise, unknown initializer type.
@@ -378,26 +387,29 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
const DataLayout &TD) {
- Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
+ PointerType *PTy = cast<PointerType>(C->getType());
+ Type *LoadTy = PTy->getElementType();
IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
// If this isn't an integer load we can't fold it directly.
if (!IntType) {
+ unsigned AS = PTy->getAddressSpace();
+
// If this is a float/double load, we can try folding it as an int32/64 load
// and then bitcast the result. This can be useful for union cases. Note
// that address spaces don't matter here since we're not going to result in
// an actual new load.
Type *MapTy;
if (LoadTy->isHalfTy())
- MapTy = Type::getInt16PtrTy(C->getContext());
+ MapTy = Type::getInt16PtrTy(C->getContext(), AS);
else if (LoadTy->isFloatTy())
- MapTy = Type::getInt32PtrTy(C->getContext());
+ MapTy = Type::getInt32PtrTy(C->getContext(), AS);
else if (LoadTy->isDoubleTy())
- MapTy = Type::getInt64PtrTy(C->getContext());
+ MapTy = Type::getInt64PtrTy(C->getContext(), AS);
else if (LoadTy->isVectorTy()) {
- MapTy = IntegerType::get(C->getContext(),
- TD.getTypeAllocSizeInBits(LoadTy));
- MapTy = PointerType::getUnqual(MapTy);
+ MapTy = PointerType::getIntNPtrTy(C->getContext(),
+ TD.getTypeAllocSizeInBits(LoadTy),
+ AS);
} else
return 0;
@@ -408,10 +420,11 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
}
unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8;
- if (BytesLoaded > 32 || BytesLoaded == 0) return 0;
+ if (BytesLoaded > 32 || BytesLoaded == 0)
+ return 0;
GlobalValue *GVal;
- APInt Offset(TD.getPointerSizeInBits(), 0);
+ APInt Offset;
if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD))
return 0;
@@ -422,7 +435,8 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
// If we're loading off the beginning of the global, some bytes may be valid,
// but we don't try to handle this.
- if (Offset.isNegative()) return 0;
+ if (Offset.isNegative())
+ return 0;
// If we're not accessing anything in this constant, the result is undefined.
if (Offset.getZExtValue() >=
@@ -439,7 +453,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
ResultVal = RawBytes[BytesLoaded - 1];
for (unsigned i = 1; i != BytesLoaded; ++i) {
ResultVal <<= 8;
- ResultVal |= RawBytes[BytesLoaded-1-i];
+ ResultVal |= RawBytes[BytesLoaded - 1 - i];
}
} else {
ResultVal = RawBytes[0];
@@ -464,14 +478,17 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
// If the loaded value isn't a constant expr, we can't handle it.
ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
- if (!CE) return 0;
+ if (!CE)
+ return 0;
if (CE->getOpcode() == Instruction::GetElementPtr) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+ if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
if (Constant *V =
ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
return V;
+ }
+ }
}
// Instead of loading constant c string, use corresponding integer value
@@ -576,13 +593,13 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
// constant. This happens frequently when iterating over a global array.
if (Opc == Instruction::Sub && DL) {
GlobalValue *GV1, *GV2;
- unsigned PtrSize = DL->getPointerSizeInBits();
- unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
- APInt Offs1(PtrSize, 0), Offs2(PtrSize, 0);
+ APInt Offs1, Offs2;
if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *DL))
if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *DL) &&
GV1 == GV2) {
+ unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
+
// (&GV+C1) - (&GV+C2) -> C1-C2, pointer arithmetic cannot overflow.
// PtrToInt may change the bitwidth so we have convert to the right size
// first.
@@ -600,15 +617,18 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
Type *ResultTy, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
- if (!TD) return 0;
- Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
+ if (!TD)
+ return 0;
+
+ Type *IntPtrTy = TD->getIntPtrType(ResultTy);
bool Any = false;
SmallVector<Constant*, 32> NewIdxs;
for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
if ((i == 1 ||
- !isa<StructType>(GetElementPtrInst::getIndexedType(Ops[0]->getType(),
- Ops.slice(1, i-1)))) &&
+ !isa<StructType>(GetElementPtrInst::getIndexedType(
+ Ops[0]->getType(),
+ Ops.slice(1, i - 1)))) &&
Ops[i]->getType() != IntPtrTy) {
Any = true;
NewIdxs.push_back(ConstantExpr::getCast(CastInst::getCastOpcode(Ops[i],
@@ -619,13 +639,16 @@ static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
} else
NewIdxs.push_back(Ops[i]);
}
- if (!Any) return 0;
- Constant *C =
- ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ if (!Any)
+ return 0;
+
+ Constant *C = ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
C = Folded;
+ }
+
return C;
}
@@ -640,7 +663,7 @@ static Constant* StripPtrCastKeepAS(Constant* Ptr) {
if (NewPtrTy->getAddressSpace() != OldPtrTy->getAddressSpace()) {
NewPtrTy = NewPtrTy->getElementType()->getPointerTo(
OldPtrTy->getAddressSpace());
- Ptr = ConstantExpr::getBitCast(Ptr, NewPtrTy);
+ Ptr = ConstantExpr::getPointerCast(Ptr, NewPtrTy);
}
return Ptr;
}
@@ -651,11 +674,12 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
Type *ResultTy, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
Constant *Ptr = Ops[0];
- if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized() ||
+ if (!TD || !Ptr->getType()->getPointerElementType()->isSized() ||
!Ptr->getType()->isPointerTy())
return 0;
- Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(Ptr->getType());
+ Type *ResultElementTy = ResultTy->getPointerElementType();
// If this is a constant expr gep that is effectively computing an
// "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
@@ -664,8 +688,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// If this is "gep i8* Ptr, (sub 0, V)", fold this as:
// "inttoptr (sub (ptrtoint Ptr), V)"
- if (Ops.size() == 2 &&
- cast<PointerType>(ResultTy)->getElementType()->isIntegerTy(8)) {
+ if (Ops.size() == 2 && ResultElementTy->isIntegerTy(8)) {
ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[1]);
assert((CE == 0 || CE->getType() == IntPtrTy) &&
"CastGEPIndices didn't canonicalize index types!");
@@ -692,7 +715,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// If this is a GEP of a GEP, fold it all into a single GEP.
while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
- SmallVector<Value *, 4> NestedOps(GEP->op_begin()+1, GEP->op_end());
+ SmallVector<Value *, 4> NestedOps(GEP->op_begin() + 1, GEP->op_end());
  // Do not try to incorporate the sub-GEP if some index is not a number.
bool AllConstantInt = true;
@@ -713,12 +736,15 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// If the base value for this address is a literal integer value, fold the
// getelementptr to the resulting integer value casted to the pointer type.
APInt BasePtr(BitWidth, 0);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
- if (CE->getOpcode() == Instruction::IntToPtr)
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (CE->getOpcode() == Instruction::IntToPtr) {
if (ConstantInt *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
BasePtr = Base->getValue().zextOrTrunc(BitWidth);
+ }
+ }
+
if (Ptr->isNullValue() || BasePtr != 0) {
- Constant *C = ConstantInt::get(Ptr->getContext(), Offset+BasePtr);
+ Constant *C = ConstantInt::get(Ptr->getContext(), Offset + BasePtr);
return ConstantExpr::getIntToPtr(C, ResultTy);
}
@@ -728,7 +754,8 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// Also, this helps GlobalOpt do SROA on GlobalVariables.
Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type");
- SmallVector<Constant*, 32> NewIdxs;
+ SmallVector<Constant *, 32> NewIdxs;
+
do {
if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
if (ATy->isPointerTy()) {
@@ -743,7 +770,6 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// Determine which element of the array the offset points into.
APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
- IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
if (ElemSize == 0)
// The element size is 0. This may be [0 x Ty]*, so just use a zero
// index for this level and proceed to the next level to see if it can
@@ -778,7 +804,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// We've reached some non-indexable type.
break;
}
- } while (Ty != cast<PointerType>(ResultTy)->getElementType());
+ } while (Ty != ResultElementTy);
// If we haven't used up the entire offset by descending the static
// type, then the offset is pointing into the middle of an indivisible
@@ -787,14 +813,13 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
return 0;
// Create a GEP.
- Constant *C =
- ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
- assert(cast<PointerType>(C->getType())->getElementType() == Ty &&
+ Constant *C = ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
+ assert(C->getType()->getPointerElementType() == Ty &&
"Computed GetElementPtr has unexpected type!");
// If we ended up indexing a member with a type that doesn't match
// the type of what the original indices indexed, add a cast.
- if (Ty != cast<PointerType>(ResultTy)->getElementType())
+ if (Ty != ResultElementTy)
C = FoldBitCast(C, ResultTy, *TD);
return C;
@@ -867,16 +892,18 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
if (const LoadInst *LI = dyn_cast<LoadInst>(I))
return ConstantFoldLoadInst(LI, TD);
- if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I))
+ if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I)) {
return ConstantExpr::getInsertValue(
cast<Constant>(IVI->getAggregateOperand()),
cast<Constant>(IVI->getInsertedValueOperand()),
IVI->getIndices());
+ }
- if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I))
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I)) {
return ConstantExpr::getExtractValue(
cast<Constant>(EVI->getAggregateOperand()),
EVI->getIndices());
+ }
return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD, TLI);
}
@@ -930,9 +957,10 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
const TargetLibraryInfo *TLI) {
// Handle easy binops first.
if (Instruction::isBinaryOp(Opcode)) {
- if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1]))
+ if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1])) {
if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD))
return C;
+ }
return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
}
@@ -953,10 +981,11 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
if (TD && CE->getOpcode() == Instruction::IntToPtr) {
Constant *Input = CE->getOperand(0);
unsigned InWidth = Input->getType()->getScalarSizeInBits();
- if (TD->getPointerSizeInBits() < InWidth) {
+ unsigned PtrWidth = TD->getPointerTypeSizeInBits(CE->getType());
+ if (PtrWidth < InWidth) {
Constant *Mask =
- ConstantInt::get(CE->getContext(), APInt::getLowBitsSet(InWidth,
- TD->getPointerSizeInBits()));
+ ConstantInt::get(CE->getContext(),
+ APInt::getLowBitsSet(InWidth, PtrWidth));
Input = ConstantExpr::getAnd(Input, Mask);
}
// Do a zext or trunc to get to the dest size.
@@ -966,13 +995,22 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
case Instruction::IntToPtr:
// If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
- // the int size is >= the ptr size. This requires knowing the width of a
- // pointer, so it can't be done in ConstantExpr::getCast.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
- if (TD &&
- TD->getPointerSizeInBits() <= CE->getType()->getScalarSizeInBits() &&
- CE->getOpcode() == Instruction::PtrToInt)
- return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+ // the int size is >= the ptr size and the address spaces are the same.
+ // This requires knowing the width of a pointer, so it can't be done in
+ // ConstantExpr::getCast.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
+ if (TD && CE->getOpcode() == Instruction::PtrToInt) {
+ Constant *SrcPtr = CE->getOperand(0);
+ unsigned SrcPtrSize = TD->getPointerTypeSizeInBits(SrcPtr->getType());
+ unsigned MidIntSize = CE->getType()->getScalarSizeInBits();
+
+ if (MidIntSize >= SrcPtrSize) {
+ unsigned SrcAS = SrcPtr->getType()->getPointerAddressSpace();
+ if (SrcAS == DestTy->getPointerAddressSpace())
+ return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+ }
+ }
+ }
return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
case Instruction::Trunc:
@@ -984,6 +1022,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
case Instruction::SIToFP:
case Instruction::FPToUI:
case Instruction::FPToSI:
+ case Instruction::AddrSpaceCast:
return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
case Instruction::BitCast:
if (TD)
@@ -1024,8 +1063,8 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
// around to know if bit truncation is happening.
if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
if (TD && Ops1->isNullValue()) {
- Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
if (CE0->getOpcode() == Instruction::IntToPtr) {
+ Type *IntPtrTy = TD->getIntPtrType(CE0->getType());
// Convert the integer value to the right size to ensure we get the
// proper extension or truncation.
Constant *C = ConstantExpr::getIntegerCast(CE0->getOperand(0),
@@ -1036,19 +1075,21 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
// Only do this transformation if the int is intptrty in size, otherwise
// there is a truncation or extension that we aren't modeling.
- if (CE0->getOpcode() == Instruction::PtrToInt &&
- CE0->getType() == IntPtrTy) {
- Constant *C = CE0->getOperand(0);
- Constant *Null = Constant::getNullValue(C->getType());
- return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+ if (CE0->getOpcode() == Instruction::PtrToInt) {
+ Type *IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType());
+ if (CE0->getType() == IntPtrTy) {
+ Constant *C = CE0->getOperand(0);
+ Constant *Null = Constant::getNullValue(C->getType());
+ return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+ }
}
}
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
if (TD && CE0->getOpcode() == CE1->getOpcode()) {
- Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
-
if (CE0->getOpcode() == Instruction::IntToPtr) {
+ Type *IntPtrTy = TD->getIntPtrType(CE0->getType());
+
// Convert the integer value to the right size to ensure we get the
// proper extension or truncation.
Constant *C0 = ConstantExpr::getIntegerCast(CE0->getOperand(0),
@@ -1060,11 +1101,17 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
// Only do this transformation if the int is intptrty in size, otherwise
// there is a truncation or extension that we aren't modeling.
- if ((CE0->getOpcode() == Instruction::PtrToInt &&
- CE0->getType() == IntPtrTy &&
- CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()))
- return ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0),
- CE1->getOperand(0), TD, TLI);
+ if (CE0->getOpcode() == Instruction::PtrToInt) {
+ Type *IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType());
+ if (CE0->getType() == IntPtrTy &&
+ CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()) {
+ return ConstantFoldCompareInstOperands(Predicate,
+ CE0->getOperand(0),
+ CE1->getOperand(0),
+ TD,
+ TLI);
+ }
+ }
}
}
@@ -1101,7 +1148,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
// addressing.
for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) {
C = C->getAggregateElement(CE->getOperand(i));
- if (C == 0) return 0;
+ if (C == 0)
+ return 0;
}
return C;
}
@@ -1116,7 +1164,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
// addressing.
for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
C = C->getAggregateElement(Indices[i]);
- if (C == 0) return 0;
+ if (C == 0)
+ return 0;
}
return C;
}
@@ -1128,8 +1177,7 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
/// canConstantFoldCallTo - Return true if its even possible to fold a call to
/// the specified function.
-bool
-llvm::canConstantFoldCallTo(const Function *F) {
+bool llvm::canConstantFoldCallTo(const Function *F) {
switch (F->getIntrinsicID()) {
case Intrinsic::fabs:
case Intrinsic::log:
@@ -1167,7 +1215,8 @@ llvm::canConstantFoldCallTo(const Function *F) {
case 0: break;
}
- if (!F->hasName()) return false;
+ if (!F->hasName())
+ return false;
StringRef Name = F->getName();
// In these cases, the check of the length is required. We don't want to
@@ -1250,7 +1299,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
static Constant *ConstantFoldConvertToInt(const APFloat &Val,
bool roundTowardZero, Type *Ty) {
// All of these conversion intrinsics form an integer of at most 64bits.
- unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
+ unsigned ResultWidth = Ty->getIntegerBitWidth();
assert(ResultWidth <= 64 &&
"Can only constant fold conversions to 64 and 32 bit ints");
@@ -1271,7 +1320,8 @@ static Constant *ConstantFoldConvertToInt(const APFloat &Val,
Constant *
llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
const TargetLibraryInfo *TLI) {
- if (!F->hasName()) return 0;
+ if (!F->hasName())
+ return 0;
StringRef Name = F->getName();
Type *Ty = F->getReturnType();
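The recurring theme of the ConstantFolding changes above is deriving pointer widths from the pointer type instead of a single module-wide pointer size. A small sketch (using only DataLayout calls that already appear in the hunks; the helper name is invented):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

// Size an offset accumulator for a specific pointer type, so pointers in
// address spaces with different widths get correctly sized APInts.
static APInt makeOffsetFor(const DataLayout &TD, Type *PtrTy) {
  unsigned BitWidth = TD.getPointerTypeSizeInBits(PtrTy); // per address space
  return APInt(BitWidth, 0);
}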
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 927508e..f943258 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -19,6 +19,7 @@
#define CM_NAME "cost-model"
#define DEBUG_TYPE CM_NAME
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
@@ -26,10 +27,15 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
+ cl::Hidden,
+ cl::desc("Recognize reduction patterns."));
+
namespace {
class CostModelAnalysis : public FunctionPass {
@@ -105,6 +111,260 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
return OpInfo;
}
+static bool matchMask(SmallVectorImpl<int> &M1, SmallVectorImpl<int> &M2) {
+ if (M1.size() != M2.size())
+ return false;
+
+ for (unsigned i = 0, e = M1.size(); i != e; ++i)
+ if (M1[i] != M2[i])
+ return false;
+
+ return true;
+}
+
+static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
+ unsigned Level) {
+ // We don't need a shuffle if we just want to have element 0 in position 0 of
+ // the vector.
+ if (!SI && Level == 0 && IsLeft)
+ return true;
+ else if (!SI)
+ return false;
+
+ SmallVector<int, 32> Mask(SI->getType()->getVectorNumElements(), -1);
+
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
+ // we look at the left or right side.
+ for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
+ Mask[i] = val;
+
+ SmallVector<int, 16> ActualMask = SI->getShuffleMask();
+ if (!matchMask(Mask, ActualMask))
+ return false;
+
+ return true;
+}
+
+static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,
+ unsigned Level, unsigned NumLevels) {
+ // Match one level of pairwise operations.
+ // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ if (BinOp == 0)
+ return false;
+
+ assert(BinOp->getType()->isVectorTy() && "Expecting a vector type");
+
+ unsigned Opcode = BinOp->getOpcode();
+ Value *L = BinOp->getOperand(0);
+ Value *R = BinOp->getOperand(1);
+
+ ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
+ if (!LS && Level)
+ return false;
+ ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);
+ if (!RS && Level)
+ return false;
+
+ // On level 0 we can omit one shufflevector instruction.
+ if (!Level && !RS && !LS)
+ return false;
+
+ // Shuffle inputs must match.
+ Value *NextLevelOpL = LS ? LS->getOperand(0) : 0;
+ Value *NextLevelOpR = RS ? RS->getOperand(0) : 0;
+ Value *NextLevelOp = 0;
+ if (NextLevelOpR && NextLevelOpL) {
+ // If we have two shuffles their operands must match.
+ if (NextLevelOpL != NextLevelOpR)
+ return false;
+
+ NextLevelOp = NextLevelOpL;
+ } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
+ // On the first level we can omit the shufflevector <0, undef,...>. So the
+ // input to the other shufflevector <1, undef> must match with one of the
+ // inputs to the current binary operation.
+ // Example:
+ // %NextLevelOpL = shufflevector %R, <1, undef ...>
+ // %BinOp = fadd %NextLevelOpL, %R
+ if (NextLevelOpL && NextLevelOpL != R)
+ return false;
+ else if (NextLevelOpR && NextLevelOpR != L)
+ return false;
+
+ NextLevelOp = NextLevelOpL ? R : L;
+ } else
+ return false;
+
+ // Check that the next level's binary operation exists and matches the
+ // current one.
+ BinaryOperator *NextLevelBinOp = 0;
+ if (Level + 1 != NumLevels) {
+ if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
+ return false;
+ else if (NextLevelBinOp->getOpcode() != Opcode)
+ return false;
+ }
+
+ // Shuffle mask for pairwise operation must match.
+ if (matchPairwiseShuffleMask(LS, true, Level)) {
+ if (!matchPairwiseShuffleMask(RS, false, Level))
+ return false;
+ } else if (matchPairwiseShuffleMask(RS, true, Level)) {
+ if (!matchPairwiseShuffleMask(LS, false, Level))
+ return false;
+ } else
+ return false;
+
+ if (++Level == NumLevels)
+ return true;
+
+ // Match next level.
+ return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);
+}
+
+static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
+ unsigned &Opcode, Type *&Ty) {
+ if (!EnableReduxCost)
+ return false;
+
+ // Need to extract the first element.
+ ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+ unsigned Idx = ~0u;
+ if (CI)
+ Idx = CI->getZExtValue();
+ if (Idx != 0)
+ return false;
+
+ BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+ if (!RdxStart)
+ return false;
+
+ Type *VecTy = ReduxRoot->getOperand(0)->getType();
+ unsigned NumVecElems = VecTy->getVectorNumElements();
+ if (!isPowerOf2_32(NumVecElems))
+ return false;
+
+ // We look for a sequence of shuffle,shuffle,add triples like the following
+ // that builds a pairwise reduction tree.
+ //
+ // (X0, X1, X2, X3)
+ // (X0 + X1, X2 + X3, undef, undef)
+ // ((X0 + X1) + (X2 + X3), undef, undef, undef)
+ //
+ // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+ // %r = extractelement <4 x float> %bin.rdx8, i32 0
+ if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)))
+ return false;
+
+ Opcode = RdxStart->getOpcode();
+ Ty = VecTy;
+
+ return true;
+}
+
+static std::pair<Value *, ShuffleVectorInst *>
+getShuffleAndOtherOprd(BinaryOperator *B) {
+
+ Value *L = B->getOperand(0);
+ Value *R = B->getOperand(1);
+ ShuffleVectorInst *S = 0;
+
+ if ((S = dyn_cast<ShuffleVectorInst>(L)))
+ return std::make_pair(R, S);
+
+ S = dyn_cast<ShuffleVectorInst>(R);
+ return std::make_pair(L, S);
+}
+
+static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
+ unsigned &Opcode, Type *&Ty) {
+ if (!EnableReduxCost)
+ return false;
+
+ // Need to extract the first element.
+ ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+ unsigned Idx = ~0u;
+ if (CI)
+ Idx = CI->getZExtValue();
+ if (Idx != 0)
+ return false;
+
+ BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+ if (!RdxStart)
+ return false;
+ unsigned RdxOpcode = RdxStart->getOpcode();
+
+ Type *VecTy = ReduxRoot->getOperand(0)->getType();
+ unsigned NumVecElems = VecTy->getVectorNumElements();
+ if (!isPowerOf2_32(NumVecElems))
+ return false;
+
+ // We look for a sequence of shuffles and adds like the following, matching
+ // one fadd/shuffle-vector pair at a time.
+ //
+ // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+ // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+ // %r = extractelement <4 x float> %bin.rdx8, i32 0
+
+ unsigned MaskStart = 1;
+ Value *RdxOp = RdxStart;
+ SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
+ unsigned NumVecElemsRemain = NumVecElems;
+ while (NumVecElemsRemain - 1) {
+ // Check for the right reduction operation.
+ BinaryOperator *BinOp;
+ if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))
+ return false;
+ if (BinOp->getOpcode() != RdxOpcode)
+ return false;
+
+ Value *NextRdxOp;
+ ShuffleVectorInst *Shuffle;
+ tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);
+
+ // Check that the current reduction operation and the shuffle use the same value.
+ if (Shuffle == 0)
+ return false;
+ if (Shuffle->getOperand(0) != NextRdxOp)
+ return false;
+
+ // Check that the shuffle masks match.
+ for (unsigned j = 0; j != MaskStart; ++j)
+ ShuffleMask[j] = MaskStart + j;
+ // Fill the rest of the mask with -1 for undef.
+ std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
+
+ SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+ if (!matchMask(ShuffleMask, Mask))
+ return false;
+
+ RdxOp = NextRdxOp;
+ NumVecElemsRemain /= 2;
+ MaskStart *= 2;
+ }
+
+ Opcode = RdxOpcode;
+ Ty = VecTy;
+ return true;
+}
+
unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
if (!TTI)
return -1;
@@ -189,6 +449,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
unsigned Idx = -1;
if (CI)
Idx = CI->getZExtValue();
+
+ // Try to match a reduction sequence (a series of shufflevector and vector
+ // adds followed by an extractelement).
+ unsigned ReduxOpCode;
+ Type *ReduxType;
+
+ if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))
+ return TTI->getReductionCost(ReduxOpCode, ReduxType, false);
+ else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))
+ return TTI->getReductionCost(ReduxOpCode, ReduxType, true);
+
return TTI->getVectorInstrCost(I->getOpcode(),
EEI->getOperand(0)->getType(), Idx);
}
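As a standalone illustration of the masks matchPairwiseShuffleMask builds (plain C++ mirroring the loop above; level 0 is the reduction step closest to the extractelement):

#include <vector>

// Expected shuffle mask for one side of a pairwise reduction step over
// NumElts lanes; -1 marks an undef lane.
static std::vector<int> expectedMask(unsigned NumElts, unsigned Level,
                                     bool IsLeft) {
  std::vector<int> Mask(NumElts, -1);
  for (unsigned i = 0, e = 1u << Level, val = !IsLeft; i != e; ++i, val += 2)
    Mask[i] = val;
  return Mask;
}

// expectedMask(4, 0, true)  == {0, -1, -1, -1}   (last step, left operand)
// expectedMask(4, 0, false) == {1, -1, -1, -1}   (last step, right operand)
// expectedMask(4, 1, true)  == {0,  2, -1, -1}   (first step, left operand)
// expectedMask(4, 1, false) == {1,  3, -1, -1}   (first step, right operand)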
diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp
new file mode 100644
index 0000000..3ed0609
--- /dev/null
+++ b/lib/Analysis/Delinearization.cpp
@@ -0,0 +1,133 @@
+//===---- Delinearization.cpp - MultiDimensional Index Delinearization ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements an analysis pass that tries to delinearize all GEP
+// instructions in all loops using the SCEV analysis functionality. This pass is
+// only used for testing purposes: if your pass needs delinearization, please
+// use the on-demand SCEVAddRecExpr::delinearize() function.
+//
+//===----------------------------------------------------------------------===//
+
+#define DL_NAME "delinearize"
+#define DEBUG_TYPE DL_NAME
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class Delinearization : public FunctionPass {
+ Delinearization(const Delinearization &); // do not implement
+protected:
+ Function *F;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ Delinearization() : FunctionPass(ID) {
+ initializeDelinearizationPass(*PassRegistry::getPassRegistry());
+ }
+ virtual bool runOnFunction(Function &F);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual void print(raw_ostream &O, const Module *M = 0) const;
+};
+
+} // end anonymous namespace
+
+void Delinearization::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<ScalarEvolution>();
+}
+
+bool Delinearization::runOnFunction(Function &F) {
+ this->F = &F;
+ SE = &getAnalysis<ScalarEvolution>();
+ LI = &getAnalysis<LoopInfo>();
+ return false;
+}
+
+static Value *getPointerOperand(Instruction &Inst) {
+ if (LoadInst *Load = dyn_cast<LoadInst>(&Inst))
+ return Load->getPointerOperand();
+ else if (StoreInst *Store = dyn_cast<StoreInst>(&Inst))
+ return Store->getPointerOperand();
+ else if (GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(&Inst))
+ return Gep->getPointerOperand();
+ return NULL;
+}
+
+void Delinearization::print(raw_ostream &O, const Module *) const {
+ O << "Delinearization on function " << F->getName() << ":\n";
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ Instruction *Inst = &(*I);
+
+ // Only analyze loads and stores.
+ if (!isa<StoreInst>(Inst) && !isa<LoadInst>(Inst) &&
+ !isa<GetElementPtrInst>(Inst))
+ continue;
+
+ const BasicBlock *BB = Inst->getParent();
+ // Delinearize the memory access as analyzed in all the surrounding loops.
+ // Do not analyze memory accesses outside loops.
+ for (Loop *L = LI->getLoopFor(BB); L != NULL; L = L->getParentLoop()) {
+ const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(*Inst), L);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn);
+
+ // Do not try to delinearize memory accesses that are not AddRecs.
+ if (!AR)
+ break;
+
+ O << "AddRec: " << *AR << "\n";
+
+ SmallVector<const SCEV *, 3> Subscripts, Sizes;
+ const SCEV *Res = AR->delinearize(*SE, Subscripts, Sizes);
+ int Size = Subscripts.size();
+ if (Res == AR || Size == 0) {
+ O << "failed to delinearize\n";
+ continue;
+ }
+ O << "Base offset: " << *Res << "\n";
+ O << "ArrayDecl[UnknownSize]";
+ for (int i = 0; i < Size - 1; i++)
+ O << "[" << *Sizes[i] << "]";
+ O << " with elements of " << *Sizes[Size - 1] << " bytes.\n";
+
+ O << "ArrayRef";
+ for (int i = 0; i < Size; i++)
+ O << "[" << *Subscripts[i] << "]";
+ O << "\n";
+ }
+ }
+}
+
+char Delinearization::ID = 0;
+static const char delinearization_name[] = "Delinearization";
+INITIALIZE_PASS_BEGIN(Delinearization, DL_NAME, delinearization_name, true,
+ true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(Delinearization, DL_NAME, delinearization_name, true, true)
+
+FunctionPass *llvm::createDelinearizationPass() { return new Delinearization; }
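For callers that want delinearization outside this test pass, a hedged sketch of the on-demand interface the header comment points to, using only the calls that appear in Delinearization::print() above (the helper name is invented):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// Given the SCEV of a pointer access, try to recover per-dimension subscripts
// and sizes of the underlying array.
static bool delinearizeAccess(ScalarEvolution &SE, const SCEV *AccessFn,
                              SmallVectorImpl<const SCEV *> &Subscripts,
                              SmallVectorImpl<const SCEV *> &Sizes) {
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn);
  if (!AR)
    return false;                          // only AddRecs are delinearized
  const SCEV *Rem = AR->delinearize(SE, Subscripts, Sizes);
  return Rem != AR && !Subscripts.empty(); // same failure check as print()
}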
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index a0f1a69..3b3e2ef 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -24,11 +24,11 @@
// Both of these are conservative weaknesses;
// that is, not a source of correctness problems.
//
-// The implementation depends on the GEP instruction to
-// differentiate subscripts. Since Clang linearizes subscripts
-// for most arrays, we give up some precision (though the existing MIV tests
-// will help). We trust that the GEP instruction will eventually be extended.
-// In the meantime, we should explore Maslov's ideas about delinearization.
+// The implementation depends on the GEP instruction to differentiate
+// subscripts. Since Clang linearizes some array subscripts, the dependence
+// analysis uses SCEV->delinearize to recover the representation of multiple
+// subscripts, and thus avoids the more expensive and less precise MIV tests. The
+// delinearization is controlled by the flag -da-delinearize.
//
// We should pay some careful attention to the possibility of integer overflow
// in the implementation of the various tests. This could happen with Add,
@@ -61,6 +61,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstIterator.h"
@@ -104,6 +105,10 @@ STATISTIC(BanerjeeApplications, "Banerjee applications");
STATISTIC(BanerjeeIndependence, "Banerjee independence");
STATISTIC(BanerjeeSuccesses, "Banerjee successes");
+static cl::opt<bool>
+Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Try to delinearize array references."));
+
//===----------------------------------------------------------------------===//
// basics
@@ -3171,6 +3176,55 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
llvm_unreachable("constraint has unexpected kind");
}
+/// Check if we can delinearize the subscripts. If the SCEVs representing the
+/// source and destination array references are recurrences on a nested loop,
+/// this function flattens the nested recurrences into separate recurrences
+/// for each loop level.
+bool
+DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV,
+ SmallVectorImpl<Subscript> &Pair) const {
+ const SCEVAddRecExpr *SrcAR = dyn_cast<SCEVAddRecExpr>(SrcSCEV);
+ const SCEVAddRecExpr *DstAR = dyn_cast<SCEVAddRecExpr>(DstSCEV);
+ if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine())
+ return false;
+
+ SmallVector<const SCEV *, 4> SrcSubscripts, DstSubscripts, SrcSizes, DstSizes;
+ SrcAR->delinearize(*SE, SrcSubscripts, SrcSizes);
+ DstAR->delinearize(*SE, DstSubscripts, DstSizes);
+
+ int size = SrcSubscripts.size();
+ int dstSize = DstSubscripts.size();
+ if (size != dstSize || size < 2)
+ return false;
+
+#ifndef NDEBUG
+ DEBUG(errs() << "\nSrcSubscripts: ");
+ for (int i = 0; i < size; i++)
+ DEBUG(errs() << *SrcSubscripts[i]);
+ DEBUG(errs() << "\nDstSubscripts: ");
+ for (int i = 0; i < size; i++)
+ DEBUG(errs() << *DstSubscripts[i]);
+#endif
+
+ // The delinearization transforms a single-subscript MIV dependence test into
+ // a multi-subscript SIV dependence test that is easier to compute. So we
+ // resize Pair to contain as many pairs of subscripts as the delinearization
+ // has found, and then initialize the pairs following the delinearization.
+ Pair.resize(size);
+ for (int i = 0; i < size; ++i) {
+ Pair[i].Src = SrcSubscripts[i];
+ Pair[i].Dst = DstSubscripts[i];
+
+ // FIXME: we should record the bounds SrcSizes[i] and DstSizes[i] that the
+ // delinearization has found, and add these constraints to the dependence
+ // check to avoid memory accesses overflow from one dimension into another.
+ // This is related to the problem of determining the existence of data
+ // dependences in array accesses using a different number of subscripts: in
+ // C one can access an array A[100][100]; as A[0][9999], *A[9999], etc.
+ }
+
+ return true;
+}
//===----------------------------------------------------------------------===//
@@ -3280,6 +3334,12 @@ Dependence *DependenceAnalysis::depends(Instruction *Src,
Pair[0].Dst = DstSCEV;
}
+ if (Delinearize && Pairs == 1 && CommonLevels > 1 &&
+ tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) {
+ DEBUG(dbgs() << " delinearized GEP\n");
+ Pairs = Pair.size();
+ }
+
for (unsigned P = 0; P < Pairs; ++P) {
Pair[P].Loops.resize(MaxLevels + 1);
Pair[P].GroupLoops.resize(MaxLevels + 1);
@@ -3698,6 +3758,12 @@ const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep,
Pair[0].Dst = DstSCEV;
}
+ if (Delinearize && Pairs == 1 && CommonLevels > 1 &&
+ tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) {
+ DEBUG(dbgs() << " delinearized GEP\n");
+ Pairs = Pair.size();
+ }
+
for (unsigned P = 0; P < Pairs; ++P) {
Pair[P].Loops.resize(MaxLevels + 1);
Pair[P].GroupLoops.resize(MaxLevels + 1);
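Illustrative only: the kind of nested access -da-delinearize is meant to help with. Clang lowers the subscripts of 'a' into one linearized GEP index; delinearization recovers one subscript per loop level, so the dependence can be tested with cheap SIV tests per dimension instead of a single, less precise MIV test.

void copy_rows(double a[100][100]) {
  for (int i = 1; i < 100; ++i)
    for (int j = 0; j < 100; ++j)
      a[i][j] = a[i - 1][j];   // dependence carried by the outer loop only
}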
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 7620fd9..f042964 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -6,11 +6,6 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file implements the CallGraph class and provides the BasicCallGraph
-// default implementation.
-//
-//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Instructions.h"
@@ -21,168 +16,92 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-namespace {
+CallGraph::CallGraph()
+ : ModulePass(ID), Root(0), ExternalCallingNode(0), CallsExternalNode(0) {
+ initializeCallGraphPass(*PassRegistry::getPassRegistry());
+}
-//===----------------------------------------------------------------------===//
-// BasicCallGraph class definition
-//
-class BasicCallGraph : public ModulePass, public CallGraph {
- // Root is root of the call graph, or the external node if a 'main' function
- // couldn't be found.
- //
- CallGraphNode *Root;
-
- // ExternalCallingNode - This node has edges to all external functions and
- // those internal functions that have their address taken.
- CallGraphNode *ExternalCallingNode;
-
- // CallsExternalNode - This node has edges to it from all functions making
- // indirect calls or calling an external function.
- CallGraphNode *CallsExternalNode;
-
-public:
- static char ID; // Class identification, replacement for typeinfo
- BasicCallGraph() : ModulePass(ID), Root(0),
- ExternalCallingNode(0), CallsExternalNode(0) {
- initializeBasicCallGraphPass(*PassRegistry::getPassRegistry());
- }
+void CallGraph::addToCallGraph(Function *F) {
+ CallGraphNode *Node = getOrInsertFunction(F);
- // runOnModule - Compute the call graph for the specified module.
- virtual bool runOnModule(Module &M) {
- CallGraph::initialize(M);
-
- ExternalCallingNode = getOrInsertFunction(0);
- CallsExternalNode = new CallGraphNode(0);
- Root = 0;
-
- // Add every function to the call graph.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
- addToCallGraph(I);
-
- // If we didn't find a main function, use the external call graph node
- if (Root == 0) Root = ExternalCallingNode;
-
- return false;
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- }
+ // If this function has external linkage, anything could call it.
+ if (!F->hasLocalLinkage()) {
+ ExternalCallingNode->addCalledFunction(CallSite(), Node);
- virtual void print(raw_ostream &OS, const Module *) const {
- OS << "CallGraph Root is: ";
- if (Function *F = getRoot()->getFunction())
- OS << F->getName() << "\n";
- else {
- OS << "<<null function: 0x" << getRoot() << ">>\n";
+ // Found the entry point?
+ if (F->getName() == "main") {
+ if (Root) // Found multiple external mains? Don't pick one.
+ Root = ExternalCallingNode;
+ else
+ Root = Node; // Found a main, keep track of it!
}
-
- CallGraph::print(OS, 0);
}
- virtual void releaseMemory() {
- destroy();
- }
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it should
- /// override this to adjust the this pointer as needed for the specified pass
- /// info.
- virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
- if (PI == &CallGraph::ID)
- return (CallGraph*)this;
- return this;
- }
-
- CallGraphNode* getExternalCallingNode() const { return ExternalCallingNode; }
- CallGraphNode* getCallsExternalNode() const { return CallsExternalNode; }
-
- // getRoot - Return the root of the call graph, which is either main, or if
- // main cannot be found, the external node.
- //
- CallGraphNode *getRoot() { return Root; }
- const CallGraphNode *getRoot() const { return Root; }
-
-private:
- //===---------------------------------------------------------------------
- // Implementation of CallGraph construction
- //
-
- // addToCallGraph - Add a function to the call graph, and link the node to all
- // of the functions that it calls.
- //
- void addToCallGraph(Function *F) {
- CallGraphNode *Node = getOrInsertFunction(F);
-
- // If this function has external linkage, anything could call it.
- if (!F->hasLocalLinkage()) {
- ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
- // Found the entry point?
- if (F->getName() == "main") {
- if (Root) // Found multiple external mains? Don't pick one.
- Root = ExternalCallingNode;
- else
- Root = Node; // Found a main, keep track of it!
+ // If this function has its address taken, anything could call it.
+ if (F->hasAddressTaken())
+ ExternalCallingNode->addCalledFunction(CallSite(), Node);
+
+ // If this function is not defined in this translation unit, it could call
+ // anything.
+ if (F->isDeclaration() && !F->isIntrinsic())
+ Node->addCalledFunction(CallSite(), CallsExternalNode);
+
+ // Look for calls by this function.
+ for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+ ++II) {
+ CallSite CS(cast<Value>(II));
+ if (CS) {
+ const Function *Callee = CS.getCalledFunction();
+ if (!Callee)
+ // Indirect calls of intrinsics are not allowed so no need to check.
+ Node->addCalledFunction(CS, CallsExternalNode);
+ else if (!Callee->isIntrinsic())
+ Node->addCalledFunction(CS, getOrInsertFunction(Callee));
}
}
+}
- // If this function has its address taken, anything could call it.
- if (F->hasAddressTaken())
- ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
- // If this function is not defined in this translation unit, it could call
- // anything.
- if (F->isDeclaration() && !F->isIntrinsic())
- Node->addCalledFunction(CallSite(), CallsExternalNode);
-
- // Look for calls by this function.
- for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
- II != IE; ++II) {
- CallSite CS(cast<Value>(II));
- if (CS) {
- const Function *Callee = CS.getCalledFunction();
- if (!Callee)
- // Indirect calls of intrinsics are not allowed so no need to check.
- Node->addCalledFunction(CS, CallsExternalNode);
- else if (!Callee->isIntrinsic())
- Node->addCalledFunction(CS, getOrInsertFunction(Callee));
- }
- }
- }
+void CallGraph::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
- //
- // destroy - Release memory for the call graph
- virtual void destroy() {
- /// CallsExternalNode is not in the function map, delete it explicitly.
- if (CallsExternalNode) {
- CallsExternalNode->allReferencesDropped();
- delete CallsExternalNode;
- CallsExternalNode = 0;
- }
- CallGraph::destroy();
- }
-};
+bool CallGraph::runOnModule(Module &M) {
+ Mod = &M;
-} //End anonymous namespace
+ ExternalCallingNode = getOrInsertFunction(0);
+ assert(!CallsExternalNode);
+ CallsExternalNode = new CallGraphNode(0);
+ Root = 0;
-INITIALIZE_ANALYSIS_GROUP(CallGraph, "Call Graph", BasicCallGraph)
-INITIALIZE_AG_PASS(BasicCallGraph, CallGraph, "basiccg",
- "Basic CallGraph Construction", false, true, true)
+ // Add every function to the call graph.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ addToCallGraph(I);
-char CallGraph::ID = 0;
-char BasicCallGraph::ID = 0;
+ // If we didn't find a main function, use the external call graph node
+ if (Root == 0)
+ Root = ExternalCallingNode;
-void CallGraph::initialize(Module &M) {
- Mod = &M;
+ return false;
}
-void CallGraph::destroy() {
- if (FunctionMap.empty()) return;
-
- // Reset all node's use counts to zero before deleting them to prevent an
- // assertion from firing.
+INITIALIZE_PASS(CallGraph, "basiccg", "CallGraph Construction", false, true)
+
+char CallGraph::ID = 0;
+
+void CallGraph::releaseMemory() {
+ /// CallsExternalNode is not in the function map, delete it explicitly.
+ if (CallsExternalNode) {
+ CallsExternalNode->allReferencesDropped();
+ delete CallsExternalNode;
+ CallsExternalNode = 0;
+ }
+
+ if (FunctionMap.empty())
+ return;
+
+// Reset all nodes' use counts to zero before deleting them to prevent an
+// assertion from firing.
#ifndef NDEBUG
for (FunctionMapTy::iterator I = FunctionMap.begin(), E = FunctionMap.end();
I != E; ++I)
@@ -195,7 +114,14 @@ void CallGraph::destroy() {
FunctionMap.clear();
}
-void CallGraph::print(raw_ostream &OS, Module*) const {
+void CallGraph::print(raw_ostream &OS, const Module*) const {
+ OS << "CallGraph Root is: ";
+ if (Function *F = Root->getFunction())
+ OS << F->getName() << "\n";
+ else {
+ OS << "<<null function: 0x" << Root << ">>\n";
+ }
+
for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
I->second->print(OS);
}
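
A rough standalone sketch of the invariant the rewritten addToCallGraph maintains (a simplified model under stated assumptions, not the CallGraph API itself): externally visible functions get a conservative edge from the synthetic external-calling node, and bodiless declarations get a conservative edge to the calls-external node.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Hypothetical mini-model of the two synthetic nodes the pass keeps.
struct MiniCallGraph {
  std::map<std::string, std::vector<std::string> > Edges;
  void addFunction(const std::string &Name, bool ExternallyVisible,
                   bool IsDeclaration) {
    if (ExternallyVisible)
      Edges["<external calling>"].push_back(Name); // anything could call it
    if (IsDeclaration)
      Edges[Name].push_back("<calls external>");   // it could call anything
  }
};

int main() {
  MiniCallGraph CG;
  CG.addFunction("main", /*ExternallyVisible=*/true, /*IsDeclaration=*/false);
  CG.addFunction("printf", /*ExternallyVisible=*/true, /*IsDeclaration=*/true);
  for (std::map<std::string, std::vector<std::string> >::iterator
           I = CG.Edges.begin(), E = CG.Edges.end(); I != E; ++I)
    for (unsigned J = 0; J != I->second.size(); ++J)
      std::cout << I->first << " -> " << I->second[J] << "\n";
}
```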
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index a0d788f..182beca 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -22,7 +22,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/PassManagers.h"
+#include "llvm/IR/LegacyPassManagers.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Timer.h"
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index 92d0d23..7ec4644 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -189,7 +189,7 @@ char GlobalsModRef::ID = 0;
INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis,
"globalsmodref-aa", "Simple mod/ref analysis for globals",
false, true, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis,
"globalsmodref-aa", "Simple mod/ref analysis for globals",
false, true, false)
diff --git a/lib/Analysis/IPA/IPA.cpp b/lib/Analysis/IPA/IPA.cpp
index 1c1816d..47357cf 100644
--- a/lib/Analysis/IPA/IPA.cpp
+++ b/lib/Analysis/IPA/IPA.cpp
@@ -19,8 +19,7 @@ using namespace llvm;
/// initializeIPA - Initialize all passes linked into the IPA library.
void llvm::initializeIPA(PassRegistry &Registry) {
- initializeBasicCallGraphPass(Registry);
- initializeCallGraphAnalysisGroup(Registry);
+ initializeCallGraphPass(Registry);
initializeCallGraphPrinterPass(Registry);
initializeCallGraphViewerPass(Registry);
initializeFindUsedTypesPass(Registry);
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 37d73a8..3bc796e 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -59,6 +59,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool ExposesReturnsTwice;
bool HasDynamicAlloca;
bool ContainsNoDuplicateCall;
+ bool HasReturn;
+ bool HasIndirectBr;
/// Number of bytes allocated statically by the callee.
uint64_t AllocatedSize;
@@ -132,6 +134,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool visitExtractValue(ExtractValueInst &I);
bool visitInsertValue(InsertValueInst &I);
bool visitCallSite(CallSite CS);
+ bool visitReturnInst(ReturnInst &RI);
+ bool visitBranchInst(BranchInst &BI);
+ bool visitSwitchInst(SwitchInst &SI);
+ bool visitIndirectBrInst(IndirectBrInst &IBI);
+ bool visitResumeInst(ResumeInst &RI);
+ bool visitUnreachableInst(UnreachableInst &I);
public:
CallAnalyzer(const DataLayout *TD, const TargetTransformInfo &TTI,
@@ -139,12 +147,13 @@ public:
: TD(TD), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
- ContainsNoDuplicateCall(false), AllocatedSize(0), NumInstructions(0),
- NumVectorInstructions(0), FiftyPercentVectorBonus(0),
- TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
- NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
- NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
- SROACostSavings(0), SROACostSavingsLost(0) {}
+ ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
+ AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+ FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+ NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+ NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+ NumInstructionsSimplified(0), SROACostSavings(0),
+ SROACostSavingsLost(0) {}
bool analyzeCall(CallSite CS);
@@ -704,7 +713,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
}
bool CallAnalyzer::visitCallSite(CallSite CS) {
- if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
+ if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::ReturnsTwice)) {
// This aborts the entire analysis.
@@ -785,6 +794,60 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
return Base::visitCallSite(CS);
}
+bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
+ // At least one return instruction will be free after inlining.
+ bool Free = !HasReturn;
+ HasReturn = true;
+ return Free;
+}
+
+bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
+ // We model unconditional branches as essentially free -- they really
+ // shouldn't exist at all, but handling them makes the behavior of the
+ // inliner more regular and predictable. Interestingly, conditional branches
+ // which will fold away are also free.
+ return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
+ dyn_cast_or_null<ConstantInt>(
+ SimplifiedValues.lookup(BI.getCondition()));
+}
+
+bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
+ // We model switches whose condition folds to a constant as free; see the
+ // comments on handling branches.
+ return isa<ConstantInt>(SI.getCondition()) ||
+ dyn_cast_or_null<ConstantInt>(
+ SimplifiedValues.lookup(SI.getCondition()));
+}
+
+bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
+ // We never want to inline functions that contain an indirectbr. This is
+ // incorrect because all the blockaddresses (in static global initializers,
+ // for example) would be referring to the original function, and this
+ // indirect jump would jump from the inlined copy of the function into the
+ // original function, which is extremely undefined behavior.
+ // FIXME: This logic isn't really right; we can safely inline functions with
+ // indirectbr's as long as no other function or global references the
+ // blockaddress of a block within the current function. And as a QOI issue,
+ // if someone is using a blockaddress without an indirectbr, and that
+ // reference somehow ends up in another function or global, we probably don't
+ // want to inline this function.
+ HasIndirectBr = true;
+ return false;
+}
+
+bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
+ // FIXME: It's not clear that a single instruction is an accurate model for
+ // the inline cost of a resume instruction.
+ return false;
+}
+
+bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
+ // FIXME: It might be reasonable to discount the cost of instructions leading
+ // to unreachable as they have the lowest possible impact on both runtime and
+ // code size.
+ return true; // No actual code is needed for unreachable.
+}
+
bool CallAnalyzer::visitInstruction(Instruction &I) {
// Some instructions are free. All of the free intrinsics can also be
// handled by SROA, etc.
@@ -808,8 +871,7 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
- for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
- I != E; ++I) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
++NumInstructions;
if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
++NumVectorInstructions;
@@ -825,7 +887,8 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
Cost += InlineConstants::InstrCost;
// If visiting this instruction detected an uninlinable pattern, abort.
- if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+ if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+ HasIndirectBr)
return false;
// If the caller is a recursive function then we don't want to inline
@@ -989,10 +1052,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
}
}
- // Track whether we've seen a return instruction. The first return
- // instruction is free, as at least one will usually disappear in inlining.
- bool HasReturn = false;
-
// Populate our simplified values by mapping from function arguments to call
// arguments with known important simplifications.
CallSite::arg_iterator CAI = CS.arg_begin();
@@ -1039,33 +1098,11 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
if (BB->empty())
continue;
- // Handle the terminator cost here where we can track returns and other
- // function-wide constructs.
- TerminatorInst *TI = BB->getTerminator();
-
- // We never want to inline functions that contain an indirectbr. This is
- // incorrect because all the blockaddress's (in static global initializers
- // for example) would be referring to the original function, and this
- // indirect jump would jump from the inlined copy of the function into the
- // original function which is extremely undefined behavior.
- // FIXME: This logic isn't really right; we can safely inline functions
- // with indirectbr's as long as no other function or global references the
- // blockaddress of a block within the current function. And as a QOI issue,
- // if someone is using a blockaddress without an indirectbr, and that
- // reference somehow ends up in another function or global, we probably
- // don't want to inline this function.
- if (isa<IndirectBrInst>(TI))
- return false;
-
- if (!HasReturn && isa<ReturnInst>(TI))
- HasReturn = true;
- else
- Cost += InlineConstants::InstrCost;
-
// Analyze the cost of this block. If we blow through the threshold, this
// returns false, and we can bail out.
if (!analyzeBlock(BB)) {
- if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+ if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+ HasIndirectBr)
return false;
// If the caller is a recursive function then we don't want to inline
@@ -1078,6 +1115,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
break;
}
+ TerminatorInst *TI = BB->getTerminator();
+
// Add in the live successors by first checking whether we have terminator
// that may be simplified based on the values simplified by this call.
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -1171,6 +1210,22 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
return getInlineCost(CS, CS.getCalledFunction(), Threshold);
}
+/// \brief Test whether two functions either both have or both lack the given
+/// attribute.
+static bool attributeMatches(Function *F1, Function *F2,
+ Attribute::AttrKind Attr) {
+ return F1->hasFnAttribute(Attr) == F2->hasFnAttribute(Attr);
+}
+
+/// \brief Test that there are no attribute conflicts between Caller and Callee
+/// that prevent inlining.
+static bool functionsHaveCompatibleAttributes(Function *Caller,
+ Function *Callee) {
+ return attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+ attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
+ attributeMatches(Caller, Callee, Attribute::SanitizeThread);
+}
+
InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
int Threshold) {
// Cannot inline indirect calls.
@@ -1179,20 +1234,26 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
// Calls to functions with always-inline attributes should be inlined
// whenever possible.
- if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::AlwaysInline)) {
+ if (Callee->hasFnAttribute(Attribute::AlwaysInline)) {
if (isInlineViable(*Callee))
return llvm::InlineCost::getAlways();
return llvm::InlineCost::getNever();
}
+ // Never inline functions with conflicting attributes (unless callee has
+ // always-inline attribute).
+ if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee))
+ return llvm::InlineCost::getNever();
+
+ // Don't inline this call if the caller has the optnone attribute.
+ if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
+ return llvm::InlineCost::getNever();
+
// Don't inline functions which can be redefined at link-time to mean
// something else. Don't inline functions marked noinline or call sites
// marked noinline.
if (Callee->mayBeOverridden() ||
- Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoInline) ||
- CS.isNoInline())
+ Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
return llvm::InlineCost::getNever();
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
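
A minimal sketch of the compatibility rule added above, under the assumption that its purpose is to keep sanitizer instrumentation consistent across an inlined call boundary; the enum and helper below are hypothetical stand-ins, not LLVM's Attribute machinery.

```cpp
#include <bitset>
#include <iostream>

// Hypothetical attribute kinds; only the "must match on both sides" rule
// mirrors functionsHaveCompatibleAttributes().
enum SanitizerAttr { SanitizeAddress, SanitizeMemory, SanitizeThread, NumAttrs };

static bool functionsCompatible(std::bitset<NumAttrs> Caller,
                                std::bitset<NumAttrs> Callee) {
  // Every sanitizer attribute must match, mirroring attributeMatches().
  return Caller == Callee;
}

int main() {
  std::bitset<NumAttrs> Caller, Callee;
  Caller.set(SanitizeAddress); // caller built with ASan, callee without
  std::cout << functionsCompatible(Caller, Callee) << "\n"; // 0: never inline
  Callee.set(SanitizeAddress);
  std::cout << functionsCompatible(Caller, Callee) << "\n"; // 1: cost model decides
}
```

Mixing instrumented and uninstrumented code inside one function would give the sanitizer runtime an inconsistent view of that function, which is why a mismatch returns getNever() regardless of cost.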
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index b275dfe..b867af1 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -668,7 +668,8 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
/// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc.
/// folding.
static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
- Value *&V) {
+ Value *&V,
+ bool AllowNonInbounds = false) {
assert(V->getType()->getScalarType()->isPointerTy());
// Without DataLayout, just be conservative for now. Theoretically, more could
@@ -685,7 +686,8 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
Visited.insert(V);
do {
if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(*TD, Offset))
+ if ((!AllowNonInbounds && !GEP->isInBounds()) ||
+ !GEP->accumulateConstantOffset(*TD, Offset))
break;
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
@@ -1737,7 +1739,7 @@ static Constant *computePointerICmp(const DataLayout *TD,
RHS = RHS->stripPointerCasts();
// A non-null pointer is not equal to a null pointer.
- if (llvm::isKnownNonNull(LHS) && isa<ConstantPointerNull>(RHS) &&
+ if (llvm::isKnownNonNull(LHS, TLI) && isa<ConstantPointerNull>(RHS) &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
@@ -1837,6 +1839,17 @@ static Constant *computePointerICmp(const DataLayout *TD,
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
}
+
+ // Even if a non-inbounds GEP occurs along the path we can still optimize
+ // equality comparisons concerning the result. We avoid walking the whole
+ // chain again by starting where the last calls to
+ // stripAndComputeConstantOffsets left off and accumulating the offsets.
+ Constant *LHSNoBound = stripAndComputeConstantOffsets(TD, LHS, true);
+ Constant *RHSNoBound = stripAndComputeConstantOffsets(TD, RHS, true);
+ if (LHS == RHS)
+ return ConstantExpr::getICmp(Pred,
+ ConstantExpr::getAdd(LHSOffset, LHSNoBound),
+ ConstantExpr::getAdd(RHSOffset, RHSNoBound));
}
// Otherwise, fail.
@@ -2946,6 +2959,7 @@ static bool IsIdempotent(Intrinsic::ID ID) {
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
+ case Intrinsic::round:
return true;
}
}
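
A hypothetical restatement of the folding the new AllowNonInbounds path enables: once both pointers strip down to the same underlying object, an equality or inequality predicate depends only on the accumulated byte offsets, so whether the GEPs were inbounds is irrelevant for EQ/NE. The struct and helper below are illustrative only.

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical result of stripping a pointer to its base plus a constant
// byte offset, whether or not the GEPs along the way were inbounds.
struct StrippedPtr {
  const void *Base;
  int64_t Offset;
};

static bool foldEquality(const StrippedPtr &L, const StrippedPtr &R,
                         bool &Result) {
  if (L.Base != R.Base)
    return false;                    // different bases: cannot fold here
  Result = (L.Offset == R.Offset);   // same base: compare offsets only
  return true;
}

int main() {
  char Buf[16];
  StrippedPtr P = {Buf, 4};  // e.g. a non-inbounds GEP 4 bytes into Buf
  StrippedPtr Q = {Buf, 8};  // another GEP 8 bytes into Buf
  bool Eq;
  if (foldEquality(P, Q, Eq))
    std::cout << "p == q folds to " << Eq << "\n"; // folds to 0 (false)
}
```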
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 9393508..ec17f47 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -207,7 +207,7 @@ void Lint::visitCallSite(CallSite CS) {
&I);
FunctionType *FT = F->getFunctionType();
- unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+ unsigned NumActualArgs = CS.arg_size();
Assert1(FT->isVarArg() ?
FT->getNumParams() <= NumActualArgs :
@@ -504,14 +504,42 @@ void Lint::visitShl(BinaryOperator &I) {
"Undefined result: Shift count out of range", &I);
}
-static bool isZero(Value *V, DataLayout *TD) {
+static bool isZero(Value *V, DataLayout *DL) {
// Assume undef could be zero.
- if (isa<UndefValue>(V)) return true;
+ if (isa<UndefValue>(V))
+ return true;
+
+ VectorType *VecTy = dyn_cast<VectorType>(V->getType());
+ if (!VecTy) {
+ unsigned BitWidth = V->getType()->getIntegerBitWidth();
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ ComputeMaskedBits(V, KnownZero, KnownOne, DL);
+ return KnownZero.isAllOnesValue();
+ }
+
+ // Per-component check doesn't work with zeroinitializer
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+
+ if (C->isZeroValue())
+ return true;
+
+ // For a vector, KnownZero will only be true if all values are zero, so check
+ // this per component
+ unsigned BitWidth = VecTy->getElementType()->getIntegerBitWidth();
+ for (unsigned I = 0, N = VecTy->getNumElements(); I != N; ++I) {
+ Constant *Elem = C->getAggregateElement(I);
+ if (isa<UndefValue>(Elem))
+ return true;
+
+ APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ ComputeMaskedBits(Elem, KnownZero, KnownOne, DL);
+ if (KnownZero.isAllOnesValue())
+ return true;
+ }
- unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- ComputeMaskedBits(V, KnownZero, KnownOne, TD);
- return KnownZero.isAllOnesValue();
+ return false;
}
void Lint::visitSDiv(BinaryOperator &I) {
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 142ebed..e369633 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -177,10 +177,6 @@ PHINode *Loop::getCanonicalInductionVariable() const {
/// isLCSSAForm - Return true if the Loop is in LCSSA form
bool Loop::isLCSSAForm(DominatorTree &DT) const {
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallPtrSet<BasicBlock*, 16> LoopBBs(block_begin(), block_end());
-
for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) {
BasicBlock *BB = *BI;
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I)
@@ -196,7 +192,7 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const {
// block they are defined in. Also, blocks not reachable from the
// entry are special; uses in them don't need to go through PHIs.
if (UserBB != BB &&
- !LoopBBs.count(UserBB) &&
+ !contains(UserBB) &&
DT.isReachableFromEntry(UserBB))
return false;
}
@@ -220,12 +216,12 @@ bool Loop::isSafeToClone() const {
// Return false if any loop blocks contain indirectbrs, or there are any calls
// to noduplicate functions.
for (Loop::block_iterator I = block_begin(), E = block_end(); I != E; ++I) {
- if (isa<IndirectBrInst>((*I)->getTerminator())) {
+ if (isa<IndirectBrInst>((*I)->getTerminator()))
return false;
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) {
+
+ if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator()))
if (II->hasFnAttr(Attribute::NoDuplicate))
return false;
- }
for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) {
if (const CallInst *CI = dyn_cast<CallInst>(BI)) {
@@ -309,15 +305,15 @@ bool Loop::isAnnotatedParallel() const {
if (!II->mayReadOrWriteMemory())
continue;
- if (!II->getMetadata("llvm.mem.parallel_loop_access"))
- return false;
-
// The memory instruction can refer to the loop identifier metadata
// directly or indirectly through another list metadata (in case of
// nested parallel loops). The loop identifier metadata refers to
// itself so we can check both cases with the same routine.
- MDNode *loopIdMD =
- dyn_cast<MDNode>(II->getMetadata("llvm.mem.parallel_loop_access"));
+ MDNode *loopIdMD = II->getMetadata("llvm.mem.parallel_loop_access");
+
+ if (!loopIdMD)
+ return false;
+
bool loopIdMDFound = false;
for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) {
if (loopIdMD->getOperand(i) == desiredLoopIdMetadata) {
@@ -337,9 +333,6 @@ bool Loop::isAnnotatedParallel() const {
/// hasDedicatedExits - Return true if no exit block for the loop
/// has a predecessor that is outside the loop.
bool Loop::hasDedicatedExits() const {
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallPtrSet<BasicBlock *, 16> LoopBBs(block_begin(), block_end());
// Each predecessor of each exit block of a normal loop is contained
// within the loop.
SmallVector<BasicBlock *, 4> ExitBlocks;
@@ -347,7 +340,7 @@ bool Loop::hasDedicatedExits() const {
for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
for (pred_iterator PI = pred_begin(ExitBlocks[i]),
PE = pred_end(ExitBlocks[i]); PI != PE; ++PI)
- if (!LoopBBs.count(*PI))
+ if (!contains(*PI))
return false;
// All the requirements are met.
return true;
@@ -362,11 +355,6 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
assert(hasDedicatedExits() &&
"getUniqueExitBlocks assumes the loop has canonical form exits!");
- // Sort the blocks vector so that we can use binary search to do quick
- // lookups.
- SmallVector<BasicBlock *, 128> LoopBBs(block_begin(), block_end());
- std::sort(LoopBBs.begin(), LoopBBs.end());
-
SmallVector<BasicBlock *, 32> switchExitBlocks;
for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
@@ -376,7 +364,7 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
for (succ_iterator I = succ_begin(*BI), E = succ_end(*BI); I != E; ++I) {
// If block is inside the loop then it is not an exit block.
- if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+ if (contains(*I))
continue;
pred_iterator PI = pred_begin(*I);
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 0f0a1c9..1db0f63 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -31,12 +31,13 @@
using namespace llvm;
enum AllocType {
- MallocLike = 1<<0, // allocates
- CallocLike = 1<<1, // allocates + bzero
- ReallocLike = 1<<2, // reallocates
- StrDupLike = 1<<3,
+ OpNewLike = 1<<0, // allocates; never returns null
+ MallocLike = 1<<1 | OpNewLike, // allocates; may return null
+ CallocLike = 1<<2, // allocates + bzero
+ ReallocLike = 1<<3, // reallocates
+ StrDupLike = 1<<4,
AllocLike = MallocLike | CallocLike | StrDupLike,
- AnyAlloc = MallocLike | CallocLike | ReallocLike | StrDupLike
+ AnyAlloc = AllocLike | ReallocLike
};
struct AllocFnsTy {
@@ -52,20 +53,20 @@ struct AllocFnsTy {
static const AllocFnsTy AllocationFnData[] = {
{LibFunc::malloc, MallocLike, 1, 0, -1},
{LibFunc::valloc, MallocLike, 1, 0, -1},
- {LibFunc::Znwj, MallocLike, 1, 0, -1}, // new(unsigned int)
+ {LibFunc::Znwj, OpNewLike, 1, 0, -1}, // new(unsigned int)
{LibFunc::ZnwjRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
- {LibFunc::Znwm, MallocLike, 1, 0, -1}, // new(unsigned long)
+ {LibFunc::Znwm, OpNewLike, 1, 0, -1}, // new(unsigned long)
{LibFunc::ZnwmRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
- {LibFunc::Znaj, MallocLike, 1, 0, -1}, // new[](unsigned int)
+ {LibFunc::Znaj, OpNewLike, 1, 0, -1}, // new[](unsigned int)
{LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
- {LibFunc::Znam, MallocLike, 1, 0, -1}, // new[](unsigned long)
+ {LibFunc::Znam, OpNewLike, 1, 0, -1}, // new[](unsigned long)
{LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
- {LibFunc::posix_memalign, MallocLike, 3, 2, -1},
{LibFunc::calloc, CallocLike, 2, 0, 1},
{LibFunc::realloc, ReallocLike, 2, 1, -1},
{LibFunc::reallocf, ReallocLike, 2, 1, -1},
{LibFunc::strdup, StrDupLike, 1, -1, -1},
{LibFunc::strndup, StrDupLike, 2, 1, -1}
+ // TODO: Handle "int posix_memalign(void **, size_t, size_t)"
};
@@ -117,7 +118,7 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
return 0;
const AllocFnsTy *FnData = &AllocationFnData[i];
- if ((FnData->AllocTy & AllocTy) == 0)
+ if ((FnData->AllocTy & AllocTy) != FnData->AllocTy)
return 0;
// Check function prototype.
@@ -189,6 +190,13 @@ bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast);
}
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory and never returns null (such as operator new).
+bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast);
+}
+
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
@@ -197,7 +205,7 @@ const CallInst *llvm::extractMallocCall(const Value *I,
return isMallocLikeFn(I, TLI) ? dyn_cast<CallInst>(I) : 0;
}
-static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
+static Value *computeArraySize(const CallInst *CI, const DataLayout *DL,
const TargetLibraryInfo *TLI,
bool LookThroughSExt = false) {
if (!CI)
@@ -205,12 +213,12 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
// The size of the malloc's result type must be known to determine array size.
Type *T = getMallocAllocatedType(CI, TLI);
- if (!T || !T->isSized() || !TD)
+ if (!T || !T->isSized() || !DL)
return 0;
- unsigned ElementSize = TD->getTypeAllocSize(T);
+ unsigned ElementSize = DL->getTypeAllocSize(T);
if (StructType *ST = dyn_cast<StructType>(T))
- ElementSize = TD->getStructLayout(ST)->getSizeInBytes();
+ ElementSize = DL->getStructLayout(ST)->getSizeInBytes();
// If malloc call's arg can be determined to be a multiple of ElementSize,
// return the multiple. Otherwise, return NULL.
@@ -227,10 +235,10 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
/// is a call to malloc whose array size can be determined and the array size
/// is not constant 1. Otherwise, return NULL.
const CallInst *llvm::isArrayMalloc(const Value *I,
- const DataLayout *TD,
+ const DataLayout *DL,
const TargetLibraryInfo *TLI) {
const CallInst *CI = extractMallocCall(I, TLI);
- Value *ArraySize = computeArraySize(CI, TD, TLI);
+ Value *ArraySize = computeArraySize(CI, DL, TLI);
if (ConstantInt *ConstSize = dyn_cast_or_null<ConstantInt>(ArraySize))
if (ConstSize->isOne())
@@ -288,11 +296,11 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI,
/// then return that multiple. For non-array mallocs, the multiple is
/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be
/// determined.
-Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *TD,
+Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *DL,
const TargetLibraryInfo *TLI,
bool LookThroughSExt) {
assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
- return computeArraySize(CI, TD, TLI, LookThroughSExt);
+ return computeArraySize(CI, DL, TLI, LookThroughSExt);
}
@@ -354,12 +362,12 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
/// object size in Size if successful, and false otherwise.
/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
/// byval arguments, and global variables.
-bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *DL,
const TargetLibraryInfo *TLI, bool RoundToAlign) {
- if (!TD)
+ if (!DL)
return false;
- ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), RoundToAlign);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (!Visitor.bothKnown(Data))
return false;
@@ -386,12 +394,12 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
return Size;
}
-ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD,
+ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *DL,
const TargetLibraryInfo *TLI,
LLVMContext &Context,
bool RoundToAlign)
-: TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) {
- IntegerType *IntTy = TD->getIntPtrType(Context);
+: DL(DL), TLI(TLI), RoundToAlign(RoundToAlign) {
+ IntegerType *IntTy = DL->getIntPtrType(Context);
IntTyBits = IntTy->getBitWidth();
Zero = APInt::getNullValue(IntTyBits);
}
@@ -434,7 +442,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
if (!I.getAllocatedType()->isSized())
return unknown();
- APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType()));
+ APInt Size(IntTyBits, DL->getTypeAllocSize(I.getAllocatedType()));
if (!I.isArrayAllocation())
return std::make_pair(align(Size, I.getAlignment()), Zero);
@@ -453,7 +461,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
return unknown();
}
PointerType *PT = cast<PointerType>(A.getType());
- APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType()));
+ APInt Size(IntTyBits, DL->getTypeAllocSize(PT->getElementType()));
return std::make_pair(align(Size, A.getParamAlignment()), Zero);
}
@@ -526,7 +534,7 @@ ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) {
SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
SizeOffsetType PtrData = compute(GEP.getPointerOperand());
APInt Offset(IntTyBits, 0);
- if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*TD, Offset))
+ if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*DL, Offset))
return unknown();
return std::make_pair(PtrData.first, PtrData.second + Offset);
@@ -542,7 +550,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){
if (!GV.hasDefinitiveInitializer())
return unknown();
- APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType()));
+ APInt Size(IntTyBits, DL->getTypeAllocSize(GV.getType()->getElementType()));
return std::make_pair(align(Size, GV.getAlignment()), Zero);
}
@@ -578,12 +586,13 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
return unknown();
}
-
-ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *TD,
- const TargetLibraryInfo *TLI,
- LLVMContext &Context)
-: TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) {
- IntTy = TD->getIntPtrType(Context);
+ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *DL,
+ const TargetLibraryInfo *TLI,
+ LLVMContext &Context,
+ bool RoundToAlign)
+: DL(DL), TLI(TLI), Context(Context), Builder(Context, TargetFolder(DL)),
+ RoundToAlign(RoundToAlign) {
+ IntTy = DL->getIntPtrType(Context);
Zero = ConstantInt::get(IntTy, 0);
}
@@ -607,7 +616,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
- ObjectSizeOffsetVisitor Visitor(TD, TLI, Context);
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, RoundToAlign);
SizeOffsetType Const = Visitor.compute(V);
if (Visitor.bothKnown(Const))
return std::make_pair(ConstantInt::get(Context, Const.first),
@@ -626,13 +635,15 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
Builder.SetInsertPoint(I);
- // record the pointers that were handled in this run, so that they can be
- // cleaned later if something fails
- SeenVals.insert(V);
-
// now compute the size and offset
SizeOffsetEvalType Result;
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+
+ // Record the pointers that were handled in this run, so that they can be
+ // cleaned later if something fails. We also use this set to break cycles that
+ // can occur in dead code.
+ if (!SeenVals.insert(V)) {
+ Result = unknown();
+ } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
Result = visitGEPOperator(*GEP);
} else if (Instruction *I = dyn_cast<Instruction>(V)) {
Result = visit(*I);
@@ -665,7 +676,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
assert(I.isArrayAllocation());
Value *ArraySize = I.getArraySize();
Value *Size = ConstantInt::get(ArraySize->getType(),
- TD->getTypeAllocSize(I.getAllocatedType()));
+ DL->getTypeAllocSize(I.getAllocatedType()));
Size = Builder.CreateMul(Size, ArraySize);
return std::make_pair(Size, Zero);
}
@@ -717,7 +728,7 @@ ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) {
if (!bothKnown(PtrData))
return unknown();
- Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true);
+ Value *Offset = EmitGEPOffset(&Builder, *DL, &GEP, /*NoAssumptions=*/true);
Offset = Builder.CreateAdd(PtrData.second, Offset);
return std::make_pair(PtrData.first, Offset);
}
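
A small self-contained restatement of the new AllocType encoding and the subset check in getAllocationData (the numeric values are illustrative; only the bit relationships matter): because the OpNewLike bit is folded into MallocLike, a MallocLike query still matches operator new, while an OpNewLike query rejects plain malloc, which may return null.

```cpp
#include <iostream>

// Hypothetical mirror of the flag scheme above.
enum AllocType {
  OpNewLike  = 1 << 0,               // allocates; never returns null
  MallocLike = (1 << 1) | OpNewLike, // allocates; may return null
  CallocLike = 1 << 2
};

// Mirrors the new check: the function's own flags must be a subset of the
// queried category for it to match.
static bool matchesQuery(unsigned FnFlags, unsigned Query) {
  return (FnFlags & Query) == FnFlags;
}

int main() {
  std::cout << matchesQuery(OpNewLike, MallocLike) << "\n"; // 1: new is malloc-like
  std::cout << matchesQuery(MallocLike, OpNewLike) << "\n"; // 0: malloc may be null
}
```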
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index fe1c874..84ff2ee 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -371,18 +371,19 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
// Walk backwards through the basic block, looking for dependencies.
while (ScanIt != BB->begin()) {
+ Instruction *Inst = --ScanIt;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+ // Debug intrinsics don't (and can't) cause dependencies.
+ if (isa<DbgInfoIntrinsic>(II)) continue;
+
// Limit the amount of scanning we do so we don't end up with quadratic
// running time on extreme testcases.
--Limit;
if (!Limit)
return MemDepResult::getUnknown();
- Instruction *Inst = --ScanIt;
-
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- // Debug intrinsics don't (and can't) cause dependences.
- if (isa<DbgInfoIntrinsic>(II)) continue;
-
// If we reach a lifetime begin or end marker, then the query ends here
// because the value is undefined.
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
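
A hypothetical sketch of why the debug-intrinsic check moves above the limit decrement: instructions that are skipped outright should not spend the scan budget, so the amount of debug info in a block cannot change how far back the dependence scan reaches.

```cpp
#include <iostream>
#include <vector>

// Toy scan where 0 stands in for a debug intrinsic and 1 for a real
// instruction; skipped entries are free, only real ones spend the budget.
static int scanWithBudget(const std::vector<int> &Insts, int Limit) {
  int RealVisited = 0;
  for (size_t I = 0; I != Insts.size(); ++I) {
    if (Insts[I] == 0)  // debug intrinsic: skip without charging the limit
      continue;
    if (--Limit == 0)   // only real instructions consume the budget
      break;
    ++RealVisited;
  }
  return RealVisited;
}

int main() {
  std::vector<int> Block;
  int Raw[] = {1, 0, 0, 1, 1, 1};
  Block.assign(Raw, Raw + 6);
  // Reaches 2 real instructions; charging the debug entries against the
  // budget would have stopped the scan after only 1.
  std::cout << scanWithBudget(Block, 3) << "\n";
}
```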
diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp
deleted file mode 100644
index 30d213b..0000000
--- a/lib/Analysis/PathNumbering.cpp
+++ /dev/null
@@ -1,521 +0,0 @@
-//===- PathNumbering.cpp --------------------------------------*- C++ -*---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Ball-Larus path numbers uniquely identify paths through a directed acyclic
-// graph (DAG) [Ball96]. For a CFG backedges are removed and replaced by phony
-// edges to obtain a DAG, and thus the unique path numbers [Ball96].
-//
-// The purpose of this analysis is to enumerate the edges in a CFG in order
-// to obtain paths from path numbers in a convenient manner. As described in
-// [Ball96] edges can be enumerated such that given a path number by following
-// the CFG and updating the path number, the path is obtained.
-//
-// [Ball96]
-// T. Ball and J. R. Larus. "Efficient Path Profiling."
-// International Symposium on Microarchitecture, pages 46-57, 1996.
-// http://portal.acm.org/citation.cfm?id=243857
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "ball-larus-numbering"
-
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <queue>
-#include <sstream>
-#include <stack>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-// Are we enabling early termination
-static cl::opt<bool> ProcessEarlyTermination(
- "path-profile-early-termination", cl::Hidden,
- cl::desc("In path profiling, insert extra instrumentation to account for "
- "unexpected function termination."));
-
-// Returns the basic block for the BallLarusNode
-BasicBlock* BallLarusNode::getBlock() {
- return(_basicBlock);
-}
-
-// Returns the number of paths to the exit starting at the node.
-unsigned BallLarusNode::getNumberPaths() {
- return(_numberPaths);
-}
-
-// Sets the number of paths to the exit starting at the node.
-void BallLarusNode::setNumberPaths(unsigned numberPaths) {
- _numberPaths = numberPaths;
-}
-
-// Gets the NodeColor used in graph algorithms.
-BallLarusNode::NodeColor BallLarusNode::getColor() {
- return(_color);
-}
-
-// Sets the NodeColor used in graph algorithms.
-void BallLarusNode::setColor(BallLarusNode::NodeColor color) {
- _color = color;
-}
-
-// Returns an iterator over predecessor edges. Includes phony and
-// backedges.
-BLEdgeIterator BallLarusNode::predBegin() {
- return(_predEdges.begin());
-}
-
-// Returns the end sentinel for the predecessor iterator.
-BLEdgeIterator BallLarusNode::predEnd() {
- return(_predEdges.end());
-}
-
-// Returns the number of predecessor edges. Includes phony and
-// backedges.
-unsigned BallLarusNode::getNumberPredEdges() {
- return(_predEdges.size());
-}
-
-// Returns an iterator over successor edges. Includes phony and
-// backedges.
-BLEdgeIterator BallLarusNode::succBegin() {
- return(_succEdges.begin());
-}
-
-// Returns the end sentinel for the successor iterator.
-BLEdgeIterator BallLarusNode::succEnd() {
- return(_succEdges.end());
-}
-
-// Returns the number of successor edges. Includes phony and
-// backedges.
-unsigned BallLarusNode::getNumberSuccEdges() {
- return(_succEdges.size());
-}
-
-// Add an edge to the predecessor list.
-void BallLarusNode::addPredEdge(BallLarusEdge* edge) {
- _predEdges.push_back(edge);
-}
-
-// Remove an edge from the predecessor list.
-void BallLarusNode::removePredEdge(BallLarusEdge* edge) {
- removeEdge(_predEdges, edge);
-}
-
-// Add an edge to the successor list.
-void BallLarusNode::addSuccEdge(BallLarusEdge* edge) {
- _succEdges.push_back(edge);
-}
-
-// Remove an edge from the successor list.
-void BallLarusNode::removeSuccEdge(BallLarusEdge* edge) {
- removeEdge(_succEdges, edge);
-}
-
-// Returns the name of the BasicBlock being represented. If BasicBlock
-// is null then returns "<null>". If BasicBlock has no name, then
-// "<unnamed>" is returned. Intended for use with debug output.
-std::string BallLarusNode::getName() {
- std::stringstream name;
-
- if(getBlock() != NULL) {
- if(getBlock()->hasName()) {
- std::string tempName(getBlock()->getName());
- name << tempName.c_str() << " (" << _uid << ")";
- } else
- name << "<unnamed> (" << _uid << ")";
- } else
- name << "<null> (" << _uid << ")";
-
- return name.str();
-}
-
-// Removes an edge from an edgeVector. Used by removePredEdge and
-// removeSuccEdge.
-void BallLarusNode::removeEdge(BLEdgeVector& v, BallLarusEdge* e) {
- // TODO: Avoid linear scan by using a set instead
- for(BLEdgeIterator i = v.begin(),
- end = v.end();
- i != end;
- ++i) {
- if((*i) == e) {
- v.erase(i);
- break;
- }
- }
-}
-
-// Returns the source node of this edge.
-BallLarusNode* BallLarusEdge::getSource() const {
- return(_source);
-}
-
-// Returns the target node of this edge.
-BallLarusNode* BallLarusEdge::getTarget() const {
- return(_target);
-}
-
-// Sets the type of the edge.
-BallLarusEdge::EdgeType BallLarusEdge::getType() const {
- return _edgeType;
-}
-
-// Gets the type of the edge.
-void BallLarusEdge::setType(EdgeType type) {
- _edgeType = type;
-}
-
-// Returns the weight of this edge. Used to decode path numbers to sequences
-// of basic blocks.
-unsigned BallLarusEdge::getWeight() {
- return(_weight);
-}
-
-// Sets the weight of the edge. Used during path numbering.
-void BallLarusEdge::setWeight(unsigned weight) {
- _weight = weight;
-}
-
-// Gets the phony edge originating at the root.
-BallLarusEdge* BallLarusEdge::getPhonyRoot() {
- return _phonyRoot;
-}
-
-// Sets the phony edge originating at the root.
-void BallLarusEdge::setPhonyRoot(BallLarusEdge* phonyRoot) {
- _phonyRoot = phonyRoot;
-}
-
-// Gets the phony edge terminating at the exit.
-BallLarusEdge* BallLarusEdge::getPhonyExit() {
- return _phonyExit;
-}
-
-// Sets the phony edge terminating at the exit.
-void BallLarusEdge::setPhonyExit(BallLarusEdge* phonyExit) {
- _phonyExit = phonyExit;
-}
-
-// Gets the associated real edge if this is a phony edge.
-BallLarusEdge* BallLarusEdge::getRealEdge() {
- return _realEdge;
-}
-
-// Sets the associated real edge if this is a phony edge.
-void BallLarusEdge::setRealEdge(BallLarusEdge* realEdge) {
- _realEdge = realEdge;
-}
-
-// Returns the duplicate number of the edge.
-unsigned BallLarusEdge::getDuplicateNumber() {
- return(_duplicateNumber);
-}
-
-// Initialization that requires virtual functions which are not fully
-// functional in the constructor.
-void BallLarusDag::init() {
- BLBlockNodeMap inDag;
- std::stack<BallLarusNode*> dfsStack;
-
- _root = addNode(&(_function.getEntryBlock()));
- _exit = addNode(NULL);
-
- // start search from root
- dfsStack.push(getRoot());
-
- // dfs to add each bb into the dag
- while(dfsStack.size())
- buildNode(inDag, dfsStack);
-
- // put in the final edge
- addEdge(getExit(),getRoot(),0);
-}
-
-// Frees all memory associated with the DAG.
-BallLarusDag::~BallLarusDag() {
- for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); edge != end;
- ++edge)
- delete (*edge);
-
- for(BLNodeIterator node = _nodes.begin(), end = _nodes.end(); node != end;
- ++node)
- delete (*node);
-}
-
-// Calculate the path numbers by assigning edge increments as prescribed
-// in Ball-Larus path profiling.
-void BallLarusDag::calculatePathNumbers() {
- BallLarusNode* node;
- std::queue<BallLarusNode*> bfsQueue;
- bfsQueue.push(getExit());
-
- while(bfsQueue.size() > 0) {
- node = bfsQueue.front();
-
- DEBUG(dbgs() << "calculatePathNumbers on " << node->getName() << "\n");
-
- bfsQueue.pop();
- unsigned prevPathNumber = node->getNumberPaths();
- calculatePathNumbersFrom(node);
-
- // Check for DAG splitting
- if( node->getNumberPaths() > 100000000 && node != getRoot() ) {
- // Add new phony edge from the split-node to the DAG's exit
- BallLarusEdge* exitEdge = addEdge(node, getExit(), 0);
- exitEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
-
- // Counters to handle the possibility of a multi-graph
- BasicBlock* oldTarget = 0;
- unsigned duplicateNumber = 0;
-
- // Iterate through each successor edge, adding phony edges
- for( BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
- succ != end; oldTarget = (*succ)->getTarget()->getBlock(), succ++ ) {
-
- if( (*succ)->getType() == BallLarusEdge::NORMAL ) {
- // is this edge a duplicate?
- if( oldTarget != (*succ)->getTarget()->getBlock() )
- duplicateNumber = 0;
-
- // create the new phony edge: root -> succ
- BallLarusEdge* rootEdge =
- addEdge(getRoot(), (*succ)->getTarget(), duplicateNumber++);
- rootEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
- rootEdge->setRealEdge(*succ);
-
- // split on this edge and reference it's exit/root phony edges
- (*succ)->setType(BallLarusEdge::SPLITEDGE);
- (*succ)->setPhonyRoot(rootEdge);
- (*succ)->setPhonyExit(exitEdge);
- (*succ)->setWeight(0);
- }
- }
-
- calculatePathNumbersFrom(node);
- }
-
- DEBUG(dbgs() << "prev, new number paths " << prevPathNumber << ", "
- << node->getNumberPaths() << ".\n");
-
- if(prevPathNumber == 0 && node->getNumberPaths() != 0) {
- DEBUG(dbgs() << "node ready : " << node->getName() << "\n");
- for(BLEdgeIterator pred = node->predBegin(), end = node->predEnd();
- pred != end; pred++) {
- if( (*pred)->getType() == BallLarusEdge::BACKEDGE ||
- (*pred)->getType() == BallLarusEdge::SPLITEDGE )
- continue;
-
- BallLarusNode* nextNode = (*pred)->getSource();
- // not yet visited?
- if(nextNode->getNumberPaths() == 0)
- bfsQueue.push(nextNode);
- }
- }
- }
-
- DEBUG(dbgs() << "\tNumber of paths: " << getRoot()->getNumberPaths() << "\n");
-}
-
-// Returns the number of paths for the Dag.
-unsigned BallLarusDag::getNumberOfPaths() {
- return(getRoot()->getNumberPaths());
-}
-
-// Returns the root (i.e. entry) node for the DAG.
-BallLarusNode* BallLarusDag::getRoot() {
- return _root;
-}
-
-// Returns the exit node for the DAG.
-BallLarusNode* BallLarusDag::getExit() {
- return _exit;
-}
-
-// Returns the function for the DAG.
-Function& BallLarusDag::getFunction() {
- return(_function);
-}
-
-// Clears the node colors.
-void BallLarusDag::clearColors(BallLarusNode::NodeColor color) {
- for (BLNodeIterator nodeIt = _nodes.begin(); nodeIt != _nodes.end(); nodeIt++)
- (*nodeIt)->setColor(color);
-}
-
-// Processes one node and its imediate edges for building the DAG.
-void BallLarusDag::buildNode(BLBlockNodeMap& inDag, BLNodeStack& dfsStack) {
- BallLarusNode* currentNode = dfsStack.top();
- BasicBlock* currentBlock = currentNode->getBlock();
-
- if(currentNode->getColor() != BallLarusNode::WHITE) {
- // we have already visited this node
- dfsStack.pop();
- currentNode->setColor(BallLarusNode::BLACK);
- } else {
- // are there any external procedure calls?
- if( ProcessEarlyTermination ) {
- for( BasicBlock::iterator bbCurrent = currentNode->getBlock()->begin(),
- bbEnd = currentNode->getBlock()->end(); bbCurrent != bbEnd;
- bbCurrent++ ) {
- Instruction& instr = *bbCurrent;
- if( instr.getOpcode() == Instruction::Call ) {
- BallLarusEdge* callEdge = addEdge(currentNode, getExit(), 0);
- callEdge->setType(BallLarusEdge::CALLEDGE_PHONY);
- break;
- }
- }
- }
-
- TerminatorInst* terminator = currentNode->getBlock()->getTerminator();
- if(isa<ReturnInst>(terminator) || isa<UnreachableInst>(terminator) ||
- isa<ResumeInst>(terminator))
- addEdge(currentNode, getExit(),0);
-
- currentNode->setColor(BallLarusNode::GRAY);
- inDag[currentBlock] = currentNode;
-
- BasicBlock* oldSuccessor = 0;
- unsigned duplicateNumber = 0;
-
- // iterate through this node's successors
- for(succ_iterator successor = succ_begin(currentBlock),
- succEnd = succ_end(currentBlock); successor != succEnd;
- oldSuccessor = *successor, ++successor ) {
- BasicBlock* succBB = *successor;
-
- // is this edge a duplicate?
- if (oldSuccessor == succBB)
- duplicateNumber++;
- else
- duplicateNumber = 0;
-
- buildEdge(inDag, dfsStack, currentNode, succBB, duplicateNumber);
- }
- }
-}
-
-// Process an edge in the CFG for DAG building.
-void BallLarusDag::buildEdge(BLBlockNodeMap& inDag, std::stack<BallLarusNode*>&
- dfsStack, BallLarusNode* currentNode,
- BasicBlock* succBB, unsigned duplicateCount) {
- BallLarusNode* succNode = inDag[succBB];
-
- if(succNode && succNode->getColor() == BallLarusNode::BLACK) {
- // visited node and forward edge
- addEdge(currentNode, succNode, duplicateCount);
- } else if(succNode && succNode->getColor() == BallLarusNode::GRAY) {
- // visited node and back edge
- DEBUG(dbgs() << "Backedge detected.\n");
- addBackedge(currentNode, succNode, duplicateCount);
- } else {
- BallLarusNode* childNode;
- // not visited node and forward edge
- if(succNode) // an unvisited node that is child of a gray node
- childNode = succNode;
- else { // an unvisited node that is a child of a an unvisted node
- childNode = addNode(succBB);
- inDag[succBB] = childNode;
- }
- addEdge(currentNode, childNode, duplicateCount);
- dfsStack.push(childNode);
- }
-}
-
-// The weight on each edge is the increment required along any path that
-// contains that edge.
-void BallLarusDag::calculatePathNumbersFrom(BallLarusNode* node) {
- if(node == getExit())
- // The Exit node must be base case
- node->setNumberPaths(1);
- else {
- unsigned sumPaths = 0;
- BallLarusNode* succNode;
-
- for(BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
- succ != end; succ++) {
- if( (*succ)->getType() == BallLarusEdge::BACKEDGE ||
- (*succ)->getType() == BallLarusEdge::SPLITEDGE )
- continue;
-
- (*succ)->setWeight(sumPaths);
- succNode = (*succ)->getTarget();
-
- if( !succNode->getNumberPaths() )
- return;
- sumPaths += succNode->getNumberPaths();
- }
-
- node->setNumberPaths(sumPaths);
- }
-}
-
-// Allows subclasses to determine which type of Node is created.
-// Override this method to produce subclasses of BallLarusNode if
-// necessary. The destructor of BallLarusDag will call free on each
-// pointer created.
-BallLarusNode* BallLarusDag::createNode(BasicBlock* BB) {
- return( new BallLarusNode(BB) );
-}
-
-// Allows subclasses to determine which type of Edge is created.
-// Override this method to produce subclasses of BallLarusEdge if
-// necessary. The destructor of BallLarusDag will call free on each
-// pointer created.
-BallLarusEdge* BallLarusDag::createEdge(BallLarusNode* source,
- BallLarusNode* target,
- unsigned duplicateCount) {
- return( new BallLarusEdge(source, target, duplicateCount) );
-}
-
-// Proxy to node's constructor. Updates the DAG state.
-BallLarusNode* BallLarusDag::addNode(BasicBlock* BB) {
- BallLarusNode* newNode = createNode(BB);
- _nodes.push_back(newNode);
- return( newNode );
-}
-
-// Proxy to edge's constructor. Updates the DAG state.
-BallLarusEdge* BallLarusDag::addEdge(BallLarusNode* source,
- BallLarusNode* target,
- unsigned duplicateCount) {
- BallLarusEdge* newEdge = createEdge(source, target, duplicateCount);
- _edges.push_back(newEdge);
- source->addSuccEdge(newEdge);
- target->addPredEdge(newEdge);
- return(newEdge);
-}
-
-// Adds a backedge with its phony edges. Updates the DAG state.
-void BallLarusDag::addBackedge(BallLarusNode* source, BallLarusNode* target,
- unsigned duplicateCount) {
- BallLarusEdge* childEdge = addEdge(source, target, duplicateCount);
- childEdge->setType(BallLarusEdge::BACKEDGE);
-
- childEdge->setPhonyRoot(addEdge(getRoot(), target,0));
- childEdge->setPhonyExit(addEdge(source, getExit(),0));
-
- childEdge->getPhonyRoot()->setRealEdge(childEdge);
- childEdge->getPhonyRoot()->setType(BallLarusEdge::BACKEDGE_PHONY);
-
- childEdge->getPhonyExit()->setRealEdge(childEdge);
- childEdge->getPhonyExit()->setType(BallLarusEdge::BACKEDGE_PHONY);
- _backEdges.push_back(childEdge);
-}
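
A minimal standalone sketch of the Ball-Larus numbering that calculatePathNumbersFrom builds and that getNextEdge (in PathProfileInfo.cpp below) later decodes, assuming a toy adjacency-list DAG; the graph, node indices, and variable names are illustrative, not taken from the LLVM sources.

#include <cstdio>
#include <vector>

int main() {
  // Nodes: 0 = entry, 1, 2, 3 = exit; adjacency lists of a tiny DAG.
  std::vector<std::vector<int>> succ = {{1, 2}, {3}, {3}, {}};
  const int n = (int)succ.size(), exitNode = 3;

  std::vector<unsigned> numPaths(n, 0);
  // weight[v][i] is the Ball-Larus increment on the i-th outgoing edge of v.
  std::vector<std::vector<unsigned>> weight(n);

  // Walk the nodes in reverse topological order (here simply 3, 2, 1, 0).
  for (int v = n - 1; v >= 0; --v) {
    if (v == exitNode) { numPaths[v] = 1; continue; }
    unsigned sum = 0;
    for (int s : succ[v]) {
      weight[v].push_back(sum); // increment carried by edge v -> s
      sum += numPaths[s];
    }
    numPaths[v] = sum;
  }

  // Decode every path number back into a node sequence with the same rule
  // getNextEdge uses: take the heaviest edge whose weight is <= what is left.
  for (unsigned p = 0; p < numPaths[0]; ++p) {
    unsigned rem = p;
    int v = 0;
    std::printf("path %u:", p);
    while (v != exitNode) {
      int best = -1;
      for (int i = 0; i < (int)succ[v].size(); ++i)
        if (weight[v][i] <= rem && (best < 0 || weight[v][i] > weight[v][best]))
          best = i;
      rem -= weight[v][best];
      std::printf(" %d ->", v);
      v = succ[v][best];
    }
    std::printf(" %d\n", v);
  }
  return 0;
}
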
diff --git a/lib/Analysis/PathProfileInfo.cpp b/lib/Analysis/PathProfileInfo.cpp
deleted file mode 100644
index bc53221..0000000
--- a/lib/Analysis/PathProfileInfo.cpp
+++ /dev/null
@@ -1,433 +0,0 @@
-//===- PathProfileInfo.cpp ------------------------------------*- C++ -*---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface used by optimizers to load path profiles,
-// and provides a loader pass which reads a path profile file.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "path-profile-info"
-
-#include "llvm/Analysis/PathProfileInfo.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
-
-using namespace llvm;
-
-// command line option for loading path profiles
-static cl::opt<std::string>
-PathProfileInfoFilename("path-profile-loader-file", cl::init("llvmprof.out"),
- cl::value_desc("filename"),
- cl::desc("Path profile file loaded by -path-profile-loader"), cl::Hidden);
-
-namespace {
- class PathProfileLoaderPass : public ModulePass, public PathProfileInfo {
- public:
- PathProfileLoaderPass() : ModulePass(ID) { }
- ~PathProfileLoaderPass();
-
- // this pass doesn't change anything (only loads information)
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- }
-
- // the full name of the loader pass
- virtual const char* getPassName() const {
- return "Path Profiling Information Loader";
- }
-
- // required since this pass implements multiple inheritance
- virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
- if (PI == &PathProfileInfo::ID)
- return (PathProfileInfo*)this;
- return this;
- }
-
- // entry point to run the pass
- bool runOnModule(Module &M);
-
- // pass identification
- static char ID;
-
- private:
- // make a reference table to refer to functions by number
- void buildFunctionRefs(Module &M);
-
- // process argument info of a program from the input file
- void handleArgumentInfo();
-
- // process path number information from the input file
- void handlePathInfo();
-
- // array of references to the functions in the module
- std::vector<Function*> _functions;
-
- // path profile file handle
- FILE* _file;
-
- // path profile file name
- std::string _filename;
- };
-}
-
-// register PathLoader
-char PathProfileLoaderPass::ID = 0;
-
-INITIALIZE_ANALYSIS_GROUP(PathProfileInfo, "Path Profile Information",
- NoPathProfileInfo)
-INITIALIZE_AG_PASS(PathProfileLoaderPass, PathProfileInfo,
- "path-profile-loader",
- "Load path profile information from file",
- false, true, false)
-
-char &llvm::PathProfileLoaderPassID = PathProfileLoaderPass::ID;
-
-// link PathLoader as a pass, and make it available as an optimisation
-ModulePass *llvm::createPathProfileLoaderPass() {
- return new PathProfileLoaderPass;
-}
-
-// ----------------------------------------------------------------------------
-// PathEdge implementation
-//
-ProfilePathEdge::ProfilePathEdge (BasicBlock* source, BasicBlock* target,
- unsigned duplicateNumber)
- : _source(source), _target(target), _duplicateNumber(duplicateNumber) {}
-
-// ----------------------------------------------------------------------------
-// Path implementation
-//
-
-ProfilePath::ProfilePath (unsigned int number, unsigned int count,
- double countStdDev, PathProfileInfo* ppi)
- : _number(number) , _count(count), _countStdDev(countStdDev), _ppi(ppi) {}
-
-double ProfilePath::getFrequency() const {
- return 100 * double(_count) /
- double(_ppi->_functionPathCounts[_ppi->_currentFunction]);
-}
-
-static BallLarusEdge* getNextEdge (BallLarusNode* node,
- unsigned int pathNumber) {
- BallLarusEdge* best = 0;
-
- for( BLEdgeIterator next = node->succBegin(),
- end = node->succEnd(); next != end; next++ ) {
- if( (*next)->getType() != BallLarusEdge::BACKEDGE && // no backedges
- (*next)->getType() != BallLarusEdge::SPLITEDGE && // no split edges
- (*next)->getWeight() <= pathNumber && // weight must be <= pathNumber
- (!best || (best->getWeight() < (*next)->getWeight())) ) // best one?
- best = *next;
- }
-
- return best;
-}
-
-ProfilePathEdgeVector* ProfilePath::getPathEdges() const {
- BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
- unsigned int increment = _number;
- ProfilePathEdgeVector* pev = new ProfilePathEdgeVector;
-
- while (currentNode != _ppi->_currentDag->getExit()) {
- BallLarusEdge* next = getNextEdge(currentNode, increment);
-
- increment -= next->getWeight();
-
- if( next->getType() != BallLarusEdge::BACKEDGE_PHONY &&
- next->getType() != BallLarusEdge::SPLITEDGE_PHONY &&
- next->getTarget() != _ppi->_currentDag->getExit() )
- pev->push_back(ProfilePathEdge(
- next->getSource()->getBlock(),
- next->getTarget()->getBlock(),
- next->getDuplicateNumber()));
-
- if( next->getType() == BallLarusEdge::BACKEDGE_PHONY &&
- next->getTarget() == _ppi->_currentDag->getExit() )
- pev->push_back(ProfilePathEdge(
- next->getRealEdge()->getSource()->getBlock(),
- next->getRealEdge()->getTarget()->getBlock(),
- next->getDuplicateNumber()));
-
- if( next->getType() == BallLarusEdge::SPLITEDGE_PHONY &&
- next->getSource() == _ppi->_currentDag->getRoot() )
- pev->push_back(ProfilePathEdge(
- next->getRealEdge()->getSource()->getBlock(),
- next->getRealEdge()->getTarget()->getBlock(),
- next->getDuplicateNumber()));
-
- // set the new node
- currentNode = next->getTarget();
- }
-
- return pev;
-}
-
-ProfilePathBlockVector* ProfilePath::getPathBlocks() const {
- BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
- unsigned int increment = _number;
- ProfilePathBlockVector* pbv = new ProfilePathBlockVector;
-
- while (currentNode != _ppi->_currentDag->getExit()) {
- BallLarusEdge* next = getNextEdge(currentNode, increment);
- increment -= next->getWeight();
-
- // add block to the block list if it is a real edge
- if( next->getType() == BallLarusEdge::NORMAL)
- pbv->push_back (currentNode->getBlock());
- // make the back edge the last edge since we are at the end
- else if( next->getTarget() == _ppi->_currentDag->getExit() ) {
- pbv->push_back (currentNode->getBlock());
- pbv->push_back (next->getRealEdge()->getTarget()->getBlock());
- }
-
- // set the new node
- currentNode = next->getTarget();
- }
-
- return pbv;
-}
-
-BasicBlock* ProfilePath::getFirstBlockInPath() const {
- BallLarusNode* root = _ppi->_currentDag->getRoot();
- BallLarusEdge* edge = getNextEdge(root, _number);
-
- if( edge && (edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
- edge->getType() == BallLarusEdge::SPLITEDGE_PHONY) )
- return edge->getTarget()->getBlock();
-
- return root->getBlock();
-}
-
-// ----------------------------------------------------------------------------
-// PathProfileInfo implementation
-//
-
-// Pass identification
-char llvm::PathProfileInfo::ID = 0;
-
-PathProfileInfo::PathProfileInfo () : _currentDag(0) , _currentFunction(0) {
-}
-
-PathProfileInfo::~PathProfileInfo() {
- if (_currentDag)
- delete _currentDag;
-}
-
-// set the function for which paths are currently being processed
-void PathProfileInfo::setCurrentFunction(Function* F) {
- // Make sure it exists
- if (!F) return;
-
- if (_currentDag)
- delete _currentDag;
-
- _currentFunction = F;
- _currentDag = new BallLarusDag(*F);
- _currentDag->init();
- _currentDag->calculatePathNumbers();
-}
-
-// get the function for which paths are currently being processed
-Function* PathProfileInfo::getCurrentFunction() const {
- return _currentFunction;
-}
-
-// get the entry block of the function
-BasicBlock* PathProfileInfo::getCurrentFunctionEntry() {
- return _currentDag->getRoot()->getBlock();
-}
-
-// return the path based on its number
-ProfilePath* PathProfileInfo::getPath(unsigned int number) {
- return _functionPaths[_currentFunction][number];
-}
-
-// return the number of paths which a function may potentially execute
-unsigned int PathProfileInfo::getPotentialPathCount() {
- return _currentDag ? _currentDag->getNumberOfPaths() : 0;
-}
-
-// return an iterator for the beginning of a function's executed paths
-ProfilePathIterator PathProfileInfo::pathBegin() {
- return _functionPaths[_currentFunction].begin();
-}
-
-// return an iterator for the end of a functions executed paths
-ProfilePathIterator PathProfileInfo::pathEnd() {
- return _functionPaths[_currentFunction].end();
-}
-
-// returns the total number of paths run in the function
-unsigned int PathProfileInfo::pathsRun() {
- return _currentFunction ? _functionPaths[_currentFunction].size() : 0;
-}
-
-// ----------------------------------------------------------------------------
-// PathLoader implementation
-//
-
-// remove all generated paths
-PathProfileLoaderPass::~PathProfileLoaderPass() {
- for( FunctionPathIterator funcNext = _functionPaths.begin(),
- funcEnd = _functionPaths.end(); funcNext != funcEnd; funcNext++)
- for( ProfilePathIterator pathNext = funcNext->second.begin(),
- pathEnd = funcNext->second.end(); pathNext != pathEnd; pathNext++)
- delete pathNext->second;
-}
-
-// entry point of the pass; this loads and parses a file
-bool PathProfileLoaderPass::runOnModule(Module &M) {
- // get the filename and setup the module's function references
- _filename = PathProfileInfoFilename;
- buildFunctionRefs (M);
-
- if (!(_file = fopen(_filename.c_str(), "rb"))) {
- errs () << "error: input '" << _filename << "' file does not exist.\n";
- return false;
- }
-
- ProfilingType profType;
-
- while( fread(&profType, sizeof(ProfilingType), 1, _file) ) {
- switch (profType) {
- case ArgumentInfo:
- handleArgumentInfo ();
- break;
- case PathInfo:
- handlePathInfo ();
- break;
- default:
- errs () << "error: bad path profiling file syntax, " << profType << "\n";
- fclose (_file);
- return false;
- }
- }
-
- fclose (_file);
-
- return true;
-}
-
-// create a reference table for functions defined in the path profile file
-void PathProfileLoaderPass::buildFunctionRefs (Module &M) {
- _functions.push_back(0); // make the 0 index a null pointer
-
- for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
- if (F->isDeclaration())
- continue;
- _functions.push_back(F);
- }
-}
-
-// handle command line argument info in the output file
-void PathProfileLoaderPass::handleArgumentInfo() {
- // get the argument list's length
- unsigned savedArgsLength;
- if( fread(&savedArgsLength, sizeof(unsigned), 1, _file) != 1 ) {
- errs() << "warning: argument info header/data mismatch\n";
- return;
- }
-
- // allocate a buffer, and get the arguments
- char* args = new char[savedArgsLength+1];
- if( fread(args, 1, savedArgsLength, _file) != savedArgsLength )
- errs() << "warning: argument info header/data mismatch\n";
-
- args[savedArgsLength] = '\0';
- argList = std::string(args);
- delete [] args; // cleanup dynamic string
-
- // byte alignment
- if (savedArgsLength & 3)
- fseek(_file, 4-(savedArgsLength&3), SEEK_CUR);
-}
-
-// Handle path profile information in the output file
-void PathProfileLoaderPass::handlePathInfo () {
- // get the number of functions in this profile
- unsigned functionCount;
- if( fread(&functionCount, sizeof(functionCount), 1, _file) != 1 ) {
- errs() << "warning: path info header/data mismatch\n";
- return;
- }
-
- // gather path information for each function
- for (unsigned i = 0; i < functionCount; i++) {
- PathProfileHeader pathHeader;
- if( fread(&pathHeader, sizeof(pathHeader), 1, _file) != 1 ) {
- errs() << "warning: bad header for path function info\n";
- break;
- }
-
- Function* f = _functions[pathHeader.fnNumber];
-
- // dynamically allocate a table to store path numbers
- PathProfileTableEntry* pathTable =
- new PathProfileTableEntry[pathHeader.numEntries];
-
- if( fread(pathTable, sizeof(PathProfileTableEntry),
- pathHeader.numEntries, _file) != pathHeader.numEntries) {
- delete [] pathTable;
- errs() << "warning: path function info header/data mismatch\n";
- return;
- }
-
- // Build a new path for the current function
- unsigned int totalPaths = 0;
- for (unsigned int j = 0; j < pathHeader.numEntries; j++) {
- totalPaths += pathTable[j].pathCounter;
- _functionPaths[f][pathTable[j].pathNumber]
- = new ProfilePath(pathTable[j].pathNumber, pathTable[j].pathCounter,
- 0, this);
- }
-
- _functionPathCounts[f] = totalPaths;
-
- delete [] pathTable;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// NoProfile PathProfileInfo implementation
-//
-
-namespace {
- struct NoPathProfileInfo : public ImmutablePass, public PathProfileInfo {
- static char ID; // Class identification, replacement for typeinfo
- NoPathProfileInfo() : ImmutablePass(ID) {
- initializeNoPathProfileInfoPass(*PassRegistry::getPassRegistry());
- }
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it
- /// should override this to adjust the this pointer as needed for the
- /// specified pass info.
- virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
- if (PI == &PathProfileInfo::ID)
- return (PathProfileInfo*)this;
- return this;
- }
-
- virtual const char *getPassName() const {
- return "NoPathProfileInfo";
- }
- };
-} // End of anonymous namespace
-
-char NoPathProfileInfo::ID = 0;
-// Register this pass...
-INITIALIZE_AG_PASS(NoPathProfileInfo, PathProfileInfo, "no-path-profile",
- "No Path Profile Information", false, true, true)
-
-ImmutablePass *llvm::createNoPathProfileInfoPass() { return new NoPathProfileInfo(); }
diff --git a/lib/Analysis/PathProfileVerifier.cpp b/lib/Analysis/PathProfileVerifier.cpp
deleted file mode 100644
index 48d7d05..0000000
--- a/lib/Analysis/PathProfileVerifier.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//===- PathProfileVerifier.cpp --------------------------------*- C++ -*---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This verifier derives an edge profile file from current path profile
-// information
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "path-profile-verifier"
-
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/PathProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
-
-using namespace llvm;
-
-namespace {
- class PathProfileVerifier : public ModulePass {
- private:
- bool runOnModule(Module &M);
-
- public:
- static char ID; // Pass identification, replacement for typeid
- PathProfileVerifier() : ModulePass(ID) {
- initializePathProfileVerifierPass(*PassRegistry::getPassRegistry());
- }
-
-
- virtual const char *getPassName() const {
- return "Path Profiler Verifier";
- }
-
- // The verifier requires the path profile and edge profile.
- virtual void getAnalysisUsage(AnalysisUsage& AU) const;
- };
-}
-
-static cl::opt<std::string>
-EdgeProfileFilename("path-profile-verifier-file",
- cl::init("edgefrompath.llvmprof.out"),
- cl::value_desc("filename"),
- cl::desc("Edge profile file generated by -path-profile-verifier"),
- cl::Hidden);
-
-char PathProfileVerifier::ID = 0;
-INITIALIZE_PASS(PathProfileVerifier, "path-profile-verifier",
- "Compare the path profile derived edge profile against the "
- "edge profile.", true, true)
-
-ModulePass *llvm::createPathProfileVerifierPass() {
- return new PathProfileVerifier();
-}
-
-// The verifier requires the path profile and edge profile.
-void PathProfileVerifier::getAnalysisUsage(AnalysisUsage& AU) const {
- AU.addRequired<PathProfileInfo>();
- AU.addPreserved<PathProfileInfo>();
-}
-
-typedef std::map<unsigned, unsigned> DuplicateToIndexMap;
-typedef std::map<BasicBlock*,DuplicateToIndexMap> BlockToDuplicateMap;
-typedef std::map<BasicBlock*,BlockToDuplicateMap> NestedBlockToIndexMap;
-
-// the verifier iterates through each path to gather the total
-// number of edge frequencies
-bool PathProfileVerifier::runOnModule (Module &M) {
- PathProfileInfo& pathProfileInfo = getAnalysis<PathProfileInfo>();
-
- // set up a data structure that maps path edges to indices into an
- // array of edge counters
- NestedBlockToIndexMap arrayMap;
- unsigned i = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
-
- arrayMap[(BasicBlock*)0][F->begin()][0] = i++;
-
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
-
- unsigned duplicate = 0;
- BasicBlock* prev = 0;
- for (unsigned s = 0, e = TI->getNumSuccessors(); s != e;
- prev = TI->getSuccessor(s), ++s) {
- if (prev == TI->getSuccessor(s))
- duplicate++;
- else duplicate = 0;
-
- arrayMap[BB][TI->getSuccessor(s)][duplicate] = i++;
- }
- }
- }
-
- std::vector<unsigned> edgeArray(i);
-
- // iterate through each path and increment the edge counters as needed
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
-
- pathProfileInfo.setCurrentFunction(F);
-
- DEBUG(dbgs() << "function '" << F->getName() << "' ran "
- << pathProfileInfo.pathsRun()
- << "/" << pathProfileInfo.getPotentialPathCount()
- << " potential paths\n");
-
- for( ProfilePathIterator nextPath = pathProfileInfo.pathBegin(),
- endPath = pathProfileInfo.pathEnd();
- nextPath != endPath; nextPath++ ) {
- ProfilePath* currentPath = nextPath->second;
-
- ProfilePathEdgeVector* pev = currentPath->getPathEdges();
- DEBUG(dbgs () << "path #" << currentPath->getNumber() << ": "
- << currentPath->getCount() << "\n");
- // setup the entry edge (normally path profiling doesn't care about this)
- if (currentPath->getFirstBlockInPath() == &F->getEntryBlock())
- edgeArray[arrayMap[(BasicBlock*)0][currentPath->getFirstBlockInPath()][0]]
- += currentPath->getCount();
-
- for( ProfilePathEdgeIterator nextEdge = pev->begin(),
- endEdge = pev->end(); nextEdge != endEdge; nextEdge++ ) {
- if (nextEdge != pev->begin())
- DEBUG(dbgs() << " :: ");
-
- BasicBlock* source = nextEdge->getSource();
- BasicBlock* target = nextEdge->getTarget();
- unsigned duplicateNumber = nextEdge->getDuplicateNumber();
- DEBUG(dbgs() << source->getName() << " --{" << duplicateNumber
- << "}--> " << target->getName());
-
- // Ensure all the referenced edges exist
- // TODO: make this a separate function
- if( !arrayMap.count(source) ) {
- errs() << " error [" << F->getName() << "()]: source '"
- << source->getName()
- << "' does not exist in the array map.\n";
- } else if( !arrayMap[source].count(target) ) {
- errs() << " error [" << F->getName() << "()]: target '"
- << target->getName()
- << "' does not exist in the array map.\n";
- } else if( !arrayMap[source][target].count(duplicateNumber) ) {
- errs() << " error [" << F->getName() << "()]: edge "
- << source->getName() << " -> " << target->getName()
- << " duplicate number " << duplicateNumber
- << " does not exist in the array map.\n";
- } else {
- edgeArray[arrayMap[source][target][duplicateNumber]]
- += currentPath->getCount();
- }
- }
-
- DEBUG(errs() << "\n");
-
- delete pev;
- }
- }
-
- std::string errorInfo;
- std::string filename = EdgeProfileFilename;
-
- // Open a handle to the file
- FILE* edgeFile = fopen(filename.c_str(),"wb");
-
- if (!edgeFile) {
- errs() << "error: unable to open file '" << filename << "' for output.\n";
- return false;
- }
-
- errs() << "Generating edge profile '" << filename << "' ...\n";
-
- // write argument info
- unsigned type = ArgumentInfo;
- unsigned num = pathProfileInfo.argList.size();
- int zeros = 0;
-
- fwrite(&type,sizeof(unsigned),1,edgeFile);
- fwrite(&num,sizeof(unsigned),1,edgeFile);
- fwrite(pathProfileInfo.argList.c_str(),1,num,edgeFile);
- if (num&3)
- fwrite(&zeros, 1, 4-(num&3), edgeFile);
-
- type = EdgeInfo;
- num = edgeArray.size();
- fwrite(&type,sizeof(unsigned),1,edgeFile);
- fwrite(&num,sizeof(unsigned),1,edgeFile);
-
- // write each edge to the file
- for( std::vector<unsigned>::iterator s = edgeArray.begin(),
- e = edgeArray.end(); s != e; s++)
- fwrite(&*s, sizeof (unsigned), 1, edgeFile);
-
- fclose (edgeFile);
-
- return true;
-}
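
The verifier's nested arrayMap gives every static CFG edge, including repeated successors of the same terminator, its own counter slot. The sketch below reproduces that (source, target, duplicate) indexing with block names standing in for BasicBlock pointers; the example graph is an illustrative assumption.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Successor lists per block; the repeated entry models a switch with two
  // cases branching to the same block.
  std::map<std::string, std::vector<std::string>> succs = {
      {"entry", {"then", "else"}},
      {"then", {"merge"}},
      {"else", {"merge", "merge"}}, // duplicate edge else -> merge
      {"merge", {}}};

  std::map<std::string,
           std::map<std::string, std::map<unsigned, unsigned>>> slot;
  unsigned counter = 0;
  for (const auto &b : succs) {
    unsigned dup = 0;
    std::string prev;
    for (const auto &s : b.second) {
      dup = (s == prev) ? dup + 1 : 0; // same duplicate-numbering rule as above
      prev = s;
      slot[b.first][s][dup] = counter++;
    }
  }

  // Every static edge, including the duplicate, now owns one counter index.
  for (const auto &b : slot)
    for (const auto &t : b.second)
      for (const auto &d : t.second)
        std::printf("%s -> %s (dup %u): slot %u\n", b.first.c_str(),
                    t.first.c_str(), d.first, d.second);
  return 0;
}
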
diff --git a/lib/Analysis/ProfileDataLoader.cpp b/lib/Analysis/ProfileDataLoader.cpp
deleted file mode 100644
index 3d0a1e2..0000000
--- a/lib/Analysis/ProfileDataLoader.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===- ProfileDataLoader.cpp - Load profile information from disk ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileDataLoader class is used to load raw profiling data from the dump
-// file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/ProfileDataLoader.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/Analysis/ProfileDataTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-raw_ostream &llvm::operator<<(raw_ostream &O, std::pair<const BasicBlock *,
- const BasicBlock *> E) {
- O << "(";
-
- if (E.first)
- O << E.first->getName();
- else
- O << "0";
-
- O << ",";
-
- if (E.second)
- O << E.second->getName();
- else
- O << "0";
-
- return O << ")";
-}
-
-/// AddCounts - Add 'A' and 'B', accounting for the fact that the value of one
-/// (or both) may not be defined.
-static unsigned AddCounts(unsigned A, unsigned B) {
- // If either value is undefined, use the other.
- // Undefined + undefined = undefined.
- if (A == ProfileDataLoader::Uncounted) return B;
- if (B == ProfileDataLoader::Uncounted) return A;
-
- return A + B;
-}
-
-/// ReadProfilingData - Load 'NumEntries' items of type 'T' from file 'F'
-template <typename T>
-static void ReadProfilingData(const char *ToolName, FILE *F,
- T *Data, size_t NumEntries) {
- // Read in the block of data...
- if (fread(Data, sizeof(T), NumEntries, F) != NumEntries)
- report_fatal_error(Twine(ToolName) + ": Profiling data truncated");
-}
-
-/// ReadProfilingNumEntries - Read how many entries are in this profiling data
-/// packet.
-static unsigned ReadProfilingNumEntries(const char *ToolName, FILE *F,
- bool ShouldByteSwap) {
- unsigned Entry;
- ReadProfilingData<unsigned>(ToolName, F, &Entry, 1);
- return ShouldByteSwap ? ByteSwap_32(Entry) : Entry;
-}
-
-/// ReadProfilingBlock - Read the number of entries in the next profiling data
-/// packet and then accumulate the entries into 'Data'.
-static void ReadProfilingBlock(const char *ToolName, FILE *F,
- bool ShouldByteSwap,
- SmallVectorImpl<unsigned> &Data) {
- // Read the number of entries...
- unsigned NumEntries = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
-
- // Read in the data.
- SmallVector<unsigned, 8> TempSpace(NumEntries);
- ReadProfilingData<unsigned>(ToolName, F, TempSpace.data(), NumEntries);
-
- // Make sure we have enough space ...
- if (Data.size() < NumEntries)
- Data.resize(NumEntries, ProfileDataLoader::Uncounted);
-
- // Accumulate the data we just read into the existing data.
- for (unsigned i = 0; i < NumEntries; ++i) {
- unsigned Entry = ShouldByteSwap ? ByteSwap_32(TempSpace[i]) : TempSpace[i];
- Data[i] = AddCounts(Entry, Data[i]);
- }
-}
-
-/// ReadProfilingArgBlock - Read the command line arguments that the program was
-/// run with when the current profiling data packet(s) were generated.
-static void ReadProfilingArgBlock(const char *ToolName, FILE *F,
- bool ShouldByteSwap,
- SmallVectorImpl<std::string> &CommandLines) {
- // Read the number of bytes ...
- unsigned ArgLength = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
-
- // Read in the arguments (if there are any to read). Round up the length to
- // the nearest 4-byte multiple.
- SmallVector<char, 8> Args(ArgLength+4);
- if (ArgLength)
- ReadProfilingData<char>(ToolName, F, Args.data(), (ArgLength+3) & ~3);
-
- // Store the arguments.
- CommandLines.push_back(std::string(&Args[0], &Args[ArgLength]));
-}
-
-const unsigned ProfileDataLoader::Uncounted = ~0U;
-
-/// ProfileDataLoader ctor - Read the specified profiling data file, reporting
-/// a fatal error if the file is invalid or broken.
-ProfileDataLoader::ProfileDataLoader(const char *ToolName,
- const std::string &Filename)
- : Filename(Filename) {
- FILE *F = fopen(Filename.c_str(), "rb");
- if (F == 0)
- report_fatal_error(Twine(ToolName) + ": Error opening '" +
- Filename + "': ");
-
- // Keep reading packets until we run out of them.
- unsigned PacketType;
- while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
- // If the low eight bits of the packet are zero, we must be dealing with an
- // endianness mismatch. Byteswap all words read from the profiling
- // information. This can happen when the compiler host and target have
- // different endianness.
- bool ShouldByteSwap = (char)PacketType == 0;
- PacketType = ShouldByteSwap ? ByteSwap_32(PacketType) : PacketType;
-
- switch (PacketType) {
- case ArgumentInfo:
- ReadProfilingArgBlock(ToolName, F, ShouldByteSwap, CommandLines);
- break;
-
- case EdgeInfo:
- ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
- break;
-
- default:
- report_fatal_error(std::string(ToolName)
- + ": Unknown profiling packet type");
- break;
- }
- }
-
- fclose(F);
-}
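
The constructor above relies on a simple heuristic: packet type tags are small values, so a zero low byte in a freshly read word means the profile was written on a machine with the opposite byte order and every word must be swapped. A small standalone illustration, with swap32 as a plain C++ stand-in for ByteSwap_32 and a made-up sample word:

#include <cstdint>
#include <cstdio>

// Plain stand-in for the 32-bit byte swap used by the loader.
static uint32_t swap32(uint32_t V) {
  return (V << 24) | ((V & 0xff00u) << 8) | ((V >> 8) & 0xff00u) | (V >> 24);
}

int main() {
  // Pretend this word was fread() from the profile file: a writer on an
  // opposite-endian machine storing a small tag value of 1 shows up here
  // as 0x01000000.
  uint32_t PacketType = 0x01000000;

  // Tags are small, so a zero low byte signals a byte-order mismatch.
  bool ShouldByteSwap = (char)PacketType == 0;
  if (ShouldByteSwap)
    PacketType = swap32(PacketType);

  std::printf("swap needed: %s, packet type: %u\n",
              ShouldByteSwap ? "yes" : "no", PacketType);
  return 0;
}
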
diff --git a/lib/Analysis/ProfileDataLoaderPass.cpp b/lib/Analysis/ProfileDataLoaderPass.cpp
deleted file mode 100644
index 2ee0093..0000000
--- a/lib/Analysis/ProfileDataLoaderPass.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-//===- ProfileDataLoaderPass.cpp - Set branch weight metadata from prof ---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loads profiling data from a dump file and sets branch weight
-// metadata.
-//
-// TODO: Replace all "profile-metadata-loader" strings with "profile-loader"
-// once ProfileInfo etc. has been removed.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-metadata-loader"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileDataLoader.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumEdgesRead, "The # of edges read.");
-STATISTIC(NumTermsAnnotated, "The # of terminator instructions annotated.");
-
-static cl::opt<std::string>
-ProfileMetadataFilename("profile-file", cl::init("llvmprof.out"),
- cl::value_desc("filename"),
- cl::desc("Profile file loaded by -profile-metadata-loader"));
-
-namespace {
- /// This pass loads profiling data from a dump file and sets branch weight
- /// metadata.
- class ProfileMetadataLoaderPass : public ModulePass {
- std::string Filename;
- public:
- static char ID; // Class identification, replacement for typeinfo
- explicit ProfileMetadataLoaderPass(const std::string &filename = "")
- : ModulePass(ID), Filename(filename) {
- initializeProfileMetadataLoaderPassPass(*PassRegistry::getPassRegistry());
- if (filename.empty()) Filename = ProfileMetadataFilename;
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- }
-
- virtual const char *getPassName() const {
- return "Profile loader";
- }
-
- virtual void readEdge(unsigned, ProfileData&, ProfileData::Edge,
- ArrayRef<unsigned>);
- virtual unsigned matchEdges(Module&, ProfileData&, ArrayRef<unsigned>);
- virtual void setBranchWeightMetadata(Module&, ProfileData&);
-
- virtual bool runOnModule(Module &M);
- };
-} // End of anonymous namespace
-
-char ProfileMetadataLoaderPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ProfileMetadataLoaderPass, "profile-metadata-loader",
- "Load profile information from llvmprof.out", false, true)
-INITIALIZE_PASS_END(ProfileMetadataLoaderPass, "profile-metadata-loader",
- "Load profile information from llvmprof.out", false, true)
-
-char &llvm::ProfileMetadataLoaderPassID = ProfileMetadataLoaderPass::ID;
-
-/// createProfileMetadataLoaderPass - This function returns a Pass that loads
-/// the profiling information for the module from the specified filename,
-/// making it available to the optimizers.
-ModulePass *llvm::createProfileMetadataLoaderPass() {
- return new ProfileMetadataLoaderPass();
-}
-ModulePass *llvm::createProfileMetadataLoaderPass(const std::string &Filename) {
- return new ProfileMetadataLoaderPass(Filename);
-}
-
-/// readEdge - Take the value from a profile counter and assign it to an edge.
-void ProfileMetadataLoaderPass::readEdge(unsigned ReadCount,
- ProfileData &PB, ProfileData::Edge e,
- ArrayRef<unsigned> Counters) {
- if (ReadCount >= Counters.size()) return;
-
- unsigned weight = Counters[ReadCount];
- assert(weight != ProfileDataLoader::Uncounted);
- PB.addEdgeWeight(e, weight);
-
- DEBUG(dbgs() << "-- Read Edge Counter for " << e
- << " (# "<< (ReadCount) << "): "
- << PB.getEdgeWeight(e) << "\n");
-}
-
-/// matchEdges - Link every profile counter with an edge.
-unsigned ProfileMetadataLoaderPass::matchEdges(Module &M, ProfileData &PB,
- ArrayRef<unsigned> Counters) {
- if (Counters.size() == 0) return 0;
-
- unsigned ReadCount = 0;
-
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- DEBUG(dbgs() << "Loading edges in '" << F->getName() << "'\n");
- readEdge(ReadCount++, PB, PB.getEdge(0, &F->getEntryBlock()), Counters);
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
- for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
- readEdge(ReadCount++, PB, PB.getEdge(BB,TI->getSuccessor(s)),
- Counters);
- }
- }
- }
-
- return ReadCount;
-}
-
-/// setBranchWeightMetadata - Translate the counter values associated with each
-/// edge into branch weights for each conditional branch (a branch with 2 or
-/// more destinations).
-void ProfileMetadataLoaderPass::setBranchWeightMetadata(Module &M,
- ProfileData &PB) {
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- DEBUG(dbgs() << "Setting branch metadata in '" << F->getName() << "'\n");
-
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
- unsigned NumSuccessors = TI->getNumSuccessors();
-
- // If there is only one successor then we cannot set a branch
- // probability as the target is certain.
- if (NumSuccessors < 2) continue;
-
- // Load the weights of all edges leading from this terminator.
- DEBUG(dbgs() << "-- Terminator with " << NumSuccessors
- << " successors:\n");
- SmallVector<uint32_t, 4> Weights(NumSuccessors);
- for (unsigned s = 0 ; s < NumSuccessors ; ++s) {
- ProfileData::Edge edge = PB.getEdge(BB, TI->getSuccessor(s));
- Weights[s] = (uint32_t)PB.getEdgeWeight(edge);
- DEBUG(dbgs() << "---- Edge '" << edge << "' has weight "
- << Weights[s] << "\n");
- }
-
- // Set branch weight metadata. This will set branch probabilities of
- // 100%/0% if that is true of the dynamic execution.
- // BranchProbabilityInfo can account for this when it loads this metadata
- // (it gives the unexecuted branch a weight of 1 for the purposes of
- // probability calculations).
- MDBuilder MDB(TI->getContext());
- MDNode *Node = MDB.createBranchWeights(Weights);
- TI->setMetadata(LLVMContext::MD_prof, Node);
- NumTermsAnnotated++;
- }
- }
-}
-
-bool ProfileMetadataLoaderPass::runOnModule(Module &M) {
- ProfileDataLoader PDL("profile-data-loader", Filename);
- ProfileData PB;
-
- ArrayRef<unsigned> Counters = PDL.getRawEdgeCounts();
-
- unsigned ReadCount = matchEdges(M, PB, Counters);
-
- if (ReadCount != Counters.size()) {
- errs() << "WARNING: profile information is inconsistent with "
- << "the current program!\n";
- }
- NumEdgesRead = ReadCount;
-
- setBranchWeightMetadata(M, PB);
-
- return ReadCount > 0;
-}
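
The heart of setBranchWeightMetadata is the MDBuilder call that turns per-edge counters into !prof branch_weights metadata. The sketch below distills that step using the same LLVM 3.4-era headers as the pass; the helper name annotateTerminator is an illustrative assumption.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

// Attach branch_weights metadata to a conditional terminator, one weight per
// successor, mirroring ProfileMetadataLoaderPass::setBranchWeightMetadata.
static void annotateTerminator(TerminatorInst *TI,
                               const SmallVectorImpl<uint32_t> &Weights) {
  if (TI->getNumSuccessors() < 2)
    return; // a single successor is taken unconditionally; nothing to encode
  MDBuilder MDB(TI->getContext());
  MDNode *Node = MDB.createBranchWeights(Weights);
  TI->setMetadata(LLVMContext::MD_prof, Node);
}
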
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
deleted file mode 100644
index 365b64c..0000000
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-//===- ProfileEstimatorPass.cpp - LLVM Pass to estimate profile info ------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a concrete implementation of profiling information that
-// estimates the profiling information in a very crude and unimaginative way.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-estimator"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-static cl::opt<double>
-LoopWeight(
- "profile-estimator-loop-weight", cl::init(10),
- cl::value_desc("loop-weight"),
- cl::desc("Number of loop executions used for profile-estimator")
-);
-
-namespace {
- class ProfileEstimatorPass : public FunctionPass, public ProfileInfo {
- double ExecCount;
- LoopInfo *LI;
- std::set<BasicBlock*> BBToVisit;
- std::map<Loop*,double> LoopExitWeights;
- std::map<Edge,double> MinimalWeight;
- public:
- static char ID; // Class identification, replacement for typeinfo
- explicit ProfileEstimatorPass(const double execcount = 0)
- : FunctionPass(ID), ExecCount(execcount) {
- initializeProfileEstimatorPassPass(*PassRegistry::getPassRegistry());
- if (execcount == 0) ExecCount = LoopWeight;
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequired<LoopInfo>();
- }
-
- virtual const char *getPassName() const {
- return "Profiling information estimator";
- }
-
- /// run - Estimate the profile information from the specified file.
- virtual bool runOnFunction(Function &F);
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it
- /// should override this to adjust the this pointer as needed for the
- /// specified pass info.
- virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
- if (PI == &ProfileInfo::ID)
- return (ProfileInfo*)this;
- return this;
- }
-
- virtual void recurseBasicBlock(BasicBlock *BB);
-
- void inline printEdgeWeight(Edge);
- };
-} // End of anonymous namespace
-
-char ProfileEstimatorPass::ID = 0;
-INITIALIZE_AG_PASS_BEGIN(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
- "Estimate profiling information", false, true, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_AG_PASS_END(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
- "Estimate profiling information", false, true, false)
-
-namespace llvm {
- char &ProfileEstimatorPassID = ProfileEstimatorPass::ID;
-
- FunctionPass *createProfileEstimatorPass() {
- return new ProfileEstimatorPass();
- }
-
- /// createProfileEstimatorPass - This function returns a Pass that estimates
- /// profiling information using the given loop execution count.
- Pass *createProfileEstimatorPass(const unsigned execcount) {
- return new ProfileEstimatorPass(execcount);
- }
-}
-
-static double ignoreMissing(double w) {
- if (w == ProfileInfo::MissingValue) return 0;
- return w;
-}
-
-static void inline printEdgeError(ProfileInfo::Edge e, const char *M) {
- DEBUG(dbgs() << "-- Edge " << e << " is not calculated, " << M << "\n");
-}
-
-void inline ProfileEstimatorPass::printEdgeWeight(Edge E) {
- DEBUG(dbgs() << "-- Weight of Edge " << E << ":"
- << format("%20.20g", getEdgeWeight(E)) << "\n");
-}
-
-// recurseBasicBlock() - This calculates the ProfileInfo estimation for a
-// single block and then recurses into the successors.
-// The algorithm preserves the flow condition, meaning that the sum of the
-// weight of the incoming edges must equal the block weight, which must in
-// turn be equal to the sum of the weights of the outgoing edges.
-// Since the flow of a block is determined from the current state of the
-// flow, once an edge has a flow assigned, this flow is never changed again;
-// otherwise it would be possible to violate the flow condition in another
-// block.
-void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
-
- // Break the recursion if this BasicBlock was already visited.
- if (BBToVisit.find(BB) == BBToVisit.end()) return;
-
- // Read the LoopInfo for this block.
- bool BBisHeader = LI->isLoopHeader(BB);
- Loop* BBLoop = LI->getLoopFor(BB);
-
- // To get the block weight, read all incoming edges.
- double BBWeight = 0;
- std::set<BasicBlock*> ProcessedPreds;
- for ( pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
- bbi != bbe; ++bbi ) {
- // If this block was not considered already, add weight.
- Edge edge = getEdge(*bbi,BB);
- double w = getEdgeWeight(edge);
- if (ProcessedPreds.insert(*bbi).second) {
- BBWeight += ignoreMissing(w);
- }
- // If this block is a loop header and the predecessor is contained in this
- // loop (i.e., the edge is a backedge), continue and do not check whether the
- // value is valid.
- if (BBisHeader && BBLoop->contains(*bbi)) {
- printEdgeError(edge, "but is backedge, continuing");
- continue;
- }
- // If the edge's value is missing (and this is not a loop header and not a
- // backedge), return; this block cannot be estimated yet.
- if (w == MissingValue) {
- printEdgeError(edge, "returning");
- return;
- }
- }
- if (getExecutionCount(BB) != MissingValue) {
- BBWeight = getExecutionCount(BB);
- }
-
- // Fetch all necessary information for current block.
- SmallVector<Edge, 8> ExitEdges;
- SmallVector<Edge, 8> Edges;
- if (BBLoop) {
- BBLoop->getExitEdges(ExitEdges);
- }
-
- // If this is a loop header, consider the following:
- // Exactly the flow that is entering this block must exit this block too. So
- // do the following:
- // *) Get all the exit edges, read the flow that is already leaving this
- // loop, and remember the edges that do not have any flow on them right now.
- // (The edges that already have flow on them are most likely exiting edges of
- // other loops; do not touch those flows because the previously calculated
- // loop headers would not be exact anymore.)
- // *) In case there is not a single exiting edge left, create one at the loop
- // latch to prevent the flow from building up in the loop.
- // *) Take the flow that is not leaving the loop already and distribute it on
- // the remaining exiting edges.
- // (This ensures that all flow that enters the loop also leaves it.)
- // *) Increase the flow into the loop by increasing the weight of this block.
- // There is at least one incoming backedge that will bring us this flow later
- // on. (So that the flow condition in this node is valid again.)
- if (BBisHeader) {
- double incoming = BBWeight;
- // Subtract the flow leaving the loop.
- std::set<Edge> ProcessedExits;
- for (SmallVectorImpl<Edge>::iterator ei = ExitEdges.begin(),
- ee = ExitEdges.end(); ei != ee; ++ei) {
- if (ProcessedExits.insert(*ei).second) {
- double w = getEdgeWeight(*ei);
- if (w == MissingValue) {
- Edges.push_back(*ei);
- // Check if there is a necessary minimal weight, if yes, subtract it
- // from weight.
- if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
- incoming -= MinimalWeight[*ei];
- DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
- }
- } else {
- incoming -= w;
- }
- }
- }
- // If no exit edges, create one:
- if (Edges.size() == 0) {
- BasicBlock *Latch = BBLoop->getLoopLatch();
- if (Latch) {
- Edge edge = getEdge(Latch,0);
- EdgeInformation[BB->getParent()][edge] = BBWeight;
- printEdgeWeight(edge);
- edge = getEdge(Latch, BB);
- EdgeInformation[BB->getParent()][edge] = BBWeight * ExecCount;
- printEdgeWeight(edge);
- }
- }
-
- // Distribute the remaining weight to the exiting edges. To prevent fractions
- // from building up and provoking precision problems, the weight to be
- // distributed is split and rounded; the last edge gets a somewhat
- // bigger value, but we are close enough for an estimation.
- double fraction = floor(incoming/Edges.size());
- for (SmallVectorImpl<Edge>::iterator ei = Edges.begin(), ee = Edges.end();
- ei != ee; ++ei) {
- double w = 0;
- if (ei != (ee-1)) {
- w = fraction;
- incoming -= fraction;
- } else {
- w = incoming;
- }
- EdgeInformation[BB->getParent()][*ei] += w;
- // Read necessary minimal weight.
- if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
- EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
- DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
- }
- printEdgeWeight(*ei);
-
- // Add a minimal weight to the paths to all exit edges; this is used to
- // ensure that enough flow reaches these edges.
- Path p;
- const BasicBlock *Dest = GetPath(BB, (*ei).first, p, GetPathToDest);
- while (Dest != BB) {
- const BasicBlock *Parent = p.find(Dest)->second;
- Edge e = getEdge(Parent, Dest);
- if (MinimalWeight.find(e) == MinimalWeight.end()) {
- MinimalWeight[e] = 0;
- }
- MinimalWeight[e] += w;
- DEBUG(dbgs() << "Minimal Weight for " << e << ": " << format("%.20g",MinimalWeight[e]) << "\n");
- Dest = Parent;
- }
- }
- // Increase flow into the loop.
- BBWeight *= (ExecCount+1);
- }
-
- BlockInformation[BB->getParent()][BB] = BBWeight;
- // Up until now we considered only the loop exiting edges; now we have a
- // definite block weight and must distribute it onto the outgoing edges.
- // Since there may already be flow attached to some of the edges, read this
- // flow first and remember the edges that still have no flow attached.
- Edges.clear();
- std::set<BasicBlock*> ProcessedSuccs;
-
- succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
- // Also check for (BB,0) edges that may already contain some flow. (But only
- // in case there are no successors.)
- if (bbi == bbe) {
- Edge edge = getEdge(BB,0);
- EdgeInformation[BB->getParent()][edge] = BBWeight;
- printEdgeWeight(edge);
- }
- for ( ; bbi != bbe; ++bbi ) {
- if (ProcessedSuccs.insert(*bbi).second) {
- Edge edge = getEdge(BB,*bbi);
- double w = getEdgeWeight(edge);
- if (w != MissingValue) {
- BBWeight -= getEdgeWeight(edge);
- } else {
- Edges.push_back(edge);
- // If a minimal weight is necessary, reserve it by subtracting it from the
- // block weight; it is re-added later on.
- if (MinimalWeight.find(edge) != MinimalWeight.end()) {
- BBWeight -= MinimalWeight[edge];
- DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[edge]) << " at " << edge << "\n");
- }
- }
- }
- }
-
- double fraction = Edges.size() ? floor(BBWeight/Edges.size()) : 0.0;
- // Finally we know what flow is still not leaving the block; distribute this
- // flow onto the empty edges.
- for (SmallVectorImpl<Edge>::iterator ei = Edges.begin(), ee = Edges.end();
- ei != ee; ++ei) {
- if (ei != (ee-1)) {
- EdgeInformation[BB->getParent()][*ei] += fraction;
- BBWeight -= fraction;
- } else {
- EdgeInformation[BB->getParent()][*ei] += BBWeight;
- }
- // Re-add the minimal necessary weight.
- if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
- EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
- DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
- }
- printEdgeWeight(*ei);
- }
-
- // This block is visited, mark this before the recursion.
- BBToVisit.erase(BB);
-
- // Recurse into successors.
- for (succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
- bbi != bbe; ++bbi) {
- recurseBasicBlock(*bbi);
- }
-}
-
-bool ProfileEstimatorPass::runOnFunction(Function &F) {
- if (F.isDeclaration()) return false;
-
- // Fetch LoopInfo and clear ProfileInfo for this function.
- LI = &getAnalysis<LoopInfo>();
- FunctionInformation.erase(&F);
- BlockInformation[&F].clear();
- EdgeInformation[&F].clear();
- BBToVisit.clear();
-
- // Mark all blocks as to visit.
- for (Function::iterator bi = F.begin(), be = F.end(); bi != be; ++bi)
- BBToVisit.insert(bi);
-
- // Clear Minimal Edges.
- MinimalWeight.clear();
-
- DEBUG(dbgs() << "Working on function " << F.getName() << "\n");
-
- // Since the entry block is the first one and has no predecessors, the edge
- // (0,entry) is inserted with a large starting weight (2^32).
- BasicBlock *entry = &F.getEntryBlock();
- BlockInformation[&F][entry] = pow(2.0, 32.0);
- Edge edge = getEdge(0,entry);
- EdgeInformation[&F][edge] = BlockInformation[&F][entry];
- printEdgeWeight(edge);
-
- // Since recurseBasicBlock() may return with a block which was not fully
- // estimated, use recurseBasicBlock() until everything is calculated.
- bool cleanup = false;
- recurseBasicBlock(entry);
- while (BBToVisit.size() > 0 && !cleanup) {
- // Remember number of open blocks, this is later used to check if progress
- // was made.
- unsigned size = BBToVisit.size();
-
- // Try to calculate all blocks in turn.
- for (std::set<BasicBlock*>::iterator bi = BBToVisit.begin(),
- be = BBToVisit.end(); bi != be; ++bi) {
- recurseBasicBlock(*bi);
- // If at least one block was finished, break because iterator may be
- // invalid.
- if (BBToVisit.size() < size) break;
- }
-
- // If there was not a single block resolved, make some assumptions.
- if (BBToVisit.size() == size) {
- bool found = false;
- for (std::set<BasicBlock*>::iterator BBI = BBToVisit.begin(), BBE = BBToVisit.end();
- (BBI != BBE) && (!found); ++BBI) {
- BasicBlock *BB = *BBI;
- // Try each predecessor to see whether it can be assumed.
- for (pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
- (bbi != bbe) && (!found); ++bbi) {
- Edge e = getEdge(*bbi,BB);
- double w = getEdgeWeight(e);
- // Check that edge from predecessor is still free.
- if (w == MissingValue) {
- // Check if there is a cycle from this block to the predecessor.
- Path P;
- const BasicBlock *Dest = GetPath(BB, *bbi, P, GetPathToDest);
- if (Dest != *bbi) {
- // If there is no cycle, just set the edge weight to 0.
- EdgeInformation[&F][e] = 0;
- DEBUG(dbgs() << "Assuming edge weight: ");
- printEdgeWeight(e);
- found = true;
- }
- }
- }
- }
- if (!found) {
- cleanup = true;
- DEBUG(dbgs() << "No assumption possible in Fuction "<<F.getName()<<", setting all to zero\n");
- }
- }
- }
- // In case there was no safe way to assume edges, as a last measure
- // set _everything_ to zero.
- if (cleanup) {
- FunctionInformation[&F] = 0;
- BlockInformation[&F].clear();
- EdgeInformation[&F].clear();
- for (Function::const_iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
- const BasicBlock *BB = &(*FI);
- BlockInformation[&F][BB] = 0;
- const_pred_iterator predi = pred_begin(BB), prede = pred_end(BB);
- if (predi == prede) {
- Edge e = getEdge(0,BB);
- setEdgeWeight(e,0);
- }
- for (;predi != prede; ++predi) {
- Edge e = getEdge(*predi,BB);
- setEdgeWeight(e,0);
- }
- succ_const_iterator succi = succ_begin(BB), succe = succ_end(BB);
- if (succi == succe) {
- Edge e = getEdge(BB,0);
- setEdgeWeight(e,0);
- }
- for (;succi != succe; ++succi) {
- Edge e = getEdge(*succi,BB);
- setEdgeWeight(e,0);
- }
- }
- }
-
- return false;
-}
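
The estimator repeatedly splits a block's remaining weight over its outgoing edges, giving every edge but the last floor(weight/n) and letting the last edge absorb the remainder so that no flow is lost to rounding. A tiny standalone illustration of that rule, with made-up numbers:

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  double BBWeight = 10.0;      // flow still to be distributed
  const unsigned NumEdges = 3; // edges with no flow assigned yet
  double fraction = std::floor(BBWeight / NumEdges);

  std::vector<double> edgeWeight(NumEdges, 0.0);
  for (unsigned i = 0; i < NumEdges; ++i) {
    if (i + 1 != NumEdges) {
      edgeWeight[i] += fraction; // every edge but the last: floor share
      BBWeight -= fraction;
    } else {
      edgeWeight[i] += BBWeight; // last edge absorbs the remainder
    }
  }

  // 10 over 3 edges -> 3, 3, 4; the sum still equals the block weight.
  for (unsigned i = 0; i < NumEdges; ++i)
    std::printf("edge %u: %g\n", i, edgeWeight[i]);
  return 0;
}
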
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
deleted file mode 100644
index 9626a48..0000000
--- a/lib/Analysis/ProfileInfo.cpp
+++ /dev/null
@@ -1,1079 +0,0 @@
-//===- ProfileInfo.cpp - Profile Info Interface ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the abstract ProfileInfo interface, and the default
-// "no profile" implementation.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-info"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <limits>
-#include <queue>
-#include <set>
-using namespace llvm;
-
-namespace llvm {
- template<> char ProfileInfoT<Function,BasicBlock>::ID = 0;
-}
-
-// Register the ProfileInfo interface, providing a nice name to refer to.
-INITIALIZE_ANALYSIS_GROUP(ProfileInfo, "Profile Information", NoProfileInfo)
-
-namespace llvm {
-
-template <>
-ProfileInfoT<MachineFunction, MachineBasicBlock>::ProfileInfoT() {}
-template <>
-ProfileInfoT<MachineFunction, MachineBasicBlock>::~ProfileInfoT() {}
-
-template <>
-ProfileInfoT<Function, BasicBlock>::ProfileInfoT() {
- MachineProfile = 0;
-}
-template <>
-ProfileInfoT<Function, BasicBlock>::~ProfileInfoT() {
- if (MachineProfile) delete MachineProfile;
-}
-
-template<>
-char ProfileInfoT<MachineFunction, MachineBasicBlock>::ID = 0;
-
-template<>
-const double ProfileInfoT<Function,BasicBlock>::MissingValue = -1;
-
-template<> const
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::MissingValue = -1;
-
-template<> double
-ProfileInfoT<Function,BasicBlock>::getExecutionCount(const BasicBlock *BB) {
- std::map<const Function*, BlockCounts>::iterator J =
- BlockInformation.find(BB->getParent());
- if (J != BlockInformation.end()) {
- BlockCounts::iterator I = J->second.find(BB);
- if (I != J->second.end())
- return I->second;
- }
-
- double Count = MissingValue;
-
- const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
-
- // Are there zero predecessors of this block?
- if (PI == PE) {
- Edge e = getEdge(0, BB);
- Count = getEdgeWeight(e);
- } else {
- // Otherwise, if there are predecessors, the execution count of this block is
- // the sum of the edge frequencies from the incoming edges.
- std::set<const BasicBlock*> ProcessedPreds;
- Count = 0;
- for (; PI != PE; ++PI) {
- const BasicBlock *P = *PI;
- if (ProcessedPreds.insert(P).second) {
- double w = getEdgeWeight(getEdge(P, BB));
- if (w == MissingValue) {
- Count = MissingValue;
- break;
- }
- Count += w;
- }
- }
- }
-
- // If the predecessors did not suffice to get block weight, try successors.
- if (Count == MissingValue) {
-
- succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB);
-
- // Are there zero successors of this block?
- if (SI == SE) {
- Edge e = getEdge(BB,0);
- Count = getEdgeWeight(e);
- } else {
- std::set<const BasicBlock*> ProcessedSuccs;
- Count = 0;
- for (; SI != SE; ++SI)
- if (ProcessedSuccs.insert(*SI).second) {
- double w = getEdgeWeight(getEdge(BB, *SI));
- if (w == MissingValue) {
- Count = MissingValue;
- break;
- }
- Count += w;
- }
- }
- }
-
- if (Count != MissingValue) BlockInformation[BB->getParent()][BB] = Count;
- return Count;
-}
-
-template<>
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::
- getExecutionCount(const MachineBasicBlock *MBB) {
- std::map<const MachineFunction*, BlockCounts>::iterator J =
- BlockInformation.find(MBB->getParent());
- if (J != BlockInformation.end()) {
- BlockCounts::iterator I = J->second.find(MBB);
- if (I != J->second.end())
- return I->second;
- }
-
- return MissingValue;
-}
-
-template<>
-double ProfileInfoT<Function,BasicBlock>::getExecutionCount(const Function *F) {
- std::map<const Function*, double>::iterator J =
- FunctionInformation.find(F);
- if (J != FunctionInformation.end())
- return J->second;
-
- // isDeclaration() is checked here and not at the start of the function to
- // allow functions without a body to still have an execution count.
- if (F->isDeclaration()) return MissingValue;
-
- double Count = getExecutionCount(&F->getEntryBlock());
- if (Count != MissingValue) FunctionInformation[F] = Count;
- return Count;
-}
-
-template<>
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::
- getExecutionCount(const MachineFunction *MF) {
- std::map<const MachineFunction*, double>::iterator J =
- FunctionInformation.find(MF);
- if (J != FunctionInformation.end())
- return J->second;
-
- double Count = getExecutionCount(&MF->front());
- if (Count != MissingValue) FunctionInformation[MF] = Count;
- return Count;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
- setExecutionCount(const BasicBlock *BB, double w) {
- DEBUG(dbgs() << "Creating Block " << BB->getName()
- << " (weight: " << format("%.20g",w) << ")\n");
- BlockInformation[BB->getParent()][BB] = w;
-}
-
-template<>
-void ProfileInfoT<MachineFunction, MachineBasicBlock>::
- setExecutionCount(const MachineBasicBlock *MBB, double w) {
- DEBUG(dbgs() << "Creating Block " << MBB->getBasicBlock()->getName()
- << " (weight: " << format("%.20g",w) << ")\n");
- BlockInformation[MBB->getParent()][MBB] = w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::addEdgeWeight(Edge e, double w) {
- double oldw = getEdgeWeight(e);
- assert (oldw != MissingValue && "Adding weight to Edge with no previous weight");
- DEBUG(dbgs() << "Adding to Edge " << e
- << " (new weight: " << format("%.20g",oldw + w) << ")\n");
- EdgeInformation[getFunction(e)][e] = oldw + w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
- addExecutionCount(const BasicBlock *BB, double w) {
- double oldw = getExecutionCount(BB);
- assert (oldw != MissingValue && "Adding weight to Block with no previous weight");
- DEBUG(dbgs() << "Adding to Block " << BB->getName()
- << " (new weight: " << format("%.20g",oldw + w) << ")\n");
- BlockInformation[BB->getParent()][BB] = oldw + w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::removeBlock(const BasicBlock *BB) {
- std::map<const Function*, BlockCounts>::iterator J =
- BlockInformation.find(BB->getParent());
- if (J == BlockInformation.end()) return;
-
- DEBUG(dbgs() << "Deleting " << BB->getName() << "\n");
- J->second.erase(BB);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::removeEdge(Edge e) {
- std::map<const Function*, EdgeWeights>::iterator J =
- EdgeInformation.find(getFunction(e));
- if (J == EdgeInformation.end()) return;
-
- DEBUG(dbgs() << "Deleting" << e << "\n");
- J->second.erase(e);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
- replaceEdge(const Edge &oldedge, const Edge &newedge) {
- double w;
- if ((w = getEdgeWeight(newedge)) == MissingValue) {
- w = getEdgeWeight(oldedge);
- DEBUG(dbgs() << "Replacing " << oldedge << " with " << newedge << "\n");
- } else {
- w += getEdgeWeight(oldedge);
- DEBUG(dbgs() << "Adding " << oldedge << " to " << newedge << "\n");
- }
- setEdgeWeight(newedge,w);
- removeEdge(oldedge);
-}
-
-template<>
-const BasicBlock *ProfileInfoT<Function,BasicBlock>::
- GetPath(const BasicBlock *Src, const BasicBlock *Dest,
- Path &P, unsigned Mode) {
- const BasicBlock *BB = 0;
- bool hasFoundPath = false;
-
- std::queue<const BasicBlock *> BFS;
- BFS.push(Src);
-
- while(BFS.size() && !hasFoundPath) {
- BB = BFS.front();
- BFS.pop();
-
- succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
- if (Succ == End) {
- P[(const BasicBlock*)0] = BB;
- if (Mode & GetPathToExit) {
- hasFoundPath = true;
- BB = 0;
- }
- }
- for(;Succ != End; ++Succ) {
- if (P.find(*Succ) != P.end()) continue;
- Edge e = getEdge(BB,*Succ);
- if ((Mode & GetPathWithNewEdges) && (getEdgeWeight(e) != MissingValue)) continue;
- P[*Succ] = BB;
- BFS.push(*Succ);
- if ((Mode & GetPathToDest) && *Succ == Dest) {
- hasFoundPath = true;
- BB = *Succ;
- break;
- }
- if ((Mode & GetPathToValue) && (getExecutionCount(*Succ) != MissingValue)) {
- hasFoundPath = true;
- BB = *Succ;
- break;
- }
- }
- }
-
- return BB;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
- divertFlow(const Edge &oldedge, const Edge &newedge) {
- DEBUG(dbgs() << "Diverting " << oldedge << " via " << newedge );
-
-  // First check if the old edge was taken; if not, just delete it...
- if (getEdgeWeight(oldedge) == 0) {
- removeEdge(oldedge);
- return;
- }
-
- Path P;
- P[newedge.first] = 0;
- P[newedge.second] = newedge.first;
- const BasicBlock *BB = GetPath(newedge.second,oldedge.second,P,GetPathToExit | GetPathToDest);
-
- double w = getEdgeWeight (oldedge);
- DEBUG(dbgs() << ", Weight: " << format("%.20g",w) << "\n");
- do {
- const BasicBlock *Parent = P.find(BB)->second;
- Edge e = getEdge(Parent,BB);
- double oldw = getEdgeWeight(e);
- double oldc = getExecutionCount(e.first);
- setEdgeWeight(e, w+oldw);
- if (Parent != oldedge.first) {
- setExecutionCount(e.first, w+oldc);
- }
- BB = Parent;
- } while (BB != newedge.first);
- removeEdge(oldedge);
-}
-
-/// Replaces all occurrences of RmBB in the ProfilingInfo with DestBB.
-/// This checks all edges of the function the blocks reside in and replaces the
-/// occurrences of RmBB with DestBB.
-template<>
-void ProfileInfoT<Function,BasicBlock>::
- replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) {
- DEBUG(dbgs() << "Replacing " << RmBB->getName()
- << " with " << DestBB->getName() << "\n");
- const Function *F = DestBB->getParent();
- std::map<const Function*, EdgeWeights>::iterator J =
- EdgeInformation.find(F);
- if (J == EdgeInformation.end()) return;
-
- Edge e, newedge;
- bool erasededge = false;
- EdgeWeights::iterator I = J->second.begin(), E = J->second.end();
- while(I != E) {
- e = (I++)->first;
- bool foundedge = false; bool eraseedge = false;
- if (e.first == RmBB) {
- if (e.second == DestBB) {
- eraseedge = true;
- } else {
- newedge = getEdge(DestBB, e.second);
- foundedge = true;
- }
- }
- if (e.second == RmBB) {
- if (e.first == DestBB) {
- eraseedge = true;
- } else {
- newedge = getEdge(e.first, DestBB);
- foundedge = true;
- }
- }
- if (foundedge) {
- replaceEdge(e, newedge);
- }
- if (eraseedge) {
- if (erasededge) {
- Edge newedge = getEdge(DestBB, DestBB);
- replaceEdge(e, newedge);
- } else {
- removeEdge(e);
- erasededge = true;
- }
- }
- }
-}
-
-/// Splits an edge in the ProfileInfo and redirects flow over NewBB.
-/// Since it's possible that there is more than one edge in the CFG from FirstBB
-/// to SecondBB, it's necessary to redirect the flow proportionally.
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitEdge(const BasicBlock *FirstBB,
- const BasicBlock *SecondBB,
- const BasicBlock *NewBB,
- bool MergeIdenticalEdges) {
- const Function *F = FirstBB->getParent();
- std::map<const Function*, EdgeWeights>::iterator J =
- EdgeInformation.find(F);
- if (J == EdgeInformation.end()) return;
-
- // Generate edges and read current weight.
- Edge e = getEdge(FirstBB, SecondBB);
- Edge n1 = getEdge(FirstBB, NewBB);
- Edge n2 = getEdge(NewBB, SecondBB);
- EdgeWeights &ECs = J->second;
- double w = ECs[e];
-
- int succ_count = 0;
- if (!MergeIdenticalEdges) {
-    // First count the edges from FirstBB to SecondBB; if there is more than
-    // one, only slice out a proportional part for NewBB.
- for(succ_const_iterator BBI = succ_begin(FirstBB), BBE = succ_end(FirstBB);
- BBI != BBE; ++BBI) {
- if (*BBI == SecondBB) succ_count++;
- }
- // When the NewBB is completely new, increment the count by one so that
- // the counts are properly distributed.
- if (getExecutionCount(NewBB) == ProfileInfo::MissingValue) succ_count++;
- } else {
- // When the edges are merged anyway, then redirect all flow.
- succ_count = 1;
- }
-
-  // We now know how many edges there are from FirstBB to SecondBB; reroute a
-  // proportional part of the edge weight over NewBB.
- double neww = floor(w / succ_count);
- ECs[n1] += neww;
- ECs[n2] += neww;
- BlockInformation[F][NewBB] += neww;
- if (succ_count == 1) {
- ECs.erase(e);
- } else {
- ECs[e] -= neww;
- }
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *Old,
- const BasicBlock* New) {
- const Function *F = Old->getParent();
- std::map<const Function*, EdgeWeights>::iterator J =
- EdgeInformation.find(F);
- if (J == EdgeInformation.end()) return;
-
- DEBUG(dbgs() << "Splitting " << Old->getName() << " to " << New->getName() << "\n");
-
- std::set<Edge> Edges;
- for (EdgeWeights::iterator ewi = J->second.begin(), ewe = J->second.end();
- ewi != ewe; ++ewi) {
- Edge old = ewi->first;
- if (old.first == Old) {
- Edges.insert(old);
- }
- }
- for (std::set<Edge>::iterator EI = Edges.begin(), EE = Edges.end();
- EI != EE; ++EI) {
- Edge newedge = getEdge(New, EI->second);
- replaceEdge(*EI, newedge);
- }
-
- double w = getExecutionCount(Old);
- setEdgeWeight(getEdge(Old, New), w);
- setExecutionCount(New, w);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *BB,
- const BasicBlock* NewBB,
- BasicBlock *const *Preds,
- unsigned NumPreds) {
- const Function *F = BB->getParent();
- std::map<const Function*, EdgeWeights>::iterator J =
- EdgeInformation.find(F);
- if (J == EdgeInformation.end()) return;
-
- DEBUG(dbgs() << "Splitting " << NumPreds << " Edges from " << BB->getName()
- << " to " << NewBB->getName() << "\n");
-
- // Collect weight that was redirected over NewBB.
- double newweight = 0;
-
- std::set<const BasicBlock *> ProcessedPreds;
-  // For all requested predecessors.
- for (unsigned pred = 0; pred < NumPreds; ++pred) {
- const BasicBlock * Pred = Preds[pred];
- if (ProcessedPreds.insert(Pred).second) {
- // Create edges and read old weight.
- Edge oldedge = getEdge(Pred, BB);
- Edge newedge = getEdge(Pred, NewBB);
-
- // Remember how much weight was redirected.
- newweight += getEdgeWeight(oldedge);
-
- replaceEdge(oldedge,newedge);
- }
- }
-
- Edge newedge = getEdge(NewBB,BB);
- setEdgeWeight(newedge, newweight);
- setExecutionCount(NewBB, newweight);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::transfer(const Function *Old,
- const Function *New) {
- DEBUG(dbgs() << "Replacing Function " << Old->getName() << " with "
- << New->getName() << "\n");
- std::map<const Function*, EdgeWeights>::iterator J =
- EdgeInformation.find(Old);
- if(J != EdgeInformation.end()) {
- EdgeInformation[New] = J->second;
- }
- EdgeInformation.erase(Old);
- BlockInformation.erase(Old);
- FunctionInformation.erase(Old);
-}
-
-static double readEdgeOrRemember(ProfileInfo::Edge edge, double w,
- ProfileInfo::Edge &tocalc, unsigned &uncalc) {
- if (w == ProfileInfo::MissingValue) {
- tocalc = edge;
- uncalc++;
- return 0;
- } else {
- return w;
- }
-}
-
-template<>
-bool ProfileInfoT<Function,BasicBlock>::
- CalculateMissingEdge(const BasicBlock *BB, Edge &removed,
- bool assumeEmptySelf) {
- Edge edgetocalc;
- unsigned uncalculated = 0;
-
-  // Collect weights of all incoming and outgoing edges; remember edges that
-  // have no value.
- double incount = 0;
- SmallSet<const BasicBlock*,8> pred_visited;
- const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
- if (bbi==bbe) {
- Edge e = getEdge(0,BB);
- incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
- }
- for (;bbi != bbe; ++bbi) {
- if (pred_visited.insert(*bbi)) {
- Edge e = getEdge(*bbi,BB);
- incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
- }
- }
-
- double outcount = 0;
- SmallSet<const BasicBlock*,8> succ_visited;
- succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
- if (sbbi==sbbe) {
- Edge e = getEdge(BB,0);
- if (getEdgeWeight(e) == MissingValue) {
- double w = getExecutionCount(BB);
- if (w != MissingValue) {
- setEdgeWeight(e,w);
- removed = e;
- }
- }
- outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
- }
- for (;sbbi != sbbe; ++sbbi) {
- if (succ_visited.insert(*sbbi)) {
- Edge e = getEdge(BB,*sbbi);
- outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
- }
- }
-
-  // If exactly one edge weight was missing, calculate it and remove it from
-  // the spanning tree.
- if (uncalculated == 0 ) {
- return true;
- } else
- if (uncalculated == 1) {
- if (incount < outcount) {
- EdgeInformation[BB->getParent()][edgetocalc] = outcount-incount;
- } else {
- EdgeInformation[BB->getParent()][edgetocalc] = incount-outcount;
- }
- DEBUG(dbgs() << "--Calc Edge Counter for " << edgetocalc << ": "
- << format("%.20g", getEdgeWeight(edgetocalc)) << "\n");
- removed = edgetocalc;
- return true;
- } else
- if (uncalculated == 2 && assumeEmptySelf && edgetocalc.first == edgetocalc.second && incount == outcount) {
- setEdgeWeight(edgetocalc, incount * 10);
- removed = edgetocalc;
- return true;
- } else {
- return false;
- }
-}
-
-static void readEdge(ProfileInfo *PI, ProfileInfo::Edge e, double &calcw, std::set<ProfileInfo::Edge> &misscount) {
- double w = PI->getEdgeWeight(e);
- if (w != ProfileInfo::MissingValue) {
- calcw += w;
- } else {
- misscount.insert(e);
- }
-}
-
-template<>
-bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *BB) {
- double inWeight = 0;
- std::set<Edge> inMissing;
- std::set<const BasicBlock*> ProcessedPreds;
- const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
- if (bbi == bbe) {
- readEdge(this,getEdge(0,BB),inWeight,inMissing);
- }
- for( ; bbi != bbe; ++bbi ) {
- if (ProcessedPreds.insert(*bbi).second) {
- readEdge(this,getEdge(*bbi,BB),inWeight,inMissing);
- }
- }
-
- double outWeight = 0;
- std::set<Edge> outMissing;
- std::set<const BasicBlock*> ProcessedSuccs;
- succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
- if (sbbi == sbbe)
- readEdge(this,getEdge(BB,0),outWeight,outMissing);
- for ( ; sbbi != sbbe; ++sbbi ) {
- if (ProcessedSuccs.insert(*sbbi).second) {
- readEdge(this,getEdge(BB,*sbbi),outWeight,outMissing);
- }
- }
-
- double share;
- std::set<Edge>::iterator ei,ee;
- if (inMissing.size() == 0 && outMissing.size() > 0) {
- ei = outMissing.begin();
- ee = outMissing.end();
- share = inWeight/outMissing.size();
- setExecutionCount(BB,inWeight);
- } else
- if (inMissing.size() > 0 && outMissing.size() == 0 && outWeight == 0) {
- ei = inMissing.begin();
- ee = inMissing.end();
- share = 0;
- setExecutionCount(BB,0);
- } else
- if (inMissing.size() == 0 && outMissing.size() == 0) {
- setExecutionCount(BB,outWeight);
- return true;
- } else {
- return false;
- }
- for ( ; ei != ee; ++ei ) {
- setEdgeWeight(*ei,share);
- }
- return true;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
-// if (getExecutionCount(&(F->getEntryBlock())) == 0) {
-// for (Function::const_iterator FI = F->begin(), FE = F->end();
-// FI != FE; ++FI) {
-// const BasicBlock* BB = &(*FI);
-// {
-// const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-// if (NBB == End) {
-// setEdgeWeight(getEdge(0,BB),0);
-// }
-// for(;NBB != End; ++NBB) {
-// setEdgeWeight(getEdge(*NBB,BB),0);
-// }
-// }
-// {
-// succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-// if (NBB == End) {
-// setEdgeWeight(getEdge(0,BB),0);
-// }
-// for(;NBB != End; ++NBB) {
-// setEdgeWeight(getEdge(*NBB,BB),0);
-// }
-// }
-// }
-// return;
-// }
- // The set of BasicBlocks that are still unvisited.
- std::set<const BasicBlock*> Unvisited;
-
- // The set of return edges (Edges with no successors).
- std::set<Edge> ReturnEdges;
- double ReturnWeight = 0;
-
- // First iterate over the whole function and collect:
- // 1) The blocks in this function in the Unvisited set.
- // 2) The return edges in the ReturnEdges set.
- // 3) The flow that is leaving the function already via return edges.
-
- // Data structure for searching the function.
- std::queue<const BasicBlock *> BFS;
- const BasicBlock *BB = &(F->getEntryBlock());
- BFS.push(BB);
- Unvisited.insert(BB);
-
- while (BFS.size()) {
- BB = BFS.front(); BFS.pop();
- succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
- if (NBB == End) {
- Edge e = getEdge(BB,0);
- double w = getEdgeWeight(e);
- if (w == MissingValue) {
-        // If the return edge has no value, try to read the value from the block.
- double bw = getExecutionCount(BB);
- if (bw != MissingValue) {
- setEdgeWeight(e,bw);
- ReturnWeight += bw;
- } else {
- // If both return edge and block provide no value, collect edge.
- ReturnEdges.insert(e);
- }
- } else {
- // If the return edge has a proper value, collect it.
- ReturnWeight += w;
- }
- }
- for (;NBB != End; ++NBB) {
- if (Unvisited.insert(*NBB).second) {
- BFS.push(*NBB);
- }
- }
- }
-
- while (Unvisited.size() > 0) {
- unsigned oldUnvisitedCount = Unvisited.size();
- bool FoundPath = false;
-
- // If there is only one edge left, calculate it.
- if (ReturnEdges.size() == 1) {
- ReturnWeight = getExecutionCount(&(F->getEntryBlock())) - ReturnWeight;
-
- Edge e = *ReturnEdges.begin();
- setEdgeWeight(e,ReturnWeight);
- setExecutionCount(e.first,ReturnWeight);
-
- Unvisited.erase(e.first);
- ReturnEdges.erase(e);
- continue;
- }
-
-    // Calculate all blocks where only one edge is missing; this may also
-    // resolve further return edges.
- std::set<const BasicBlock *>::iterator FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE) {
- const BasicBlock *BB = *FI; ++FI;
- Edge e;
- if(CalculateMissingEdge(BB,e,true)) {
- if (BlockInformation[F].find(BB) == BlockInformation[F].end()) {
- setExecutionCount(BB,getExecutionCount(BB));
- }
- Unvisited.erase(BB);
- if (e.first != 0 && e.second == 0) {
- ReturnEdges.erase(e);
- ReturnWeight += getEdgeWeight(e);
- }
- }
- }
- if (oldUnvisitedCount > Unvisited.size()) continue;
-
- // Estimate edge weights by dividing the flow proportionally.
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE) {
- const BasicBlock *BB = *FI; ++FI;
- const BasicBlock *Dest = 0;
- bool AllEdgesHaveSameReturn = true;
-      // Check each successor; these must all end up in the same or an empty
-      // return block, otherwise it's dangerous to do an estimation on them.
- for (succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
- Succ != End; ++Succ) {
- Path P;
- GetPath(*Succ, 0, P, GetPathToExit);
- if (Dest && Dest != P[(const BasicBlock*)0]) {
- AllEdgesHaveSameReturn = false;
- }
- Dest = P[(const BasicBlock*)0];
- }
- if (AllEdgesHaveSameReturn) {
- if(EstimateMissingEdges(BB)) {
- Unvisited.erase(BB);
- break;
- }
- }
- }
- if (oldUnvisitedCount > Unvisited.size()) continue;
-
-    // Check if there is a path to a block that has a known value and redirect
- // flow accordingly.
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE && !FoundPath) {
- // Fetch path.
- const BasicBlock *BB = *FI; ++FI;
- Path P;
- const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToValue);
-
- // Calculate incoming flow.
- double iw = 0; unsigned inmissing = 0; unsigned incount = 0; unsigned invalid = 0;
- std::set<const BasicBlock *> Processed;
- for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
- NBB != End; ++NBB) {
- if (Processed.insert(*NBB).second) {
- Edge e = getEdge(*NBB, BB);
- double ew = getEdgeWeight(e);
- if (ew != MissingValue) {
- iw += ew;
- invalid++;
- } else {
-            // If the path contains the successor, this means it's a backedge,
- // do not count as missing.
- if (P.find(*NBB) == P.end())
- inmissing++;
- }
- incount++;
- }
- }
- if (inmissing == incount) continue;
- if (invalid == 0) continue;
-
- // Subtract (already) outgoing flow.
- Processed.clear();
- for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
- NBB != End; ++NBB) {
- if (Processed.insert(*NBB).second) {
- Edge e = getEdge(BB, *NBB);
- double ew = getEdgeWeight(e);
- if (ew != MissingValue) {
- iw -= ew;
- }
- }
- }
- if (iw < 0) continue;
-
-      // Check whether the receiving end of the path can handle the flow.
- double ow = getExecutionCount(Dest);
- Processed.clear();
- for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
- NBB != End; ++NBB) {
- if (Processed.insert(*NBB).second) {
- Edge e = getEdge(BB, *NBB);
- double ew = getEdgeWeight(e);
- if (ew != MissingValue) {
- ow -= ew;
- }
- }
- }
- if (ow < 0) continue;
-
- // Determine how much flow shall be used.
- double ew = getEdgeWeight(getEdge(P[Dest],Dest));
- if (ew != MissingValue) {
- ew = ew<ow?ew:ow;
- ew = ew<iw?ew:iw;
- } else {
- if (inmissing == 0)
- ew = iw;
- }
-
- // Create flow.
- if (ew != MissingValue) {
- do {
- Edge e = getEdge(P[Dest],Dest);
- if (getEdgeWeight(e) == MissingValue) {
- setEdgeWeight(e,ew);
- FoundPath = true;
- }
- Dest = P[Dest];
- } while (Dest != BB);
- }
- }
- if (FoundPath) continue;
-
- // Calculate a block with self loop.
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE && !FoundPath) {
- const BasicBlock *BB = *FI; ++FI;
- bool SelfEdgeFound = false;
- for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
- NBB != End; ++NBB) {
- if (*NBB == BB) {
- SelfEdgeFound = true;
- break;
- }
- }
- if (SelfEdgeFound) {
- Edge e = getEdge(BB,BB);
- if (getEdgeWeight(e) == MissingValue) {
- double iw = 0;
- std::set<const BasicBlock *> Processed;
- for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
- NBB != End; ++NBB) {
- if (Processed.insert(*NBB).second) {
- Edge e = getEdge(*NBB, BB);
- double ew = getEdgeWeight(e);
- if (ew != MissingValue) {
- iw += ew;
- }
- }
- }
- setEdgeWeight(e,iw * 10);
- FoundPath = true;
- }
- }
- }
- if (FoundPath) continue;
-
-    // Determine backedges and set them to zero.
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE && !FoundPath) {
- const BasicBlock *BB = *FI; ++FI;
- const BasicBlock *Dest = 0;
- Path P;
- bool BackEdgeFound = false;
- for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
- NBB != End; ++NBB) {
- Dest = GetPath(BB, *NBB, P, GetPathToDest | GetPathWithNewEdges);
- if (Dest == *NBB) {
- BackEdgeFound = true;
- break;
- }
- }
- if (BackEdgeFound) {
- Edge e = getEdge(Dest,BB);
- double w = getEdgeWeight(e);
- if (w == MissingValue) {
- setEdgeWeight(e,0);
- FoundPath = true;
- }
- do {
- Edge e = getEdge(P[Dest], Dest);
- double w = getEdgeWeight(e);
- if (w == MissingValue) {
- setEdgeWeight(e,0);
- FoundPath = true;
- }
- Dest = P[Dest];
- } while (Dest != BB);
- }
- }
- if (FoundPath) continue;
-
- // Channel flow to return block.
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE && !FoundPath) {
- const BasicBlock *BB = *FI; ++FI;
-
- Path P;
- const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToExit | GetPathWithNewEdges);
- Dest = P[(const BasicBlock*)0];
- if (!Dest) continue;
-
- if (getEdgeWeight(getEdge(Dest,0)) == MissingValue) {
- // Calculate incoming flow.
- double iw = 0;
- std::set<const BasicBlock *> Processed;
- for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
- NBB != End; ++NBB) {
- if (Processed.insert(*NBB).second) {
- Edge e = getEdge(*NBB, BB);
- double ew = getEdgeWeight(e);
- if (ew != MissingValue) {
- iw += ew;
- }
- }
- }
- do {
- Edge e = getEdge(P[Dest], Dest);
- double w = getEdgeWeight(e);
- if (w == MissingValue) {
- setEdgeWeight(e,iw);
- FoundPath = true;
- } else {
- assert(0 && "Edge should not have value already!");
- }
- Dest = P[Dest];
- } while (Dest != BB);
- }
- }
- if (FoundPath) continue;
-
- // Speculatively set edges to zero.
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE && !FoundPath) {
- const BasicBlock *BB = *FI; ++FI;
-
- for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
- NBB != End; ++NBB) {
- Edge e = getEdge(*NBB,BB);
- double w = getEdgeWeight(e);
- if (w == MissingValue) {
- setEdgeWeight(e,0);
- FoundPath = true;
- break;
- }
- }
- }
- if (FoundPath) continue;
-
- errs() << "{";
- FI = Unvisited.begin(), FE = Unvisited.end();
- while(FI != FE) {
- const BasicBlock *BB = *FI; ++FI;
- dbgs() << BB->getName();
- if (FI != FE)
- dbgs() << ",";
- }
- errs() << "}";
-
- errs() << "ASSERT: could not repair function";
- assert(0 && "could not repair function");
- }
-
- EdgeWeights J = EdgeInformation[F];
- for (EdgeWeights::iterator EI = J.begin(), EE = J.end(); EI != EE; ++EI) {
- Edge e = EI->first;
-
- bool SuccFound = false;
- if (e.first != 0) {
- succ_const_iterator NBB = succ_begin(e.first), End = succ_end(e.first);
- if (NBB == End) {
- if (0 == e.second) {
- SuccFound = true;
- }
- }
- for (;NBB != End; ++NBB) {
- if (*NBB == e.second) {
- SuccFound = true;
- break;
- }
- }
- if (!SuccFound) {
- removeEdge(e);
- }
- }
- }
-}
-
-raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
- return O << MF->getFunction()->getName() << "(MF)";
-}
-
-raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
- return O << MBB->getBasicBlock()->getName() << "(MB)";
-}
-
-raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
- O << "(";
-
- if (E.first)
- O << E.first;
- else
- O << "0";
-
- O << ",";
-
- if (E.second)
- O << E.second;
- else
- O << "0";
-
- return O << ")";
-}
-
-} // namespace llvm
-
-//===----------------------------------------------------------------------===//
-// NoProfile ProfileInfo implementation
-//
-
-namespace {
- struct NoProfileInfo : public ImmutablePass, public ProfileInfo {
- static char ID; // Class identification, replacement for typeinfo
- NoProfileInfo() : ImmutablePass(ID) {
- initializeNoProfileInfoPass(*PassRegistry::getPassRegistry());
- }
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it
- /// should override this to adjust the this pointer as needed for the
- /// specified pass info.
- virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
- if (PI == &ProfileInfo::ID)
- return (ProfileInfo*)this;
- return this;
- }
-
- virtual const char *getPassName() const {
- return "NoProfileInfo";
- }
- };
-} // End of anonymous namespace
-
-char NoProfileInfo::ID = 0;
-// Register this pass...
-INITIALIZE_AG_PASS(NoProfileInfo, ProfileInfo, "no-profile",
- "No Profile Information", false, true, true)
-
-ImmutablePass *llvm::createNoProfileInfoPass() { return new NoProfileInfo(); }
diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp
deleted file mode 100644
index f1f3e94..0000000
--- a/lib/Analysis/ProfileInfoLoader.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===- ProfileInfoLoad.cpp - Load profile information from disk -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileInfoLoader class is used to load and represent profiling
-// information read in from the dump file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-// ByteSwap - Byteswap 'Var' if 'Really' is true.
-//
-static inline unsigned ByteSwap(unsigned Var, bool Really) {
- if (!Really) return Var;
- return ((Var & (255U<< 0U)) << 24U) |
- ((Var & (255U<< 8U)) << 8U) |
- ((Var & (255U<<16U)) >> 8U) |
- ((Var & (255U<<24U)) >> 24U);
-}
-
-static unsigned AddCounts(unsigned A, unsigned B) {
- // If either value is undefined, use the other.
- if (A == ProfileInfoLoader::Uncounted) return B;
- if (B == ProfileInfoLoader::Uncounted) return A;
- return A + B;
-}
-
-static void ReadProfilingBlock(const char *ToolName, FILE *F,
- bool ShouldByteSwap,
- std::vector<unsigned> &Data) {
- // Read the number of entries...
- unsigned NumEntries;
- if (fread(&NumEntries, sizeof(unsigned), 1, F) != 1) {
- errs() << ToolName << ": data packet truncated!\n";
- perror(0);
- exit(1);
- }
- NumEntries = ByteSwap(NumEntries, ShouldByteSwap);
-
- // Read the counts...
- std::vector<unsigned> TempSpace(NumEntries);
-
- // Read in the block of data...
- if (fread(&TempSpace[0], sizeof(unsigned)*NumEntries, 1, F) != 1) {
- errs() << ToolName << ": data packet truncated!\n";
- perror(0);
- exit(1);
- }
-
- // Make sure we have enough space... The space is initialised to -1 to
-  // facilitate the loading of missing values for OptimalEdgeProfiling.
- if (Data.size() < NumEntries)
- Data.resize(NumEntries, ProfileInfoLoader::Uncounted);
-
-  // Accumulate the data we just read into the Data vector.
- if (!ShouldByteSwap) {
- for (unsigned i = 0; i != NumEntries; ++i) {
- Data[i] = AddCounts(TempSpace[i], Data[i]);
- }
- } else {
- for (unsigned i = 0; i != NumEntries; ++i) {
- Data[i] = AddCounts(ByteSwap(TempSpace[i], true), Data[i]);
- }
- }
-}
-
-const unsigned ProfileInfoLoader::Uncounted = ~0U;
-
-// ProfileInfoLoader ctor - Read the specified profiling data file, exiting the
-// program if the file is invalid or broken.
-//
-ProfileInfoLoader::ProfileInfoLoader(const char *ToolName,
- const std::string &Filename)
- : Filename(Filename) {
- FILE *F = fopen(Filename.c_str(), "rb");
- if (F == 0) {
- errs() << ToolName << ": Error opening '" << Filename << "': ";
- perror(0);
- exit(1);
- }
-
- // Keep reading packets until we run out of them.
- unsigned PacketType;
- while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
- // If the low eight bits of the packet are zero, we must be dealing with an
- // endianness mismatch. Byteswap all words read from the profiling
- // information.
- bool ShouldByteSwap = (char)PacketType == 0;
- PacketType = ByteSwap(PacketType, ShouldByteSwap);
-
- switch (PacketType) {
- case ArgumentInfo: {
- unsigned ArgLength;
- if (fread(&ArgLength, sizeof(unsigned), 1, F) != 1) {
- errs() << ToolName << ": arguments packet truncated!\n";
- perror(0);
- exit(1);
- }
- ArgLength = ByteSwap(ArgLength, ShouldByteSwap);
-
- // Read in the arguments...
- std::vector<char> Chars(ArgLength+4);
-
- if (ArgLength)
- if (fread(&Chars[0], (ArgLength+3) & ~3, 1, F) != 1) {
- errs() << ToolName << ": arguments packet truncated!\n";
- perror(0);
- exit(1);
- }
- CommandLines.push_back(std::string(&Chars[0], &Chars[ArgLength]));
- break;
- }
-
- case FunctionInfo:
- ReadProfilingBlock(ToolName, F, ShouldByteSwap, FunctionCounts);
- break;
-
- case BlockInfo:
- ReadProfilingBlock(ToolName, F, ShouldByteSwap, BlockCounts);
- break;
-
- case EdgeInfo:
- ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
- break;
-
- case OptEdgeInfo:
- ReadProfilingBlock(ToolName, F, ShouldByteSwap, OptimalEdgeCounts);
- break;
-
- case BBTraceInfo:
- ReadProfilingBlock(ToolName, F, ShouldByteSwap, BBTrace);
- break;
-
- default:
- errs() << ToolName << ": Unknown packet type #" << PacketType << "!\n";
- exit(1);
- }
- }
-
- fclose(F);
-}
-
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp
deleted file mode 100644
index 346f8d6..0000000
--- a/lib/Analysis/ProfileInfoLoaderPass.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-//===- ProfileInfoLoaderPass.cpp - LLVM Pass to load profile info ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a concrete implementation of profiling information that
-// loads the information from a profile dump file.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-loader"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumEdgesRead, "The # of edges read.");
-
-static cl::opt<std::string>
-ProfileInfoFilename("profile-info-file", cl::init("llvmprof.out"),
- cl::value_desc("filename"),
- cl::desc("Profile file loaded by -profile-loader"));
-
-namespace {
- class LoaderPass : public ModulePass, public ProfileInfo {
- std::string Filename;
- std::set<Edge> SpanningTree;
- std::set<const BasicBlock*> BBisUnvisited;
- unsigned ReadCount;
- public:
- static char ID; // Class identification, replacement for typeinfo
- explicit LoaderPass(const std::string &filename = "")
- : ModulePass(ID), Filename(filename) {
- initializeLoaderPassPass(*PassRegistry::getPassRegistry());
- if (filename.empty()) Filename = ProfileInfoFilename;
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- }
-
- virtual const char *getPassName() const {
- return "Profiling information loader";
- }
-
-    // recurseBasicBlock() - Calculates the edge weights for as many basic
-    // blocks as possible.
- virtual void recurseBasicBlock(const BasicBlock *BB);
- virtual void readEdgeOrRemember(Edge, Edge&, unsigned &, double &);
- virtual void readEdge(ProfileInfo::Edge, std::vector<unsigned>&);
-
- /// getAdjustedAnalysisPointer - This method is used when a pass implements
- /// an analysis interface through multiple inheritance. If needed, it
- /// should override this to adjust the this pointer as needed for the
- /// specified pass info.
- virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
- if (PI == &ProfileInfo::ID)
- return (ProfileInfo*)this;
- return this;
- }
-
- /// run - Load the profile information from the specified file.
- virtual bool runOnModule(Module &M);
- };
-} // End of anonymous namespace
-
-char LoaderPass::ID = 0;
-INITIALIZE_AG_PASS(LoaderPass, ProfileInfo, "profile-loader",
- "Load profile information from llvmprof.out", false, true, false)
-
-char &llvm::ProfileLoaderPassID = LoaderPass::ID;
-
-ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); }
-
-/// createProfileLoaderPass - This function returns a Pass that loads the
-/// profiling information for the module from the specified filename, making it
-/// available to the optimizers.
-Pass *llvm::createProfileLoaderPass(const std::string &Filename) {
- return new LoaderPass(Filename);
-}
-
-void LoaderPass::readEdgeOrRemember(Edge edge, Edge &tocalc,
- unsigned &uncalc, double &count) {
- double w;
- if ((w = getEdgeWeight(edge)) == MissingValue) {
- tocalc = edge;
- uncalc++;
- } else {
- count+=w;
- }
-}
-
-// recurseBasicBlock - Visits all neighbours of a block and then tries to
-// calculate the missing edge values.
-void LoaderPass::recurseBasicBlock(const BasicBlock *BB) {
-
- // break recursion if already visited
- if (BBisUnvisited.find(BB) == BBisUnvisited.end()) return;
- BBisUnvisited.erase(BB);
- if (!BB) return;
-
- for (succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
- bbi != bbe; ++bbi) {
- recurseBasicBlock(*bbi);
- }
- for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
- bbi != bbe; ++bbi) {
- recurseBasicBlock(*bbi);
- }
-
- Edge tocalc;
- if (CalculateMissingEdge(BB, tocalc)) {
- SpanningTree.erase(tocalc);
- }
-}
-
-void LoaderPass::readEdge(ProfileInfo::Edge e,
- std::vector<unsigned> &ECs) {
- if (ReadCount < ECs.size()) {
- double weight = ECs[ReadCount++];
- if (weight != ProfileInfoLoader::Uncounted) {
- // Here the data realm changes from the unsigned of the file to the
-      // double of the ProfileInfo. This conversion is safe because we know
-      // that everything that's representable in unsigned is also representable
- // in double.
- EdgeInformation[getFunction(e)][e] += (double)weight;
-
- DEBUG(dbgs() << "--Read Edge Counter for " << e
- << " (# "<< (ReadCount-1) << "): "
- << (unsigned)getEdgeWeight(e) << "\n");
- } else {
- // This happens only if reading optimal profiling information, not when
- // reading regular profiling information.
- SpanningTree.insert(e);
- }
- }
-}
-
-bool LoaderPass::runOnModule(Module &M) {
- ProfileInfoLoader PIL("profile-loader", Filename);
-
- EdgeInformation.clear();
- std::vector<unsigned> Counters = PIL.getRawEdgeCounts();
- if (Counters.size() > 0) {
- ReadCount = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- DEBUG(dbgs() << "Working on " << F->getName() << "\n");
- readEdge(getEdge(0,&F->getEntryBlock()), Counters);
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
- for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
- readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
- }
- }
- }
- if (ReadCount != Counters.size()) {
- errs() << "WARNING: profile information is inconsistent with "
- << "the current program!\n";
- }
- NumEdgesRead = ReadCount;
- }
-
- Counters = PIL.getRawOptimalEdgeCounts();
- if (Counters.size() > 0) {
- ReadCount = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- DEBUG(dbgs() << "Working on " << F->getName() << "\n");
- readEdge(getEdge(0,&F->getEntryBlock()), Counters);
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
- if (TI->getNumSuccessors() == 0) {
- readEdge(getEdge(BB,0), Counters);
- }
- for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
- readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
- }
- }
- while (SpanningTree.size() > 0) {
-
- unsigned size = SpanningTree.size();
-
- BBisUnvisited.clear();
- for (std::set<Edge>::iterator ei = SpanningTree.begin(),
- ee = SpanningTree.end(); ei != ee; ++ei) {
- BBisUnvisited.insert(ei->first);
- BBisUnvisited.insert(ei->second);
- }
- while (BBisUnvisited.size() > 0) {
- recurseBasicBlock(*BBisUnvisited.begin());
- }
-
- if (SpanningTree.size() == size) {
- DEBUG(dbgs()<<"{");
- for (std::set<Edge>::iterator ei = SpanningTree.begin(),
- ee = SpanningTree.end(); ei != ee; ++ei) {
- DEBUG(dbgs()<< *ei <<",");
- }
- assert(0 && "No edge calculated!");
- }
-
- }
- }
- if (ReadCount != Counters.size()) {
- errs() << "WARNING: profile information is inconsistent with "
- << "the current program!\n";
- }
- NumEdgesRead = ReadCount;
- }
-
- BlockInformation.clear();
- Counters = PIL.getRawBlockCounts();
- if (Counters.size() > 0) {
- ReadCount = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- if (ReadCount < Counters.size())
- // Here the data realm changes from the unsigned of the file to the
-          // double of the ProfileInfo. This conversion is safe because we know
-          // that everything that's representable in unsigned is also
- // representable in double.
- BlockInformation[F][BB] = (double)Counters[ReadCount++];
- }
- if (ReadCount != Counters.size()) {
- errs() << "WARNING: profile information is inconsistent with "
- << "the current program!\n";
- }
- }
-
- FunctionInformation.clear();
- Counters = PIL.getRawFunctionCounts();
- if (Counters.size() > 0) {
- ReadCount = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- if (ReadCount < Counters.size())
- // Here the data realm changes from the unsigned of the file to the
-        // double of the ProfileInfo. This conversion is safe because we know
-        // that everything that's representable in unsigned is also
- // representable in double.
- FunctionInformation[F] = (double)Counters[ReadCount++];
- }
- if (ReadCount != Counters.size()) {
- errs() << "WARNING: profile information is inconsistent with "
- << "the current program!\n";
- }
- }
-
- return false;
-}
diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp
deleted file mode 100644
index c8896de..0000000
--- a/lib/Analysis/ProfileVerifierPass.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-//===- ProfileVerifierPass.cpp - LLVM Pass to estimate profile info -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a pass that checks profiling information for
-// plausibility.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-verifier"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/InstIterator.h"
-#include "llvm/Support/raw_ostream.h"
-#include <set>
-using namespace llvm;
-
-static cl::opt<bool,false>
-ProfileVerifierDisableAssertions("profile-verifier-noassert",
- cl::desc("Disable assertions"));
-
-namespace {
- template<class FType, class BType>
- class ProfileVerifierPassT : public FunctionPass {
-
- struct DetailedBlockInfo {
- const BType *BB;
- double BBWeight;
- double inWeight;
- int inCount;
- double outWeight;
- int outCount;
- };
-
- ProfileInfoT<FType, BType> *PI;
- std::set<const BType*> BBisVisited;
- std::set<const FType*> FisVisited;
- bool DisableAssertions;
-
- // When debugging is enabled, the verifier prints a whole slew of debug
-    // information, otherwise it's just the assert. These are all the helper
- // functions.
- bool PrintedDebugTree;
- std::set<const BType*> BBisPrinted;
- void debugEntry(DetailedBlockInfo*);
- void printDebugInfo(const BType *BB);
-
- public:
- static char ID; // Class identification, replacement for typeinfo
-
- explicit ProfileVerifierPassT () : FunctionPass(ID) {
- initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
- DisableAssertions = ProfileVerifierDisableAssertions;
- }
- explicit ProfileVerifierPassT (bool da) : FunctionPass(ID),
- DisableAssertions(da) {
- initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequired<ProfileInfoT<FType, BType> >();
- }
-
- const char *getPassName() const {
- return "Profiling information verifier";
- }
-
- /// run - Verify the profile information.
- bool runOnFunction(FType &F);
- void recurseBasicBlock(const BType*);
-
- bool exitReachable(const FType*);
- double ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge);
- void CheckValue(bool, const char*, DetailedBlockInfo*);
- };
-
- typedef ProfileVerifierPassT<Function, BasicBlock> ProfileVerifierPass;
-
- template<class FType, class BType>
- void ProfileVerifierPassT<FType, BType>::printDebugInfo(const BType *BB) {
-
- if (BBisPrinted.find(BB) != BBisPrinted.end()) return;
-
- double BBWeight = PI->getExecutionCount(BB);
- if (BBWeight == ProfileInfoT<FType, BType>::MissingValue) { BBWeight = 0; }
- double inWeight = 0;
- int inCount = 0;
- std::set<const BType*> ProcessedPreds;
- for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
- bbi != bbe; ++bbi ) {
- if (ProcessedPreds.insert(*bbi).second) {
- typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(*bbi,BB);
- double EdgeWeight = PI->getEdgeWeight(E);
- if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
- dbgs() << "calculated in-edge " << E << ": "
- << format("%20.20g",EdgeWeight) << "\n";
- inWeight += EdgeWeight;
- inCount++;
- }
- }
- double outWeight = 0;
- int outCount = 0;
- std::set<const BType*> ProcessedSuccs;
- for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
- bbi != bbe; ++bbi ) {
- if (ProcessedSuccs.insert(*bbi).second) {
- typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(BB,*bbi);
- double EdgeWeight = PI->getEdgeWeight(E);
- if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
- dbgs() << "calculated out-edge " << E << ": "
- << format("%20.20g",EdgeWeight) << "\n";
- outWeight += EdgeWeight;
- outCount++;
- }
- }
- dbgs() << "Block " << BB->getName() << " in "
- << BB->getParent()->getName() << ":"
- << "BBWeight=" << format("%20.20g",BBWeight) << ","
- << "inWeight=" << format("%20.20g",inWeight) << ","
- << "inCount=" << inCount << ","
- << "outWeight=" << format("%20.20g",outWeight) << ","
- << "outCount" << outCount << "\n";
-
- // mark as visited and recurse into subnodes
- BBisPrinted.insert(BB);
- for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
- bbi != bbe; ++bbi ) {
- printDebugInfo(*bbi);
- }
- }
-
- template<class FType, class BType>
- void ProfileVerifierPassT<FType, BType>::debugEntry (DetailedBlockInfo *DI) {
- dbgs() << "TROUBLE: Block " << DI->BB->getName() << " in "
- << DI->BB->getParent()->getName() << ":"
- << "BBWeight=" << format("%20.20g",DI->BBWeight) << ","
- << "inWeight=" << format("%20.20g",DI->inWeight) << ","
- << "inCount=" << DI->inCount << ","
- << "outWeight=" << format("%20.20g",DI->outWeight) << ","
- << "outCount=" << DI->outCount << "\n";
- if (!PrintedDebugTree) {
- PrintedDebugTree = true;
- printDebugInfo(&(DI->BB->getParent()->getEntryBlock()));
- }
- }
-
- // This compares A and B for equality.
- static bool Equals(double A, double B) {
- return A == B;
- }
-
-  // This checks if the function "exit" is reachable from a given function
-  // via calls; this is necessary to check if a profile is valid despite the
- // counts not fitting exactly.
- template<class FType, class BType>
- bool ProfileVerifierPassT<FType, BType>::exitReachable(const FType *F) {
- if (!F) return false;
-
- if (FisVisited.count(F)) return false;
-
- FType *Exit = F->getParent()->getFunction("exit");
- if (Exit == F) {
- return true;
- }
-
- FisVisited.insert(F);
- bool exits = false;
- for (const_inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
- if (const CallInst *CI = dyn_cast<CallInst>(&*I)) {
- FType *F = CI->getCalledFunction();
- if (F) {
- exits |= exitReachable(F);
- } else {
- // This is a call to a pointer, all bets are off...
- exits = true;
- }
- if (exits) break;
- }
- }
- return exits;
- }
-
- #define ASSERTMESSAGE(M) \
- { dbgs() << "ASSERT:" << (M) << "\n"; \
- if (!DisableAssertions) assert(0 && (M)); }
-
- template<class FType, class BType>
- double ProfileVerifierPassT<FType, BType>::ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge E) {
- double EdgeWeight = PI->getEdgeWeight(E);
- if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) {
- dbgs() << "Edge " << E << " in Function "
- << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
- ASSERTMESSAGE("Edge has missing value");
- return 0;
- } else {
- if (EdgeWeight < 0) {
- dbgs() << "Edge " << E << " in Function "
- << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
- ASSERTMESSAGE("Edge has negative value");
- }
- return EdgeWeight;
- }
- }
-
- template<class FType, class BType>
- void ProfileVerifierPassT<FType, BType>::CheckValue(bool Error,
- const char *Message,
- DetailedBlockInfo *DI) {
- if (Error) {
- DEBUG(debugEntry(DI));
- dbgs() << "Block " << DI->BB->getName() << " in Function "
- << DI->BB->getParent()->getName() << ": ";
- ASSERTMESSAGE(Message);
- }
- return;
- }
-
- // This calculates the Information for a block and then recurses into the
- // successors.
- template<class FType, class BType>
- void ProfileVerifierPassT<FType, BType>::recurseBasicBlock(const BType *BB) {
-
- // Break the recursion by remembering all visited blocks.
- if (BBisVisited.find(BB) != BBisVisited.end()) return;
-
-  // Use a data structure to store all the information; this can then be handed
- // to debug printers.
- DetailedBlockInfo DI;
- DI.BB = BB;
- DI.outCount = DI.inCount = 0;
- DI.inWeight = DI.outWeight = 0;
-
- // Read predecessors.
- std::set<const BType*> ProcessedPreds;
- const_pred_iterator bpi = pred_begin(BB), bpe = pred_end(BB);
- // If there are none, check for (0,BB) edge.
- if (bpi == bpe) {
- DI.inWeight += ReadOrAssert(PI->getEdge(0,BB));
- DI.inCount++;
- }
- for (;bpi != bpe; ++bpi) {
- if (ProcessedPreds.insert(*bpi).second) {
- DI.inWeight += ReadOrAssert(PI->getEdge(*bpi,BB));
- DI.inCount++;
- }
- }
-
- // Read successors.
- std::set<const BType*> ProcessedSuccs;
- succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-  // If there is a (0,BB) edge, consider it too. (This is done not only when
- // there are no successors, but every time; not every function contains
- // return blocks with no successors (think loop latch as return block)).
- double w = PI->getEdgeWeight(PI->getEdge(BB,0));
- if (w != ProfileInfoT<FType, BType>::MissingValue) {
- DI.outWeight += w;
- DI.outCount++;
- }
- for (;bbi != bbe; ++bbi) {
- if (ProcessedSuccs.insert(*bbi).second) {
- DI.outWeight += ReadOrAssert(PI->getEdge(BB,*bbi));
- DI.outCount++;
- }
- }
-
- // Read block weight.
- DI.BBWeight = PI->getExecutionCount(BB);
- CheckValue(DI.BBWeight == ProfileInfoT<FType, BType>::MissingValue,
- "BasicBlock has missing value", &DI);
- CheckValue(DI.BBWeight < 0,
- "BasicBlock has negative value", &DI);
-
- // Check if this block is a setjmp target.
- bool isSetJmpTarget = false;
- if (DI.outWeight > DI.inWeight) {
- for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
- i != ie; ++i) {
- if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
- FType *F = CI->getCalledFunction();
- if (F && (F->getName() == "_setjmp")) {
- isSetJmpTarget = true; break;
- }
- }
- }
- }
-  // Check if this block eventually reaches exit.
- bool isExitReachable = false;
- if (DI.inWeight > DI.outWeight) {
- for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
- i != ie; ++i) {
- if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
- FType *F = CI->getCalledFunction();
- if (F) {
- FisVisited.clear();
- isExitReachable |= exitReachable(F);
- } else {
- // This is a call to a pointer, all bets are off...
- isExitReachable = true;
- }
- if (isExitReachable) break;
- }
- }
- }
-
- if (DI.inCount > 0 && DI.outCount == 0) {
- // If this is a block with no successors.
- if (!isSetJmpTarget) {
- CheckValue(!Equals(DI.inWeight,DI.BBWeight),
- "inWeight and BBWeight do not match", &DI);
- }
- } else if (DI.inCount == 0 && DI.outCount > 0) {
- // If this is a block with no predecessors.
- if (!isExitReachable)
- CheckValue(!Equals(DI.BBWeight,DI.outWeight),
- "BBWeight and outWeight do not match", &DI);
- } else {
- // If this block has successors and predecessors.
- if (DI.inWeight > DI.outWeight && !isExitReachable)
- CheckValue(!Equals(DI.inWeight,DI.outWeight),
- "inWeight and outWeight do not match", &DI);
- if (DI.inWeight < DI.outWeight && !isSetJmpTarget)
- CheckValue(!Equals(DI.inWeight,DI.outWeight),
- "inWeight and outWeight do not match", &DI);
- }
-
-
-  // Mark this block as visited, recurse into successors.
- BBisVisited.insert(BB);
- for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
- bbi != bbe; ++bbi ) {
- recurseBasicBlock(*bbi);
- }
- }
-
- template<class FType, class BType>
- bool ProfileVerifierPassT<FType, BType>::runOnFunction(FType &F) {
- PI = getAnalysisIfAvailable<ProfileInfoT<FType, BType> >();
- if (!PI)
- ASSERTMESSAGE("No ProfileInfo available");
-
- // Prepare global variables.
- PrintedDebugTree = false;
- BBisVisited.clear();
-
- // Fetch entry block and recurse into it.
- const BType *entry = &F.getEntryBlock();
- recurseBasicBlock(entry);
-
- if (PI->getExecutionCount(&F) != PI->getExecutionCount(entry))
- ASSERTMESSAGE("Function count and entry block count do not match");
-
- return false;
- }
-
- template<class FType, class BType>
- char ProfileVerifierPassT<FType, BType>::ID = 0;
-}
-
-INITIALIZE_PASS_BEGIN(ProfileVerifierPass, "profile-verifier",
- "Verify profiling information", false, true)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(ProfileVerifierPass, "profile-verifier",
- "Verify profiling information", false, true)
-
-namespace llvm {
- FunctionPass *createProfileVerifierPass() {
- return new ProfileVerifierPass(ProfileVerifierDisableAssertions);
- }
-}
-
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 8577025..5635688 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -9,6 +9,7 @@
// Detects single entry single exit regions in the control flow graph.
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "region"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
@@ -17,12 +18,9 @@
#include "llvm/Assembly/Writer.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-
-#define DEBUG_TYPE "region"
#include "llvm/Support/Debug.h"
-
-#include <set>
#include <algorithm>
+#include <set>
using namespace llvm;
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index f5d095b..0a02f4e 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -2590,55 +2590,39 @@ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
}
-const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) {
+const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
// If we have DataLayout, we can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
if (TD)
- return getConstant(TD->getIntPtrType(getContext()),
- TD->getTypeAllocSize(AllocTy));
+ return getConstant(IntTy, TD->getTypeAllocSize(AllocTy));
Constant *C = ConstantExpr::getSizeOf(AllocTy);
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
C = Folded;
Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+ assert(Ty == IntTy && "Effective SCEV type doesn't match");
return getTruncateOrZeroExtend(getSCEV(C), Ty);
}
-const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) {
- Constant *C = ConstantExpr::getAlignOf(AllocTy);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
- C = Folded;
- Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
- return getTruncateOrZeroExtend(getSCEV(C), Ty);
-}
-
-const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy,
+const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy,
+ StructType *STy,
unsigned FieldNo) {
// If we have DataLayout, we can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
- if (TD)
- return getConstant(TD->getIntPtrType(getContext()),
+ if (TD) {
+ return getConstant(IntTy,
TD->getStructLayout(STy)->getElementOffset(FieldNo));
+ }
Constant *C = ConstantExpr::getOffsetOf(STy, FieldNo);
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
C = Folded;
- Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
- return getTruncateOrZeroExtend(getSCEV(C), Ty);
-}
-const SCEV *ScalarEvolution::getOffsetOfExpr(Type *CTy,
- Constant *FieldNo) {
- Constant *C = ConstantExpr::getOffsetOf(CTy, FieldNo);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
- C = Folded;
- Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy));
+ Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
return getTruncateOrZeroExtend(getSCEV(C), Ty);
}
@@ -2703,12 +2687,15 @@ uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
- if (Ty->isIntegerTy())
+ if (Ty->isIntegerTy()) {
return Ty;
+ }
   // The only other supported type is a pointer.
assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!");
- if (TD) return TD->getIntPtrType(getContext());
+
+ if (TD)
+ return TD->getIntPtrType(Ty);
// Without DataLayout, conservatively assume pointers are 64-bit.
return Type::getInt64Ty(getContext());
@@ -3101,15 +3088,26 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
Flags = setFlags(Flags, SCEV::FlagNUW);
if (OBO->hasNoSignedWrap())
Flags = setFlags(Flags, SCEV::FlagNSW);
- } else if (const GEPOperator *GEP =
- dyn_cast<GEPOperator>(BEValueV)) {
+ } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
// If the increment is an inbounds GEP, then we know the address
// space cannot be wrapped around. We cannot make any guarantee
// about signed or unsigned overflow because pointers are
// unsigned but we may have a negative index from the base
- // pointer.
- if (GEP->isInBounds())
+ // pointer. We can guarantee that no unsigned wrap occurs if the
+ // indices form a positive value.
+ if (GEP->isInBounds()) {
Flags = setFlags(Flags, SCEV::FlagNW);
+
+ const SCEV *Ptr = getSCEV(GEP->getPointerOperand());
+ if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
+ Flags = setFlags(Flags, SCEV::FlagNUW);
+ }
+ } else if (const SubOperator *OBO =
+ dyn_cast<SubOperator>(BEValueV)) {
+ if (OBO->hasNoUnsignedWrap())
+ Flags = setFlags(Flags, SCEV::FlagNUW);
+ if (OBO->hasNoSignedWrap())
+ Flags = setFlags(Flags, SCEV::FlagNSW);
}
const SCEV *StartVal = getSCEV(StartValueV);
@@ -3177,18 +3175,18 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
/// operations. This allows them to be analyzed by regular SCEV code.
///
const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
+ Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
+ Value *Base = GEP->getOperand(0);
+ // Don't attempt to analyze GEPs over unsized objects.
+ if (!Base->getType()->getPointerElementType()->isSized())
+ return getUnknown(GEP);
// Don't blindly transfer the inbounds flag from the GEP instruction to the
// Add expression, because the Instruction may be guarded by control flow
// and the no-overflow bits may not be valid for the expression in any
// context.
- bool isInBounds = GEP->isInBounds();
+ SCEV::NoWrapFlags Wrap = GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
- Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
- Value *Base = GEP->getOperand(0);
- // Don't attempt to analyze GEPs over unsized objects.
- if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
- return getUnknown(GEP);
const SCEV *TotalOffset = getConstant(IntPtrTy, 0);
gep_type_iterator GTI = gep_type_begin(GEP);
for (GetElementPtrInst::op_iterator I = llvm::next(GEP->op_begin()),
@@ -3199,21 +3197,19 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
// For a struct, add the member offset.
unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
- const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo);
+ const SCEV *FieldOffset = getOffsetOfExpr(IntPtrTy, STy, FieldNo);
// Add the field offset to the running total offset.
TotalOffset = getAddExpr(TotalOffset, FieldOffset);
} else {
// For an array, add the element offset, explicitly scaled.
- const SCEV *ElementSize = getSizeOfExpr(*GTI);
+ const SCEV *ElementSize = getSizeOfExpr(IntPtrTy, *GTI);
const SCEV *IndexS = getSCEV(Index);
// Getelementptr indices are signed.
IndexS = getTruncateOrSignExtend(IndexS, IntPtrTy);
// Multiply the index by the element size to compute the element offset.
- const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize,
- isInBounds ? SCEV::FlagNSW :
- SCEV::FlagAnyWrap);
+ const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize, Wrap);
// Add the element offset to the running total offset.
TotalOffset = getAddExpr(TotalOffset, LocalOffset);
@@ -3224,8 +3220,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
const SCEV *BaseS = getSCEV(Base);
// Add the total offset from all the GEP indices to the base.
- return getAddExpr(BaseS, TotalOffset,
- isInBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap);
+ return getAddExpr(BaseS, TotalOffset, Wrap);
}
/// GetMinTrailingZeros - Determine the minimum number of zero bits that S is
@@ -4616,25 +4611,17 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
if (EL.hasAnyInfo()) return EL;
break;
}
- case ICmpInst::ICMP_SLT: {
- ExitLimit EL = HowManyLessThans(LHS, RHS, L, true, IsSubExpr);
- if (EL.hasAnyInfo()) return EL;
- break;
- }
- case ICmpInst::ICMP_SGT: {
- ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
- getNotSCEV(RHS), L, true, IsSubExpr);
- if (EL.hasAnyInfo()) return EL;
- break;
- }
- case ICmpInst::ICMP_ULT: {
- ExitLimit EL = HowManyLessThans(LHS, RHS, L, false, IsSubExpr);
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_ULT: { // while (X < Y)
+ bool IsSigned = Cond == ICmpInst::ICMP_SLT;
+ ExitLimit EL = HowManyLessThans(LHS, RHS, L, IsSigned, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
- case ICmpInst::ICMP_UGT: {
- ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
- getNotSCEV(RHS), L, false, IsSubExpr);
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_UGT: { // while (X > Y)
+ bool IsSigned = Cond == ICmpInst::ICMP_SGT;
+ ExitLimit EL = HowManyGreaterThans(LHS, RHS, L, IsSigned, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
@@ -5072,15 +5059,21 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
/// original value V is returned.
const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
// Check to see if we've folded this expression at this loop before.
- std::map<const Loop *, const SCEV *> &Values = ValuesAtScopes[V];
- std::pair<std::map<const Loop *, const SCEV *>::iterator, bool> Pair =
- Values.insert(std::make_pair(L, static_cast<const SCEV *>(0)));
- if (!Pair.second)
- return Pair.first->second ? Pair.first->second : V;
-
+ SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = ValuesAtScopes[V];
+ for (unsigned u = 0; u < Values.size(); u++) {
+ if (Values[u].first == L)
+ return Values[u].second ? Values[u].second : V;
+ }
+ Values.push_back(std::make_pair(L, static_cast<const SCEV *>(0)));
// Otherwise compute it.
const SCEV *C = computeSCEVAtScope(V, L);
- ValuesAtScopes[V][L] = C;
+ SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values2 = ValuesAtScopes[V];
+ for (unsigned u = Values2.size(); u > 0; u--) {
+ if (Values2[u - 1].first == L) {
+ Values2[u - 1].second = C;
+ break;
+ }
+ }
return C;
}
@@ -5119,18 +5112,23 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
case scAddExpr: {
const SCEVAddExpr *SA = cast<SCEVAddExpr>(V);
if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) {
- if (C->getType()->isPointerTy())
- C = ConstantExpr::getBitCast(C, Type::getInt8PtrTy(C->getContext()));
+ if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
+ unsigned AS = PTy->getAddressSpace();
+ Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
+ C = ConstantExpr::getBitCast(C, DestPtrTy);
+ }
for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) {
Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i));
if (!C2) return 0;
// First pointer!
if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) {
+ unsigned AS = C2->getType()->getPointerAddressSpace();
std::swap(C, C2);
+ Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
// The offsets have been converted to bytes. We can add bytes to an
// i8* by GEP with the byte count in the first index.
- C = ConstantExpr::getBitCast(C,Type::getInt8PtrTy(C->getContext()));
+ C = ConstantExpr::getBitCast(C, DestPtrTy);
}
// Don't bother trying to sum two pointers. We probably can't
@@ -5138,8 +5136,8 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
if (C2->getType()->isPointerTy())
return 0;
- if (C->getType()->isPointerTy()) {
- if (cast<PointerType>(C->getType())->getElementType()->isStructTy())
+ if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
+ if (PTy->getElementType()->isStructTy())
C2 = ConstantExpr::getIntegerCast(
C2, Type::getInt32Ty(C->getContext()), true);
C = ConstantExpr::getGetElementPtr(C, C2);
@@ -6336,45 +6334,72 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
return false;
}
-/// getBECount - Subtract the end and start values and divide by the step,
-/// rounding up, to get the number of times the backedge is executed. Return
-/// CouldNotCompute if an intermediate computation overflows.
-const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
- const SCEV *End,
- const SCEV *Step,
- bool NoWrap) {
- assert(!isKnownNegative(Step) &&
- "This code doesn't handle negative strides yet!");
-
- Type *Ty = Start->getType();
-
- // When Start == End, we have an exact BECount == 0. Short-circuit this case
- // here because SCEV may not be able to determine that the unsigned division
- // after rounding is zero.
- if (Start == End)
- return getConstant(Ty, 0);
-
- const SCEV *NegOne = getConstant(Ty, (uint64_t)-1);
- const SCEV *Diff = getMinusSCEV(End, Start);
- const SCEV *RoundUp = getAddExpr(Step, NegOne);
-
- // Add an adjustment to the difference between End and Start so that
- // the division will effectively round up.
- const SCEV *Add = getAddExpr(Diff, RoundUp);
-
- if (!NoWrap) {
- // Check Add for unsigned overflow.
- // TODO: More sophisticated things could be done here.
- Type *WideTy = IntegerType::get(getContext(),
- getTypeSizeInBits(Ty) + 1);
- const SCEV *EDiff = getZeroExtendExpr(Diff, WideTy);
- const SCEV *ERoundUp = getZeroExtendExpr(RoundUp, WideTy);
- const SCEV *OperandExtendedAdd = getAddExpr(EDiff, ERoundUp);
- if (getZeroExtendExpr(Add, WideTy) != OperandExtendedAdd)
- return getCouldNotCompute();
+// Verify whether a linear IV with a positive stride can overflow when used
+// in a less-than comparison, given the invariant term of the comparison, the
+// stride, and any NSW/NUW flags on the recurrence.
+bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
+ bool IsSigned, bool NoWrap) {
+ if (NoWrap) return false;
+
+ unsigned BitWidth = getTypeSizeInBits(RHS->getType());
+ const SCEV *One = getConstant(Stride->getType(), 1);
+
+ if (IsSigned) {
+ APInt MaxRHS = getSignedRange(RHS).getSignedMax();
+ APInt MaxValue = APInt::getSignedMaxValue(BitWidth);
+ APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
+ .getSignedMax();
+
+ // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
+ return (MaxValue - MaxStrideMinusOne).slt(MaxRHS);
+ }
+
+ APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
+ APInt MaxValue = APInt::getMaxValue(BitWidth);
+ APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
+ .getUnsignedMax();
+
+ // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
+ return (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
+}
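
A minimal standalone sketch of the unsigned branch of this check, assuming the LLVM ADT headers are available; the bit width, bound and stride below are illustrative values, not taken from the patch:

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  using namespace llvm;
  const unsigned BitWidth = 8;
  APInt MaxRHS(BitWidth, 250);                    // unsigned max of the invariant bound
  APInt MaxStrideMinusOne(BitWidth, 10 - 1);      // unsigned max of (Stride - 1)
  APInt MaxValue = APInt::getMaxValue(BitWidth);  // 255

  // Same test as the unsigned path above:
  // UMaxRHS + UMaxStrideMinusOne > UMaxValue  =>  the IV may wrap.
  bool MayWrap = (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
  assert(MayWrap && "250 + 9 exceeds 255, so overflow cannot be excluded");
  return 0;
}
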
+
+// Verify whether a linear IV with a negative stride can overflow when used
+// in a greater-than comparison, given the invariant term of the comparison,
+// the stride, and any NSW/NUW flags on the recurrence.
+bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
+ bool IsSigned, bool NoWrap) {
+ if (NoWrap) return false;
+
+ unsigned BitWidth = getTypeSizeInBits(RHS->getType());
+ const SCEV *One = getConstant(Stride->getType(), 1);
+
+ if (IsSigned) {
+ APInt MinRHS = getSignedRange(RHS).getSignedMin();
+ APInt MinValue = APInt::getSignedMinValue(BitWidth);
+ APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
+ .getSignedMax();
+
+ // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
+ return (MinValue + MaxStrideMinusOne).sgt(MinRHS);
}
- return getUDivExpr(Add, Step);
+ APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
+ APInt MinValue = APInt::getMinValue(BitWidth);
+ APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
+ .getUnsignedMax();
+
+ // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
+ return (MinValue + MaxStrideMinusOne).ugt(MinRHS);
+}
+
+// Compute the backedge taken count from the interval difference (Delta), the
+// stride, and whether the comparison includes equality.
+const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
+ bool Equality) {
+ const SCEV *One = getConstant(Step->getType(), 1);
+ Delta = Equality ? getAddExpr(Delta, Step)
+ : getAddExpr(Delta, getMinusSCEV(Step, One));
+ return getUDivExpr(Delta, Step);
}
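
A plain-integer sketch of the rounding performed by computeBECount (the real function builds SCEV expressions and an unsigned divide; the numbers are purely illustrative):

#include <cassert>
#include <cstdint>

// Mirrors computeBECount on ordinary integers: round up for a strict
// comparison, add one full step when the comparison includes equality.
static uint64_t becountSketch(uint64_t Delta, uint64_t Step, bool Equality) {
  return (Equality ? Delta + Step : Delta + (Step - 1)) / Step;
}

int main() {
  assert(becountSketch(10, 4, false) == 3);  // ceil(10 / 4)
  assert(becountSketch(10, 4, true)  == 3);  // floor(10 / 4) + 1
  assert(becountSketch(8,  4, false) == 2);  // exact multiple: ceil(8 / 4)
  assert(becountSketch(8,  4, true)  == 3);  // floor(8 / 4) + 1
  return 0;
}
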
/// HowManyLessThans - Return the number of times a backedge containing the
@@ -6386,119 +6411,144 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
/// a subexpression that cannot overflow before evaluating true.
ScalarEvolution::ExitLimit
ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
- const Loop *L, bool isSigned,
+ const Loop *L, bool IsSigned,
bool IsSubExpr) {
- // Only handle: "ADDREC < LoopInvariant".
- if (!isLoopInvariant(RHS, L)) return getCouldNotCompute();
+ // We handle only IV < Invariant
+ if (!isLoopInvariant(RHS, L))
+ return getCouldNotCompute();
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS);
- if (!AddRec || AddRec->getLoop() != L)
+ const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+
+ // Avoid weird loops
+ if (!IV || IV->getLoop() != L || !IV->isAffine())
return getCouldNotCompute();
- // Check to see if we have a flag which makes analysis easy.
- bool NoWrap = false;
- if (!IsSubExpr) {
- NoWrap = AddRec->getNoWrapFlags(
- (SCEV::NoWrapFlags)(((isSigned ? SCEV::FlagNSW : SCEV::FlagNUW))
- | SCEV::FlagNW));
- }
- if (AddRec->isAffine()) {
- unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
- const SCEV *Step = AddRec->getStepRecurrence(*this);
+ bool NoWrap = !IsSubExpr &&
+ IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
- if (Step->isZero())
- return getCouldNotCompute();
- if (Step->isOne()) {
- // With unit stride, the iteration never steps past the limit value.
- } else if (isKnownPositive(Step)) {
- // Test whether a positive iteration can step past the limit
- // value and past the maximum value for its type in a single step.
- // Note that it's not sufficient to check NoWrap here, because even
- // though the value after a wrap is undefined, it's not undefined
- // behavior, so if wrap does occur, the loop could either terminate or
- // loop infinitely, but in either case, the loop is guaranteed to
- // iterate at least until the iteration where the wrapping occurs.
- const SCEV *One = getConstant(Step->getType(), 1);
- if (isSigned) {
- APInt Max = APInt::getSignedMaxValue(BitWidth);
- if ((Max - getSignedRange(getMinusSCEV(Step, One)).getSignedMax())
- .slt(getSignedRange(RHS).getSignedMax()))
- return getCouldNotCompute();
- } else {
- APInt Max = APInt::getMaxValue(BitWidth);
- if ((Max - getUnsignedRange(getMinusSCEV(Step, One)).getUnsignedMax())
- .ult(getUnsignedRange(RHS).getUnsignedMax()))
- return getCouldNotCompute();
- }
- } else
- // TODO: Handle negative strides here and below.
- return getCouldNotCompute();
+ const SCEV *Stride = IV->getStepRecurrence(*this);
- // We know the LHS is of the form {n,+,s} and the RHS is some loop-invariant
- // m. So, we count the number of iterations in which {n,+,s} < m is true.
- // Note that we cannot simply return max(m-n,0)/s because it's not safe to
- // treat m-n as signed nor unsigned due to overflow possibility.
-
- // First, we get the value of the LHS in the first iteration: n
- const SCEV *Start = AddRec->getOperand(0);
-
- // Determine the minimum constant start value.
- const SCEV *MinStart = getConstant(isSigned ?
- getSignedRange(Start).getSignedMin() :
- getUnsignedRange(Start).getUnsignedMin());
-
- // If we know that the condition is true in order to enter the loop,
- // then we know that it will run exactly (m-n)/s times. Otherwise, we
- // only know that it will execute (max(m,n)-n)/s times. In both cases,
- // the division must round up.
- const SCEV *End = RHS;
- if (!isLoopEntryGuardedByCond(L,
- isSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT,
- getMinusSCEV(Start, Step), RHS))
- End = isSigned ? getSMaxExpr(RHS, Start)
- : getUMaxExpr(RHS, Start);
-
- // Determine the maximum constant end value.
- const SCEV *MaxEnd = getConstant(isSigned ?
- getSignedRange(End).getSignedMax() :
- getUnsignedRange(End).getUnsignedMax());
-
- // If MaxEnd is within a step of the maximum integer value in its type,
- // adjust it down to the minimum value which would produce the same effect.
- // This allows the subsequent ceiling division of (N+(step-1))/step to
- // compute the correct value.
- const SCEV *StepMinusOne = getMinusSCEV(Step,
- getConstant(Step->getType(), 1));
- MaxEnd = isSigned ?
- getSMinExpr(MaxEnd,
- getMinusSCEV(getConstant(APInt::getSignedMaxValue(BitWidth)),
- StepMinusOne)) :
- getUMinExpr(MaxEnd,
- getMinusSCEV(getConstant(APInt::getMaxValue(BitWidth)),
- StepMinusOne));
-
- // Finally, we subtract these two values and divide, rounding up, to get
- // the number of times the backedge is executed.
- const SCEV *BECount = getBECount(Start, End, Step, NoWrap);
-
- // The maximum backedge count is similar, except using the minimum start
- // value and the maximum end value.
- // If we already have an exact constant BECount, use it instead.
- const SCEV *MaxBECount = isa<SCEVConstant>(BECount) ? BECount
- : getBECount(MinStart, MaxEnd, Step, NoWrap);
-
- // If the stride is nonconstant, and NoWrap == true, then
- // getBECount(MinStart, MaxEnd) may not compute. This would result in an
- // exact BECount and invalid MaxBECount, which should be avoided to catch
- // more optimization opportunities.
- if (isa<SCEVCouldNotCompute>(MaxBECount))
- MaxBECount = BECount;
-
- return ExitLimit(BECount, MaxBECount);
- }
+ // Avoid negative or zero stride values
+ if (!isKnownPositive(Stride))
+ return getCouldNotCompute();
- return getCouldNotCompute();
+  // Avoid proven overflow cases: this will ensure that the backedge taken
+  // count will not generate any unsigned overflow. Relaxed no-overflow
+  // conditions exploit NoWrapFlags, allowing optimization in the presence of
+  // undefined behavior, as in C.
+ if (!Stride->isOne() && doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
+ return getCouldNotCompute();
+
+ ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT
+ : ICmpInst::ICMP_ULT;
+ const SCEV *Start = IV->getStart();
+ const SCEV *End = RHS;
+ if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS))
+ End = IsSigned ? getSMaxExpr(RHS, Start)
+ : getUMaxExpr(RHS, Start);
+
+ const SCEV *BECount = computeBECount(getMinusSCEV(End, Start), Stride, false);
+
+ APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin()
+ : getUnsignedRange(Start).getUnsignedMin();
+
+ APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
+ : getUnsignedRange(Stride).getUnsignedMin();
+
+ unsigned BitWidth = getTypeSizeInBits(LHS->getType());
+ APInt Limit = IsSigned ? APInt::getSignedMaxValue(BitWidth) - (MinStride - 1)
+ : APInt::getMaxValue(BitWidth) - (MinStride - 1);
+
+  // Although End can be a MAX expression, we estimate MaxEnd considering only
+ // the case End = RHS. This is safe because in the other case (End - Start)
+ // is zero, leading to a zero maximum backedge taken count.
+ APInt MaxEnd =
+ IsSigned ? APIntOps::smin(getSignedRange(RHS).getSignedMax(), Limit)
+ : APIntOps::umin(getUnsignedRange(RHS).getUnsignedMax(), Limit);
+
+ const SCEV *MaxBECount = getCouldNotCompute();
+ if (isa<SCEVConstant>(BECount))
+ MaxBECount = BECount;
+ else
+ MaxBECount = computeBECount(getConstant(MaxEnd - MinStart),
+ getConstant(MinStride), false);
+
+ if (isa<SCEVCouldNotCompute>(MaxBECount))
+ MaxBECount = BECount;
+
+ return ExitLimit(BECount, MaxBECount);
+}
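
To ground the Start/End/Stride roles above, a small loop this path applies to. The comments describe the expected analysis; the exact SCEV spelling, and whether the compare ends up using the pre- or post-increment value after loop rotation, are assumptions rather than reproduced -analyze output:

// Ordinary C++; only the loop shape matters for the analysis.
void zero_every_fourth(long *A, long N) {
  // The controlling IV is the affine AddRec {0,+,4} and the exit test is a
  // signed less-than against the loop-invariant N, so HowManyLessThans sees
  // Start = 0, Stride = 4, RHS = N, IsSigned = true.
  for (long I = 0; I < N; I += 4)
    A[I] = 0;
  // Unless loop entry is proven guarded by (Start - Stride) < N, End becomes
  // smax(N, 0), so the exact count is computeBECount(smax(N, 0) - 0, 4, false),
  // i.e. (smax(N, 0) + 3) /u 4. The max count instead uses the constant range
  // of N, clamped to SignedMax - 3, divided by the minimum stride 4.
}
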
+
+ScalarEvolution::ExitLimit
+ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
+ const Loop *L, bool IsSigned,
+ bool IsSubExpr) {
+ // We handle only IV > Invariant
+ if (!isLoopInvariant(RHS, L))
+ return getCouldNotCompute();
+
+ const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+
+ // Avoid weird loops
+ if (!IV || IV->getLoop() != L || !IV->isAffine())
+ return getCouldNotCompute();
+
+ bool NoWrap = !IsSubExpr &&
+ IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
+
+ const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this));
+
+ // Avoid negative or zero stride values
+ if (!isKnownPositive(Stride))
+ return getCouldNotCompute();
+
+  // Avoid proven overflow cases: this will ensure that the backedge taken
+  // count will not generate any unsigned overflow. Relaxed no-overflow
+  // conditions exploit NoWrapFlags, allowing optimization in the presence of
+  // undefined behavior, as in C.
+ if (!Stride->isOne() && doesIVOverflowOnGT(RHS, Stride, IsSigned, NoWrap))
+ return getCouldNotCompute();
+
+ ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SGT
+ : ICmpInst::ICMP_UGT;
+
+ const SCEV *Start = IV->getStart();
+ const SCEV *End = RHS;
+ if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS))
+ End = IsSigned ? getSMinExpr(RHS, Start)
+ : getUMinExpr(RHS, Start);
+
+ const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
+
+ APInt MaxStart = IsSigned ? getSignedRange(Start).getSignedMax()
+ : getUnsignedRange(Start).getUnsignedMax();
+
+ APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
+ : getUnsignedRange(Stride).getUnsignedMin();
+
+ unsigned BitWidth = getTypeSizeInBits(LHS->getType());
+ APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1)
+ : APInt::getMinValue(BitWidth) + (MinStride - 1);
+
+  // Although End can be a MIN expression, we estimate MinEnd considering only
+ // the case End = RHS. This is safe because in the other case (Start - End)
+ // is zero, leading to a zero maximum backedge taken count.
+ APInt MinEnd =
+ IsSigned ? APIntOps::smax(getSignedRange(RHS).getSignedMin(), Limit)
+ : APIntOps::umax(getUnsignedRange(RHS).getUnsignedMin(), Limit);
+
+
+ const SCEV *MaxBECount = getCouldNotCompute();
+ if (isa<SCEVConstant>(BECount))
+ MaxBECount = BECount;
+ else
+ MaxBECount = computeBECount(getConstant(MaxStart - MinEnd),
+ getConstant(MinStride), false);
+
+ if (isa<SCEVCouldNotCompute>(MaxBECount))
+ MaxBECount = BECount;
+
+ return ExitLimit(BECount, MaxBECount);
}
/// getNumIterationsInRange - Return the number of iterations of this loop that
@@ -6627,7 +6677,534 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
return SE.getCouldNotCompute();
}
+static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
+ APInt A = C1->getValue()->getValue().abs();
+ APInt B = C2->getValue()->getValue().abs();
+ uint32_t ABW = A.getBitWidth();
+ uint32_t BBW = B.getBitWidth();
+
+ if (ABW > BBW)
+ B = B.zext(ABW);
+ else if (ABW < BBW)
+ A = A.zext(BBW);
+
+ return APIntOps::GreatestCommonDivisor(A, B);
+}
+
+static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
+ APInt A = C1->getValue()->getValue();
+ APInt B = C2->getValue()->getValue();
+ uint32_t ABW = A.getBitWidth();
+ uint32_t BBW = B.getBitWidth();
+
+ if (ABW > BBW)
+ B = B.sext(ABW);
+ else if (ABW < BBW)
+ A = A.sext(BBW);
+
+ return APIntOps::srem(A, B);
+}
+
+static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
+ APInt A = C1->getValue()->getValue();
+ APInt B = C2->getValue()->getValue();
+ uint32_t ABW = A.getBitWidth();
+ uint32_t BBW = B.getBitWidth();
+
+ if (ABW > BBW)
+ B = B.sext(ABW);
+ else if (ABW < BBW)
+ A = A.sext(BBW);
+
+ return APIntOps::sdiv(A, B);
+}
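
The three helpers above share one idiom: bring both constants to a common bit width before the APInt operation. A minimal standalone sketch of that idiom on raw APInts, with illustrative widths and values:

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  using namespace llvm;
  APInt A(32, 12);  // e.g. a 32-bit step constant
  APInt B(64, 18);  // e.g. a 64-bit start constant

  // Same width normalization as gcd()/srem()/sdiv() above: extend the
  // narrower operand before operating.
  if (A.getBitWidth() < B.getBitWidth())
    A = A.zext(B.getBitWidth());
  else if (B.getBitWidth() < A.getBitWidth())
    B = B.zext(A.getBitWidth());

  assert(APIntOps::GreatestCommonDivisor(A, B) == 6 && "gcd(12, 18) == 6");
  return 0;
}
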
+
+namespace {
+struct SCEVGCD : public SCEVVisitor<SCEVGCD, const SCEV *> {
+public:
+ // Pattern match Step into Start. When Step is a multiply expression, find
+ // the largest subexpression of Step that appears in Start. When Start is an
+  // add expression, try to match Step in the subexpressions of Start;
+  // non-matching subexpressions are returned under Remainder.
+ static const SCEV *findGCD(ScalarEvolution &SE, const SCEV *Start,
+ const SCEV *Step, const SCEV **Remainder) {
+ assert(Remainder && "Remainder should not be NULL");
+ SCEVGCD R(SE, Step, SE.getConstant(Step->getType(), 0));
+ const SCEV *Res = R.visit(Start);
+ *Remainder = R.Remainder;
+ return Res;
+ }
+
+ SCEVGCD(ScalarEvolution &S, const SCEV *G, const SCEV *R)
+ : SE(S), GCD(G), Remainder(R) {
+ Zero = SE.getConstant(GCD->getType(), 0);
+ One = SE.getConstant(GCD->getType(), 1);
+ }
+
+ const SCEV *visitConstant(const SCEVConstant *Constant) {
+ if (GCD == Constant || Constant == Zero)
+ return GCD;
+
+ if (const SCEVConstant *CGCD = dyn_cast<SCEVConstant>(GCD)) {
+ const SCEV *Res = SE.getConstant(gcd(Constant, CGCD));
+ if (Res != One)
+ return Res;
+
+ Remainder = SE.getConstant(srem(Constant, CGCD));
+ Constant = cast<SCEVConstant>(SE.getMinusSCEV(Constant, Remainder));
+ Res = SE.getConstant(gcd(Constant, CGCD));
+ return Res;
+ }
+
+ // When GCD is not a constant, it could be that the GCD is an Add, Mul,
+ // AddRec, etc., in which case we want to find out how many times the
+ // Constant divides the GCD: we then return that as the new GCD.
+ const SCEV *Rem = Zero;
+ const SCEV *Res = findGCD(SE, GCD, Constant, &Rem);
+
+ if (Res == One || Rem != Zero) {
+ Remainder = Constant;
+ return One;
+ }
+
+ assert(isa<SCEVConstant>(Res) && "Res should be a constant");
+ Remainder = SE.getConstant(srem(Constant, cast<SCEVConstant>(Res)));
+ return Res;
+ }
+
+ const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+ if (GCD == Expr)
+ return GCD;
+
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+ const SCEV *Rem = Zero;
+ const SCEV *Res = findGCD(SE, Expr->getOperand(e - 1 - i), GCD, &Rem);
+
+ // FIXME: There may be ambiguous situations: for instance,
+ // GCD(-4 + (3 * %m), 2 * %m) where 2 divides -4 and %m divides (3 * %m).
+ // The order in which the AddExpr is traversed computes a different GCD
+ // and Remainder.
+ if (Res != One)
+ GCD = Res;
+ if (Rem != Zero)
+ Remainder = SE.getAddExpr(Remainder, Rem);
+ }
+
+ return GCD;
+ }
+
+ const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+ if (GCD == Expr)
+ return GCD;
+
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+ if (Expr->getOperand(i) == GCD)
+ return GCD;
+ }
+
+ // If we have not returned yet, it means that GCD is not part of Expr.
+ const SCEV *PartialGCD = One;
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+ const SCEV *Rem = Zero;
+ const SCEV *Res = findGCD(SE, Expr->getOperand(i), GCD, &Rem);
+ if (Rem != Zero)
+ // GCD does not divide Expr->getOperand(i).
+ continue;
+
+ if (Res == GCD)
+ return GCD;
+ PartialGCD = SE.getMulExpr(PartialGCD, Res);
+ if (PartialGCD == GCD)
+ return GCD;
+ }
+
+ if (PartialGCD != One)
+ return PartialGCD;
+
+ Remainder = Expr;
+ const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(GCD);
+ if (!Mul)
+ return PartialGCD;
+
+ // When the GCD is a multiply expression, try to decompose it:
+ // this occurs when Step does not divide the Start expression
+ // as in: {(-4 + (3 * %m)),+,(2 * %m)}
+ for (int i = 0, e = Mul->getNumOperands(); i < e; ++i) {
+ const SCEV *Rem = Zero;
+ const SCEV *Res = findGCD(SE, Expr, Mul->getOperand(i), &Rem);
+ if (Rem == Zero) {
+ Remainder = Rem;
+ return Res;
+ }
+ }
+
+ return PartialGCD;
+ }
+
+ const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ if (GCD == Expr)
+ return GCD;
+
+ if (!Expr->isAffine()) {
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *Rem = Zero;
+ const SCEV *Res = findGCD(SE, Expr->getOperand(0), GCD, &Rem);
+ if (Rem != Zero)
+ Remainder = SE.getAddExpr(Remainder, Rem);
+
+ Rem = Zero;
+ Res = findGCD(SE, Expr->getOperand(1), Res, &Rem);
+ if (Rem != Zero) {
+ Remainder = Expr;
+ return GCD;
+ }
+
+ return Res;
+ }
+
+ const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+ if (GCD != Expr)
+ Remainder = Expr;
+ return GCD;
+ }
+
+ const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+ return One;
+ }
+
+private:
+ ScalarEvolution &SE;
+ const SCEV *GCD, *Remainder, *Zero, *One;
+};
+
+struct SCEVDivision : public SCEVVisitor<SCEVDivision, const SCEV *> {
+public:
+ // Remove from Start all multiples of Step.
+ static const SCEV *divide(ScalarEvolution &SE, const SCEV *Start,
+ const SCEV *Step) {
+ SCEVDivision D(SE, Step);
+ const SCEV *Rem = D.Zero;
+ (void)Rem;
+ // The division is guaranteed to succeed: Step should divide Start with no
+ // remainder.
+ assert(Step == SCEVGCD::findGCD(SE, Start, Step, &Rem) && Rem == D.Zero &&
+ "Step should divide Start with no remainder.");
+ return D.visit(Start);
+ }
+
+ SCEVDivision(ScalarEvolution &S, const SCEV *G) : SE(S), GCD(G) {
+ Zero = SE.getConstant(GCD->getType(), 0);
+ One = SE.getConstant(GCD->getType(), 1);
+ }
+
+ const SCEV *visitConstant(const SCEVConstant *Constant) {
+ if (GCD == Constant)
+ return One;
+
+ if (const SCEVConstant *CGCD = dyn_cast<SCEVConstant>(GCD))
+ return SE.getConstant(sdiv(Constant, CGCD));
+ return Constant;
+ }
+
+ const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+
+ SmallVector<const SCEV *, 2> Operands;
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+ Operands.push_back(divide(SE, Expr->getOperand(i), GCD));
+
+ if (Operands.size() == 1)
+ return Operands[0];
+ return SE.getAddExpr(Operands);
+ }
+
+ const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+
+ bool FoundGCDTerm = false;
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+ if (Expr->getOperand(i) == GCD)
+ FoundGCDTerm = true;
+
+ SmallVector<const SCEV *, 2> Operands;
+ if (FoundGCDTerm) {
+ FoundGCDTerm = false;
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+ if (FoundGCDTerm)
+ Operands.push_back(Expr->getOperand(i));
+ else if (Expr->getOperand(i) == GCD)
+ FoundGCDTerm = true;
+ else
+ Operands.push_back(Expr->getOperand(i));
+ }
+ } else {
+ FoundGCDTerm = false;
+ const SCEV *PartialGCD = One;
+ for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+ if (PartialGCD == GCD) {
+ Operands.push_back(Expr->getOperand(i));
+ continue;
+ }
+
+ const SCEV *Rem = Zero;
+ const SCEV *Res = SCEVGCD::findGCD(SE, Expr->getOperand(i), GCD, &Rem);
+ if (Rem == Zero) {
+ PartialGCD = SE.getMulExpr(PartialGCD, Res);
+ Operands.push_back(divide(SE, Expr->getOperand(i), GCD));
+ } else {
+ Operands.push_back(Expr->getOperand(i));
+ }
+ }
+ }
+
+ if (Operands.size() == 1)
+ return Operands[0];
+ return SE.getMulExpr(Operands);
+ }
+
+ const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+
+ assert(Expr->isAffine() && "Expr should be affine");
+
+ const SCEV *Start = divide(SE, Expr->getStart(), GCD);
+ const SCEV *Step = divide(SE, Expr->getStepRecurrence(SE), GCD);
+
+ return SE.getAddRecExpr(Start, Step, Expr->getLoop(),
+ Expr->getNoWrapFlags());
+ }
+
+ const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+ if (GCD == Expr)
+ return One;
+ return Expr;
+ }
+
+ const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+ return Expr;
+ }
+
+private:
+ ScalarEvolution &SE;
+ const SCEV *GCD, *Zero, *One;
+};
+}
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access. Returns the remainder of the delinearization,
+/// which is the start offset of the array. The SCEV->delinearize algorithm
+/// computes the multiples of the SCEV coefficients: it pattern-matches
+/// subexpressions in the stride and base of a SCEV, which corresponds to
+/// computing a GCD (greatest common divisor) of base and stride. When
+/// SCEV->delinearize fails, it returns the SCEV unchanged.
+///
+/// For example: when analyzing the memory access A[i][j][k] in this loop nest
+///
+/// void foo(long n, long m, long o, double A[n][m][o]) {
+///
+/// for (long i = 0; i < n; i++)
+/// for (long j = 0; j < m; j++)
+/// for (long k = 0; k < o; k++)
+/// A[i][j][k] = 1.0;
+/// }
+///
+/// the delinearization input is the following AddRec SCEV:
+///
+/// AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+///
+/// From this SCEV, we are able to say that the base offset of the access is %A
+/// because it appears as an offset that does not divide any of the strides in
+/// the loops:
+///
+/// CHECK: Base offset: %A
+///
+/// and then SCEV->delinearize determines the size of some of the dimensions of
+/// the array as these are the multiples by which the strides are happening:
+///
+/// CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
+///
+/// Note that the outermost dimension remains of UnknownSize because there are
+/// no strides that would help identify the size of the last dimension: when
+/// the array has been statically allocated, one could compute the size of that
+/// dimension by dividing the overall size of the array by the size of the known
+/// dimensions: %m * %o * 8.
+///
+/// Finally, delinearize provides the access functions for the array reference
+/// that corresponds to A[i][j][k] in the above C testcase:
+///
+/// CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// The testcases are checking the output of a function pass:
+/// DelinearizationPass that walks through all loads and stores of a function
+/// asking for the SCEV of the memory access with respect to all enclosing
+/// loops, calling SCEV->delinearize on that and printing the results.
+const SCEV *
+SCEVAddRecExpr::delinearize(ScalarEvolution &SE,
+ SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<const SCEV *> &Sizes) const {
+ // Early exit in case this SCEV is not an affine multivariate function.
+ if (!this->isAffine())
+ return this;
+
+ const SCEV *Start = this->getStart();
+ const SCEV *Step = this->getStepRecurrence(SE);
+
+  // Build the SCEV representation of the canonical induction variable in the
+ // loop of this SCEV.
+ const SCEV *Zero = SE.getConstant(this->getType(), 0);
+ const SCEV *One = SE.getConstant(this->getType(), 1);
+ const SCEV *IV =
+ SE.getAddRecExpr(Zero, One, this->getLoop(), this->getNoWrapFlags());
+
+ DEBUG(dbgs() << "(delinearize: " << *this << "\n");
+
+ // Currently we fail to delinearize when the stride of this SCEV is 1. We
+ // could decide to not fail in this case: we could just return 1 for the size
+ // of the subscript, and this same SCEV for the access function.
+ if (Step == One) {
+ DEBUG(dbgs() << "failed to delinearize " << *this << "\n)\n");
+ return this;
+ }
+
+ // Find the GCD and Remainder of the Start and Step coefficients of this SCEV.
+ const SCEV *Remainder = NULL;
+ const SCEV *GCD = SCEVGCD::findGCD(SE, Start, Step, &Remainder);
+
+ DEBUG(dbgs() << "GCD: " << *GCD << "\n");
+ DEBUG(dbgs() << "Remainder: " << *Remainder << "\n");
+
+ // Same remark as above: we currently fail the delinearization, although we
+ // can very well handle this special case.
+ if (GCD == One) {
+ DEBUG(dbgs() << "failed to delinearize " << *this << "\n)\n");
+ return this;
+ }
+
+ // As findGCD computed Remainder, GCD divides "Start - Remainder." The
+ // Quotient is then this SCEV without Remainder, scaled down by the GCD. The
+ // Quotient is what will be used in the next subscript delinearization.
+ const SCEV *Quotient =
+ SCEVDivision::divide(SE, SE.getMinusSCEV(Start, Remainder), GCD);
+ DEBUG(dbgs() << "Quotient: " << *Quotient << "\n");
+
+ const SCEV *Rem;
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Quotient))
+ // Recursively call delinearize on the Quotient until there are no more
+ // multiples that can be recognized.
+ Rem = AR->delinearize(SE, Subscripts, Sizes);
+ else
+ Rem = Quotient;
+
+  // Scale up the canonical induction variable IV by whatever remains from the
+  // Step after division by the GCD: the GCD is the size of all the sub-arrays.
+ if (Step != GCD) {
+ Step = SCEVDivision::divide(SE, Step, GCD);
+ IV = SE.getMulExpr(IV, Step);
+ }
+  // The access function in the current subscript is computed as the canonical
+ // induction variable IV (potentially scaled up by the step) and offset by
+ // Rem, the offset of delinearization in the sub-array.
+ const SCEV *Index = SE.getAddExpr(IV, Rem);
+
+ // Record the access function and the size of the current subscript.
+ Subscripts.push_back(Index);
+ Sizes.push_back(GCD);
+
+#ifndef NDEBUG
+ int Size = Sizes.size();
+ DEBUG(dbgs() << "succeeded to delinearize " << *this << "\n");
+ DEBUG(dbgs() << "ArrayDecl[UnknownSize]");
+ for (int i = 0; i < Size - 1; i++)
+ DEBUG(dbgs() << "[" << *Sizes[i] << "]");
+ DEBUG(dbgs() << " with elements of " << *Sizes[Size - 1] << " bytes.\n");
+
+ DEBUG(dbgs() << "ArrayRef");
+ for (int i = 0; i < Size; i++)
+ DEBUG(dbgs() << "[" << *Subscripts[i] << "]");
+ DEBUG(dbgs() << "\n)\n");
+#endif
+
+ return Remainder;
+}
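
A hedged usage sketch of the new delinearize entry point; obtaining the ScalarEvolution analysis and the access SCEV is assumed to happen elsewhere, and the helper name is illustrative. It mirrors what the DelinearizationPass mentioned in the comment does for each load and store:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void printDelinearization(ScalarEvolution &SE, const SCEV *AccessFn) {
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn);
  if (!AR)
    return;

  SmallVector<const SCEV *, 4> Subscripts, Sizes;
  const SCEV *BaseOffset = AR->delinearize(SE, Subscripts, Sizes);

  // On success, BaseOffset is the remaining start offset of the array and the
  // two vectors are filled pairwise; on failure the SCEV is returned unchanged.
  errs() << "Base offset: " << *BaseOffset << "\n";
  for (unsigned i = 0, e = Subscripts.size(); i != e; ++i)
    errs() << "  subscript " << i << ": " << *Subscripts[i] << "  (size "
           << *Sizes[i] << ")\n";
}
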
//===----------------------------------------------------------------------===//
// SCEVCallbackVH Class Implementation
@@ -6683,7 +7260,7 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
//===----------------------------------------------------------------------===//
ScalarEvolution::ScalarEvolution()
- : FunctionPass(ID), FirstUnknown(0) {
+ : FunctionPass(ID), ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), FirstUnknown(0) {
initializeScalarEvolutionPass(*PassRegistry::getPassRegistry());
}
@@ -6821,14 +7398,21 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const {
ScalarEvolution::LoopDisposition
ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
- std::map<const Loop *, LoopDisposition> &Values = LoopDispositions[S];
- std::pair<std::map<const Loop *, LoopDisposition>::iterator, bool> Pair =
- Values.insert(std::make_pair(L, LoopVariant));
- if (!Pair.second)
- return Pair.first->second;
-
+ SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values = LoopDispositions[S];
+ for (unsigned u = 0; u < Values.size(); u++) {
+ if (Values[u].first == L)
+ return Values[u].second;
+ }
+ Values.push_back(std::make_pair(L, LoopVariant));
LoopDisposition D = computeLoopDisposition(S, L);
- return LoopDispositions[S][L] = D;
+ SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values2 = LoopDispositions[S];
+ for (unsigned u = Values2.size(); u > 0; u--) {
+ if (Values2[u - 1].first == L) {
+ Values2[u - 1].second = D;
+ break;
+ }
+ }
+ return D;
}
ScalarEvolution::LoopDisposition
@@ -6920,14 +7504,21 @@ bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
ScalarEvolution::BlockDisposition
ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
- std::map<const BasicBlock *, BlockDisposition> &Values = BlockDispositions[S];
- std::pair<std::map<const BasicBlock *, BlockDisposition>::iterator, bool>
- Pair = Values.insert(std::make_pair(BB, DoesNotDominateBlock));
- if (!Pair.second)
- return Pair.first->second;
-
+ SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values = BlockDispositions[S];
+ for (unsigned u = 0; u < Values.size(); u++) {
+ if (Values[u].first == BB)
+ return Values[u].second;
+ }
+ Values.push_back(std::make_pair(BB, DoesNotDominateBlock));
BlockDisposition D = computeBlockDisposition(S, BB);
- return BlockDispositions[S][BB] = D;
+ SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values2 = BlockDispositions[S];
+ for (unsigned u = Values2.size(); u > 0; u--) {
+ if (Values2[u - 1].first == BB) {
+ Values2[u - 1].second = D;
+ break;
+ }
+ }
+ return D;
}
ScalarEvolution::BlockDisposition
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index c434b40..86a557b 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -176,8 +177,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
}
// Save the original insertion point so we can restore it when we're done.
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
+ BuilderType::InsertPointGuard Guard(Builder);
// Move the insertion point out of as many loops as we can.
while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -191,13 +192,9 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
// If we haven't found this binop, insert it.
Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
- BO->setDebugLoc(SaveInsertPt->getDebugLoc());
+ BO->setDebugLoc(Loc);
rememberInstruction(BO);
- // Restore the original insert point.
- if (SaveInsertBB)
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
return BO;
}
@@ -406,6 +403,10 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// without the other.
SplitAddRecs(Ops, Ty, SE);
+ Type *IntPtrTy = SE.TD
+ ? SE.TD->getIntPtrType(PTy)
+ : Type::getInt64Ty(PTy->getContext());
+
// Descend down the pointer's type and attempt to convert the other
// operands into GEP indices, at each level. The first index in a GEP
// indexes into the array implied by the pointer operand; the rest of
@@ -416,7 +417,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// array indexing.
SmallVector<const SCEV *, 8> ScaledOps;
if (ElTy->isSized()) {
- const SCEV *ElSize = SE.getSizeOfExpr(ElTy);
+ const SCEV *ElSize = SE.getSizeOfExpr(IntPtrTy, ElTy);
if (!ElSize->isZero()) {
SmallVector<const SCEV *, 8> NewOps;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
@@ -548,8 +549,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
}
// Save the original insertion point so we can restore it when we're done.
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ BuilderType::InsertPointGuard Guard(Builder);
// Move the insertion point out of as many loops as we can.
while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -565,16 +565,11 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
Value *GEP = Builder.CreateGEP(V, Idx, "uglygep");
rememberInstruction(GEP);
- // Restore the original insert point.
- if (SaveInsertBB)
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
return GEP;
}
// Save the original insertion point so we can restore it when we're done.
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ BuilderType::InsertPoint SaveInsertPt = Builder.saveIP();
// Move the insertion point out of as many loops as we can.
while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -610,8 +605,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
rememberInstruction(GEP);
// Restore the original insert point.
- if (SaveInsertBB)
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+ Builder.restoreIP(SaveInsertPt);
return expand(SE.getAddExpr(Ops));
}
@@ -1076,8 +1070,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
}
// Save the original insertion point so we can restore it when we're done.
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ BuilderType::InsertPointGuard Guard(Builder);
// Another AddRec may need to be recursively expanded below. For example, if
// this AddRec is quadratic, the StepV may itself be an AddRec in this
@@ -1144,10 +1137,6 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
PN->addIncoming(IncV, Pred);
}
- // Restore the original insert point.
- if (SaveInsertBB)
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
// After expanding subexpressions, restore the PostIncLoops set so the caller
// can ensure that IVIncrement dominates the current uses.
PostIncLoops = SavedPostIncLoops;
@@ -1232,19 +1221,19 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
!ExpandTy->isPointerTy() && Step->isNonConstantNegative();
if (useSubtract)
Step = SE.getNegativeSCEV(Step);
- // Expand the step somewhere that dominates the loop header.
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
- Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
- // Restore the insertion point to the place where the caller has
- // determined dominates all uses.
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+ Value *StepV;
+ {
+ // Expand the step somewhere that dominates the loop header.
+ BuilderType::InsertPointGuard Guard(Builder);
+ StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
+ }
Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
}
}
// Re-apply any non-loop-dominating scale.
if (PostLoopScale) {
+ assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
Result = InsertNoopCastOfTo(Result, IntTy);
Result = Builder.CreateMul(Result,
expandCodeFor(PostLoopScale, IntTy));
@@ -1289,16 +1278,14 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
S->getNoWrapFlags(SCEV::FlagNW)));
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
BasicBlock::iterator NewInsertPt =
llvm::next(BasicBlock::iterator(cast<Instruction>(V)));
+ BuilderType::InsertPointGuard Guard(Builder);
while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) ||
isa<LandingPadInst>(NewInsertPt))
++NewInsertPt;
V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0,
NewInsertPt);
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
return V;
}
@@ -1342,9 +1329,13 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
Header->begin());
rememberInstruction(CanonicalIV);
+ SmallSet<BasicBlock *, 4> PredSeen;
Constant *One = ConstantInt::get(Ty, 1);
for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
BasicBlock *HP = *HPI;
+ if (!PredSeen.insert(HP))
+ continue;
+
if (L->contains(HP)) {
// Insert a unit add instruction right before the terminator
// corresponding to the back-edge.
@@ -1527,8 +1518,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
if (I != InsertedExpressions.end())
return I->second;
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ BuilderType::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
// Expand the expression into instructions.
@@ -1541,8 +1531,6 @@ Value *SCEVExpander::expand(const SCEV *S) {
// a postinc expansion, it could be reused by a non postinc user, but only if
// its insertion point was already at the head of the loop.
InsertedExpressions[std::make_pair(S, InsertPt)] = V;
-
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
return V;
}
@@ -1553,10 +1541,6 @@ void SCEVExpander::rememberInstruction(Value *I) {
InsertedValues.insert(I);
}
-void SCEVExpander::restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I) {
- Builder.SetInsertPoint(BB, I);
-}
-
/// getOrInsertCanonicalInductionVariable - This method returns the
/// canonical induction variable of the specified type for the specified
/// loop (inserting one if there is none). A canonical induction variable
@@ -1572,11 +1556,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
SE.getConstant(Ty, 1), L, SCEV::FlagAnyWrap);
// Emit code for it.
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ BuilderType::InsertPointGuard Guard(Builder);
PHINode *V = cast<PHINode>(expandCodeFor(H, 0, L->getHeader()->begin()));
- if (SaveInsertBB)
- restoreInsertPoint(SaveInsertBB, SaveInsertPt);
return V;
}
@@ -1724,28 +1705,43 @@ namespace {
// Currently, we only allow division by a nonzero constant here. If this is
// inadequate, we could easily allow division by SCEVUnknown by using
// ValueTracking to check isKnownNonZero().
+//
+// We cannot generally expand recurrences unless the step dominates the loop
+// header. The expander handles the special case of affine recurrences by
+// scaling the recurrence outside the loop, but this technique isn't generally
+// applicable. Expanding a nested recurrence outside a loop requires computing
+// binomial coefficients. This could be done, but the recurrence has to be in a
+// perfectly reduced form, which can't be guaranteed.
struct SCEVFindUnsafe {
+ ScalarEvolution &SE;
bool IsUnsafe;
- SCEVFindUnsafe(): IsUnsafe(false) {}
+ SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
bool follow(const SCEV *S) {
- const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S);
- if (!D)
- return true;
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
- if (SC && !SC->getValue()->isZero())
- return true;
- IsUnsafe = true;
- return false;
+ if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+ if (!SC || SC->getValue()->isZero()) {
+ IsUnsafe = true;
+ return false;
+ }
+ }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
+ IsUnsafe = true;
+ return false;
+ }
+ }
+ return true;
}
bool isDone() const { return IsUnsafe; }
};
}
namespace llvm {
-bool isSafeToExpand(const SCEV *S) {
- SCEVFindUnsafe Search;
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
+ SCEVFindUnsafe Search(SE);
visitAll(S, Search);
return !Search.IsUnsafe;
}
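
A hedged sketch of a caller using the widened signature; the helper name, insertion point and expander are assumptions, while isSafeToExpand's new ScalarEvolution parameter and SCEVExpander::expandCodeFor are existing interfaces:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
using namespace llvm;

// Expand S only when the checker above accepts it; skipping the check could
// materialize a division by zero or a recurrence whose step is unavailable at
// the loop header.
static Value *tryExpand(const SCEV *S, Type *Ty, Instruction *InsertPt,
                        ScalarEvolution &SE, SCEVExpander &Expander) {
  if (!isSafeToExpand(S, SE))
    return 0;
  return Expander.expandCodeFor(S, Ty, InsertPt);
}
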
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index dd2ed4f..f110616 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -119,11 +119,19 @@ TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
switch (Kind) {
case NormalizeAutodetect:
- if (IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getMinusSCEV(Result, TransformedStep);
+ // Normalize this SCEV by subtracting the expression for the final step.
+    // We only allow affine AddRecs to be normalized; otherwise we would not
+ // be able to correctly denormalize.
+ // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2}
+ // Normalized form: {-2,+,1,+,2}
+ // Denormalized form: {1,+,3,+,2}
+ //
+    // However, denormalization would use a different step expression than
+ // normalization (see getPostIncExpr), generating the wrong final
+ // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
+ if (AR->isAffine() &&
+ IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
+ Result = SE.getMinusSCEV(Result, AR->getStepRecurrence(SE));
Loops.insert(L);
}
#if 0
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 4ad7162..0353295 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -96,6 +96,11 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return PrevTTI->isLoweredToCall(F);
}
+void TargetTransformInfo::getUnrollingPreferences(Loop *L,
+ UnrollingPreferences &UP) const {
+ PrevTTI->getUnrollingPreferences(L, UP);
+}
+
bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
return PrevTTI->isLegalAddImmediate(Imm);
}
@@ -145,6 +150,10 @@ TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
return PrevTTI->getPopcntSupport(IntTyWidthInBit);
}
+bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
+ return PrevTTI->haveFastSqrt(Ty);
+}
+
unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
return PrevTTI->getIntImmCost(Imm, Ty);
}
@@ -215,6 +224,11 @@ unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
return PrevTTI->getAddressComputationCost(Tp, IsComplex);
}
+unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const {
+ return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
+}
+
namespace {
struct NoTTI : ImmutablePass, TargetTransformInfo {
@@ -265,26 +279,34 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
// Otherwise, the default basic cost is used.
return TCC_Basic;
- case Instruction::IntToPtr:
+ case Instruction::IntToPtr: {
+ if (!DL)
+ return TCC_Basic;
+
// An inttoptr cast is free so long as the input is a legal integer type
// which doesn't contain values outside the range of a pointer.
- if (DL && DL->isLegalInteger(OpTy->getScalarSizeInBits()) &&
- OpTy->getScalarSizeInBits() <= DL->getPointerSizeInBits())
+ unsigned OpSize = OpTy->getScalarSizeInBits();
+ if (DL->isLegalInteger(OpSize) &&
+ OpSize <= DL->getPointerTypeSizeInBits(Ty))
return TCC_Free;
// Otherwise it's not a no-op.
return TCC_Basic;
+ }
+ case Instruction::PtrToInt: {
+ if (!DL)
+ return TCC_Basic;
- case Instruction::PtrToInt:
// A ptrtoint cast is free so long as the result is large enough to store
// the pointer, and a legal integer type.
- if (DL && DL->isLegalInteger(Ty->getScalarSizeInBits()) &&
- Ty->getScalarSizeInBits() >= DL->getPointerSizeInBits())
+ unsigned DestSize = Ty->getScalarSizeInBits();
+ if (DL->isLegalInteger(DestSize) &&
+ DestSize >= DL->getPointerTypeSizeInBits(OpTy))
return TCC_Free;
// Otherwise it's not a no-op.
return TCC_Basic;
-
+ }
case Instruction::Trunc:
// trunc to a native type is free (assuming the target has compare and
// shift-right of the same width).
@@ -457,6 +479,8 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
return true;
}
+ void getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+
bool isLegalAddImmediate(int64_t Imm) const {
return false;
}
@@ -505,6 +529,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
return PSK_Software;
}
+ bool haveFastSqrt(Type *Ty) const {
+ return false;
+ }
+
unsigned getIntImmCost(const APInt &Imm, Type *Ty) const {
return 1;
}
@@ -569,6 +597,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
unsigned getAddressComputationCost(Type *Tp, bool) const {
return 0;
}
+
+ unsigned getReductionCost(unsigned, Type *, bool) const {
+ return 1;
+ }
};
} // end anonymous namespace
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index bbf3c3a..6791d4b 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -16,7 +16,12 @@
// typical C/C++ TBAA, but it can also be used to implement custom alias
// analysis behavior for other languages.
//
-// The current metadata format is very simple. TBAA MDNodes have up to
+// We now support two metadata formats: scalar TBAA and struct-path aware
+// TBAA. After all test cases are upgraded to use struct-path aware TBAA and
+// we can auto-upgrade existing bc files, the support for scalar TBAA
+// can be dropped.
+//
+// The scalar TBAA metadata format is very simple. TBAA MDNodes have up to
// three fields, e.g.:
// !0 = metadata !{ metadata !"an example type tree" }
// !1 = metadata !{ metadata !"int", metadata !0 }
@@ -40,6 +45,65 @@
// should return true; see
// http://llvm.org/docs/AliasAnalysis.html#OtherItfs).
//
+// With struct-path aware TBAA, the MDNodes attached to an instruction using
+// "!tbaa" are called path tag nodes.
+//
+// The path tag node has 4 fields with the last field being optional.
+//
+// The first field is the base type node; it can be a struct type node
+// or a scalar type node. The second field is the access type node; it
+// must be a scalar type node. The third field is the offset into the base type.
+// The last field has the same meaning as the last field of our scalar TBAA:
+// it's an integer which, if equal to 1, indicates that the access is "constant".
+//
+// The struct type node has a name and a list of pairs, one pair for each member
+// of the struct. The first element of each pair is a type node (a struct type
+// node or a scalar type node), specifying the type of the member; the second
+// element of each pair is the offset of the member.
+//
+// Given an example
+// typedef struct {
+// short s;
+// } A;
+// typedef struct {
+// uint16_t s;
+// A a;
+// } B;
+//
+// For an access to B.a.s, we attach !5 (a path tag node) to the load/store
+// instruction. The base type is !4 (struct B), the access type is !2 (scalar
+// type short) and the offset is 4.
+//
+// !0 = metadata !{metadata !"Simple C/C++ TBAA"}
+// !1 = metadata !{metadata !"omnipotent char", metadata !0} // Scalar type node
+// !2 = metadata !{metadata !"short", metadata !1} // Scalar type node
+// !3 = metadata !{metadata !"A", metadata !2, i64 0} // Struct type node
+// !4 = metadata !{metadata !"B", metadata !2, i64 0, metadata !3, i64 4}
+// // Struct type node
+// !5 = metadata !{metadata !4, metadata !2, i64 4} // Path tag node
+//
+// The struct type nodes and the scalar type nodes form a type DAG.
+// Root (!0)
+// char (!1) -- edge to Root
+// short (!2) -- edge to char
+// A (!3) -- edge with offset 0 to short
+// B (!4) -- edge with offset 0 to short and edge with offset 4 to A
+//
+// To check if two tags (tagX and tagY) can alias, we start from the base type
+// of tagX, follow the edge with the correct offset in the type DAG and adjust
+// the offset until we reach the base type of tagY or until we reach the Root
+// node.
+// If we reach the base type of tagY, compare the adjusted offset with the
+// offset of tagY; return Alias if the offsets are the same, NoAlias
+// otherwise.
+// If we reach the Root node, repeat the walk starting from the base type of
+// tagY to see if we reach the base type of tagX.
+//
+// If they have different roots, they're part of different potentially
+// unrelated type systems, so we return Alias to be conservative.
+// If neither node is an ancestor of the other and they have the same root,
+// then we say NoAlias.
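+//
+// For example, with the metadata above, a second tag for a plain short
+// access, say !6 = metadata !{metadata !2, metadata !2, i64 0}, aliases !5:
+// starting at !4 (B) with offset 4, we follow the offset-4 edge to !3 (A)
+// and adjust the offset to 0, then follow the offset-0 edge to !2 (short),
+// which is the base type of !6 with matching offset 0, so we return Alias.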
+//
// TODO: The current metadata format doesn't support struct
// fields. For example:
// struct X {
@@ -71,7 +135,6 @@ using namespace llvm;
// achieved by stripping the !tbaa tags from IR, but this option is sometimes
// more convenient.
static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true));
-static cl::opt<bool> EnableStructPathTBAA("struct-path-tbaa", cl::init(false));
namespace {
/// TBAANode - This is a simple wrapper around an MDNode which provides a
@@ -168,8 +231,12 @@ namespace {
if (Node->getNumOperands() < 2)
return TBAAStructTypeNode();
- // Special handling for a scalar type node.
+ // Fast path for a scalar type node and a struct type node with a single
+ // field.
if (Node->getNumOperands() <= 3) {
+ uint64_t Cur = Node->getNumOperands() == 2 ? 0 :
+ cast<ConstantInt>(Node->getOperand(2))->getZExtValue();
+ Offset -= Cur;
MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
if (!P)
return TBAAStructTypeNode();
@@ -259,12 +326,21 @@ TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AliasAnalysis::getAnalysisUsage(AU);
}
+/// Check the first operand of the tbaa tag node. If it is an MDNode, we treat
+/// it as struct-path aware TBAA format; otherwise, we treat it as scalar TBAA
+/// format.
+static bool isStructPathTBAA(const MDNode *MD) {
+ // Anonymous TBAA root starts with a MDNode and dragonegg uses it as
+  // An anonymous TBAA root starts with an MDNode, and dragonegg uses it as
+  // a TBAA tag.
+}
+
/// Aliases - Test whether the type represented by A may alias the
/// type represented by B.
bool
TypeBasedAliasAnalysis::Aliases(const MDNode *A,
const MDNode *B) const {
- if (EnableStructPathTBAA)
+ if (isStructPathTBAA(A))
return PathAliases(A, B);
// Keep track of the root node for A and B.
@@ -397,8 +473,8 @@ bool TypeBasedAliasAnalysis::pointsToConstantMemory(const Location &Loc,
// If this is an "immutable" type, we can assume the pointer is pointing
// to constant memory.
- if ((!EnableStructPathTBAA && TBAANode(M).TypeIsImmutable()) ||
- (EnableStructPathTBAA && TBAAStructTagNode(M).TypeIsImmutable()))
+ if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) ||
+ (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable()))
return true;
return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
@@ -414,8 +490,8 @@ TypeBasedAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
// If this is an "immutable" type, we can assume the call doesn't write
// to memory.
if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
- if ((!EnableStructPathTBAA && TBAANode(M).TypeIsImmutable()) ||
- (EnableStructPathTBAA && TBAAStructTagNode(M).TypeIsImmutable()))
+ if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) ||
+ (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable()))
Min = OnlyReadsMemory;
return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
@@ -458,6 +534,25 @@ TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
return AliasAnalysis::getModRefInfo(CS1, CS2);
}
+bool MDNode::isTBAAVtableAccess() const {
+ if (!isStructPathTBAA(this)) {
+ if (getNumOperands() < 1) return false;
+ if (MDString *Tag1 = dyn_cast<MDString>(getOperand(0))) {
+ if (Tag1->getString() == "vtable pointer") return true;
+ }
+ return false;
+ }
+
+ // For struct-path aware TBAA, we use the access type of the tag.
+ if (getNumOperands() < 2) return false;
+ MDNode *Tag = cast_or_null<MDNode>(getOperand(1));
+ if (!Tag) return false;
+ if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
+ if (Tag1->getString() == "vtable pointer") return true;
+ }
+ return false;
+}
+
MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
if (!A || !B)
return NULL;
@@ -466,7 +561,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
return A;
// For struct-path aware TBAA, we use the access type of the tag.
- if (EnableStructPathTBAA) {
+ bool StructPath = isStructPathTBAA(A);
+ if (StructPath) {
A = cast_or_null<MDNode>(A->getOperand(1));
if (!A) return 0;
B = cast_or_null<MDNode>(B->getOperand(1));
@@ -499,7 +595,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
--IA;
--IB;
}
- if (!EnableStructPathTBAA)
+ if (!StructPath)
return Ret;
if (!Ret)
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 4591af8..e39ee62 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalAlias.h"
@@ -39,8 +40,8 @@ const unsigned MaxDepth = 6;
static unsigned getBitWidth(Type *Ty, const DataLayout *TD) {
if (unsigned BitWidth = Ty->getScalarSizeInBits())
return BitWidth;
- assert(isa<PointerType>(Ty) && "Expected a pointer type!");
- return TD ? TD->getPointerSizeInBits() : 0;
+
+ return TD ? TD->getPointerTypeSizeInBits(Ty) : 0;
}
static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
@@ -629,9 +630,19 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
Value *Index = I->getOperand(i);
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
// Handle struct member offset arithmetic.
- if (!TD) return;
- const StructLayout *SL = TD->getStructLayout(STy);
+ if (!TD)
+ return;
+
+ // Handle the case when the index is a vector zeroinitializer.
+ Constant *CIndex = cast<Constant>(Index);
+ if (CIndex->isZeroValue())
+ continue;
+
+ if (CIndex->getType()->isVectorTy())
+ Index = CIndex->getSplatValue();
+
unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+ const StructLayout *SL = TD->getStructLayout(STy);
uint64_t Offset = SL->getElementOffset(Idx);
TrailZ = std::min<unsigned>(TrailZ,
countTrailingZeros(Offset));
@@ -749,7 +760,6 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
break;
}
- case Intrinsic::x86_sse42_crc32_64_8:
case Intrinsic::x86_sse42_crc32_64_64:
KnownZero = APInt::getHighBitsSet(64, 32);
break;
@@ -1704,20 +1714,24 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
/// it can be expressed as a base pointer plus a constant offset. Return the
/// base and offset to the caller.
Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
- const DataLayout *TD) {
+ const DataLayout *DL) {
// Without DataLayout, conservatively assume 64-bit offsets, which is
// the widest we support.
- unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+ unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(Ptr->getType()) : 64;
APInt ByteOffset(BitWidth, 0);
while (1) {
if (Ptr->getType()->isVectorTy())
break;
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
- APInt GEPOffset(BitWidth, 0);
- if (TD && !GEP->accumulateConstantOffset(*TD, GEPOffset))
- break;
- ByteOffset += GEPOffset;
+ if (DL) {
+ APInt GEPOffset(BitWidth, 0);
+ if (!GEP->accumulateConstantOffset(*DL, GEPOffset))
+ break;
+
+ ByteOffset += GEPOffset;
+ }
+
Ptr = GEP->getPointerOperand();
} else if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
Ptr = cast<Operator>(Ptr)->getOperand(0);
@@ -2050,7 +2064,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
/// isKnownNonNull - Return true if we know that the specified value is never
/// null.
-bool llvm::isKnownNonNull(const Value *V) {
+bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
// Alloca never returns null, malloc might.
if (isa<AllocaInst>(V)) return true;
@@ -2061,5 +2075,10 @@ bool llvm::isKnownNonNull(const Value *V) {
// Global values are not null unless extern weak.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
return !GV->hasExternalWeakLinkage();
+
+ // operator new never returns null.
+ if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
+ return true;
+
return false;
}
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 48675ac..1e6085b 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -478,12 +478,10 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(private);
KEYWORD(linker_private);
KEYWORD(linker_private_weak);
- KEYWORD(linker_private_weak_def_auto); // FIXME: For backwards compatibility.
KEYWORD(internal);
KEYWORD(available_externally);
KEYWORD(linkonce);
KEYWORD(linkonce_odr);
- KEYWORD(linkonce_odr_auto_hide);
KEYWORD(weak);
KEYWORD(weak_odr);
KEYWORD(appending);
@@ -540,6 +538,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(alignstack);
KEYWORD(inteldialect);
KEYWORD(gc);
+ KEYWORD(prefix);
KEYWORD(ccc);
KEYWORD(fastcc);
@@ -558,6 +557,8 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(intel_ocl_bicc);
KEYWORD(x86_64_sysvcc);
KEYWORD(x86_64_win64cc);
+ KEYWORD(webkit_jscc);
+ KEYWORD(anyregcc);
KEYWORD(cc);
KEYWORD(c);
@@ -583,6 +584,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(noredzone);
KEYWORD(noreturn);
KEYWORD(nounwind);
+ KEYWORD(optnone);
KEYWORD(optsize);
KEYWORD(readnone);
KEYWORD(readonly);
@@ -663,6 +665,7 @@ lltok::Kind LLLexer::LexIdentifier() {
INSTKEYWORD(inttoptr, IntToPtr);
INSTKEYWORD(ptrtoint, PtrToInt);
INSTKEYWORD(bitcast, BitCast);
+ INSTKEYWORD(addrspacecast, AddrSpaceCast);
INSTKEYWORD(select, Select);
INSTKEYWORD(va_arg, VAArg);
INSTKEYWORD(ret, Ret);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 62a07f5..3b903cd 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -19,6 +19,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/ValueSymbolTable.h"
@@ -65,6 +66,9 @@ bool LLParser::ValidateEndOfModule() {
ForwardRefInstMetadata.clear();
}
+ for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+ UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
// Handle any function attribute group forward references.
for (std::map<Value*, std::vector<unsigned> >::iterator
I = ForwardRefAttrGroups.begin(), E = ForwardRefAttrGroups.end();
@@ -178,6 +182,8 @@ bool LLParser::ValidateEndOfModule() {
for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove
+ UpgradeDebugInfo(*M);
+
return false;
}
@@ -242,13 +248,11 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::kw_private: // OptionalLinkage
case lltok::kw_linker_private: // OptionalLinkage
case lltok::kw_linker_private_weak: // OptionalLinkage
- case lltok::kw_linker_private_weak_def_auto: // FIXME: backwards compat.
case lltok::kw_internal: // OptionalLinkage
case lltok::kw_weak: // OptionalLinkage
case lltok::kw_weak_odr: // OptionalLinkage
case lltok::kw_linkonce: // OptionalLinkage
case lltok::kw_linkonce_odr: // OptionalLinkage
- case lltok::kw_linkonce_odr_auto_hide: // OptionalLinkage
case lltok::kw_appending: // OptionalLinkage
case lltok::kw_dllexport: // OptionalLinkage
case lltok::kw_common: // OptionalLinkage
@@ -623,18 +627,14 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
unsigned Visibility) {
assert(Lex.getKind() == lltok::kw_alias);
Lex.Lex();
- unsigned Linkage;
LocTy LinkageLoc = Lex.getLoc();
- if (ParseOptionalLinkage(Linkage))
+ unsigned L;
+ if (ParseOptionalLinkage(L))
return true;
- if (Linkage != GlobalValue::ExternalLinkage &&
- Linkage != GlobalValue::WeakAnyLinkage &&
- Linkage != GlobalValue::WeakODRLinkage &&
- Linkage != GlobalValue::InternalLinkage &&
- Linkage != GlobalValue::PrivateLinkage &&
- Linkage != GlobalValue::LinkerPrivateLinkage &&
- Linkage != GlobalValue::LinkerPrivateWeakLinkage)
+ GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L;
+
+ if (!GlobalAlias::isValidLinkage(Linkage))
return Error(LinkageLoc, "invalid linkage type for alias");
Constant *Aliasee;
@@ -922,6 +922,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break;
case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break;
case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break;
+ case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break;
case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break;
case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break;
case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break;
@@ -1180,6 +1181,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_noredzone:
case lltok::kw_noreturn:
case lltok::kw_nounwind:
+ case lltok::kw_optnone:
case lltok::kw_optsize:
case lltok::kw_returns_twice:
case lltok::kw_sanitize_address:
@@ -1238,6 +1240,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_noredzone:
case lltok::kw_noreturn:
case lltok::kw_nounwind:
+ case lltok::kw_optnone:
case lltok::kw_optsize:
case lltok::kw_returns_twice:
case lltok::kw_sanitize_address:
@@ -1269,7 +1272,6 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
/// ::= 'weak_odr'
/// ::= 'linkonce'
/// ::= 'linkonce_odr'
-/// ::= 'linkonce_odr_auto_hide'
/// ::= 'available_externally'
/// ::= 'appending'
/// ::= 'dllexport'
@@ -1291,10 +1293,6 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
case lltok::kw_weak_odr: Res = GlobalValue::WeakODRLinkage; break;
case lltok::kw_linkonce: Res = GlobalValue::LinkOnceAnyLinkage; break;
case lltok::kw_linkonce_odr: Res = GlobalValue::LinkOnceODRLinkage; break;
- case lltok::kw_linkonce_odr_auto_hide:
- case lltok::kw_linker_private_weak_def_auto: // FIXME: For backwards compat.
- Res = GlobalValue::LinkOnceODRAutoHideLinkage;
- break;
case lltok::kw_available_externally:
Res = GlobalValue::AvailableExternallyLinkage;
break;
@@ -1346,6 +1344,8 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
/// ::= 'spir_kernel'
/// ::= 'x86_64_sysvcc'
/// ::= 'x86_64_win64cc'
+/// ::= 'webkit_jscc'
+/// ::= 'anyregcc'
/// ::= 'cc' UINT
///
bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
@@ -1368,6 +1368,8 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break;
case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break;
+ case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break;
+ case lltok::kw_anyregcc: CC = CallingConv::AnyReg; break;
case lltok::kw_cc: {
unsigned ArbitraryCC;
Lex.Lex();
@@ -1424,6 +1426,9 @@ bool LLParser::ParseInstructionMetadata(Instruction *Inst,
}
}
+ if (MDK == LLVMContext::MD_tbaa)
+ InstsWithTBAATag.push_back(Inst);
+
// If this is the end of the list, we're done.
} while (EatIfPresent(lltok::comma));
return false;
@@ -2383,7 +2388,6 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
Lex.Lex();
ValID Fn, Label;
- LocTy FnLoc, LabelLoc;
if (ParseToken(lltok::lparen, "expected '(' in block address expression") ||
ParseValID(Fn) ||
@@ -2413,6 +2417,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
case lltok::kw_fptrunc:
case lltok::kw_fpext:
case lltok::kw_bitcast:
+ case lltok::kw_addrspacecast:
case lltok::kw_uitofp:
case lltok::kw_sitofp:
case lltok::kw_fptoui:
@@ -2919,7 +2924,7 @@ bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
/// FunctionHeader
/// ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs
/// OptUnnamedAddr Type GlobalName '(' ArgList ')' OptFuncAttrs OptSection
-/// OptionalAlign OptGC
+/// OptionalAlign OptGC OptionalPrefix
bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
// Parse the linkage.
LocTy LinkageLoc = Lex.getLoc();
@@ -2953,7 +2958,6 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
case GlobalValue::AvailableExternallyLinkage:
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
- case GlobalValue::LinkOnceODRAutoHideLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::DLLExportLinkage:
@@ -2998,6 +3002,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
std::string GC;
bool UnnamedAddr;
LocTy UnnamedAddrLoc;
+ Constant *Prefix = 0;
if (ParseArgumentList(ArgList, isVarArg) ||
ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
@@ -3008,7 +3013,9 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
ParseStringConstant(Section)) ||
ParseOptionalAlignment(Alignment) ||
(EatIfPresent(lltok::kw_gc) &&
- ParseStringConstant(GC)))
+ ParseStringConstant(GC)) ||
+ (EatIfPresent(lltok::kw_prefix) &&
+ ParseGlobalTypeAndValue(Prefix)))
return true;
if (FuncAttrs.contains(Attribute::Builtin))
@@ -3106,6 +3113,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
Fn->setAlignment(Alignment);
Fn->setSection(Section);
if (!GC.empty()) Fn->setGC(GC.c_str());
+ Fn->setPrefixData(Prefix);
ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
// Add all of the arguments we parsed to the function.
@@ -3171,7 +3179,6 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
// Parse the instructions in this block until we get a terminator.
Instruction *Inst;
- SmallVector<std::pair<unsigned, MDNode *>, 4> MetadataOnInst;
do {
// This instruction may have three possibilities for a name: a) none
// specified, b) name specified "%foo =", c) number specified: "%4 =".
@@ -3299,6 +3306,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_fptrunc:
case lltok::kw_fpext:
case lltok::kw_bitcast:
+ case lltok::kw_addrspacecast:
case lltok::kw_uitofp:
case lltok::kw_sitofp:
case lltok::kw_fptoui:
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 594281e..ded776c 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -107,6 +107,8 @@ namespace llvm {
};
DenseMap<Instruction*, std::vector<MDRef> > ForwardRefInstMetadata;
+ SmallVector<Instruction*, 64> InstsWithTBAATag;
+
// Type resolution handling data structures. The location is set when we
// have processed a use of the type but not a definition yet.
StringMap<std::pair<Type*, LocTy> > NamedTypes;
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 9cf4c2c..786d84d 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -38,9 +38,8 @@ namespace lltok {
kw_global, kw_constant,
kw_private, kw_linker_private, kw_linker_private_weak,
- kw_linker_private_weak_def_auto, // FIXME: For backwards compatibility.
kw_internal,
- kw_linkonce, kw_linkonce_odr, kw_linkonce_odr_auto_hide,
+ kw_linkonce, kw_linkonce_odr,
kw_weak, kw_weak_odr, kw_appending,
kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
kw_default, kw_hidden, kw_protected,
@@ -81,6 +80,7 @@ namespace lltok {
kw_alignstack,
kw_inteldialect,
kw_gc,
+ kw_prefix,
kw_c,
kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
@@ -91,6 +91,7 @@ namespace lltok {
kw_ptx_kernel, kw_ptx_device,
kw_spir_kernel, kw_spir_func,
kw_x86_64_sysvcc, kw_x86_64_win64cc,
+ kw_webkit_jscc, kw_anyregcc,
// Attributes:
kw_attributes,
@@ -114,6 +115,7 @@ namespace lltok {
kw_noredzone,
kw_noreturn,
kw_nounwind,
+ kw_optnone,
kw_optsize,
kw_readnone,
kw_readonly,
@@ -148,6 +150,7 @@ namespace lltok {
kw_phi, kw_call,
kw_trunc, kw_zext, kw_sext, kw_fptrunc, kw_fpext, kw_uitofp, kw_sitofp,
kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
+ kw_addrspacecast,
kw_select, kw_va_arg,
kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index e6d7b50..ce3b7d1 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Operator.h"
@@ -89,7 +90,6 @@ static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
case 12: return GlobalValue::AvailableExternallyLinkage;
case 13: return GlobalValue::LinkerPrivateLinkage;
case 14: return GlobalValue::LinkerPrivateWeakLinkage;
- case 15: return GlobalValue::LinkOnceODRAutoHideLinkage;
}
}
@@ -128,6 +128,7 @@ static int GetDecodedCastOpcode(unsigned Val) {
case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
case bitc::CAST_BITCAST : return Instruction::BitCast;
+ case bitc::CAST_ADDRSPACECAST: return Instruction::AddrSpaceCast;
}
}
static int GetDecodedBinaryOpcode(unsigned Val, Type *Ty) {
@@ -450,12 +451,12 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
(EncodedAttrs & 0xffff));
}
-bool BitcodeReader::ParseAttributeBlock() {
+error_code BitcodeReader::ParseAttributeBlock() {
if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
if (!MAttributes.empty())
- return Error("Multiple PARAMATTR blocks found!");
+ return Error(InvalidMultipleBlocks);
SmallVector<uint64_t, 64> Record;
@@ -468,9 +469,9 @@ bool BitcodeReader::ParseAttributeBlock() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("Error at end of PARAMATTR block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -484,7 +485,7 @@ bool BitcodeReader::ParseAttributeBlock() {
case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...]
// FIXME: Remove in 4.0.
if (Record.size() & 1)
- return Error("Invalid ENTRY record");
+ return Error(InvalidRecord);
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
AttrBuilder B;
@@ -508,131 +509,102 @@ bool BitcodeReader::ParseAttributeBlock() {
}
}
-bool BitcodeReader::ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind) {
+// Returns Attribute::None on unrecognized codes.
+static Attribute::AttrKind GetAttrFromCode(uint64_t Code) {
switch (Code) {
+ default:
+ return Attribute::None;
case bitc::ATTR_KIND_ALIGNMENT:
- *Kind = Attribute::Alignment;
- return false;
+ return Attribute::Alignment;
case bitc::ATTR_KIND_ALWAYS_INLINE:
- *Kind = Attribute::AlwaysInline;
- return false;
+ return Attribute::AlwaysInline;
case bitc::ATTR_KIND_BUILTIN:
- *Kind = Attribute::Builtin;
- return false;
+ return Attribute::Builtin;
case bitc::ATTR_KIND_BY_VAL:
- *Kind = Attribute::ByVal;
- return false;
+ return Attribute::ByVal;
case bitc::ATTR_KIND_COLD:
- *Kind = Attribute::Cold;
- return false;
+ return Attribute::Cold;
case bitc::ATTR_KIND_INLINE_HINT:
- *Kind = Attribute::InlineHint;
- return false;
+ return Attribute::InlineHint;
case bitc::ATTR_KIND_IN_REG:
- *Kind = Attribute::InReg;
- return false;
+ return Attribute::InReg;
case bitc::ATTR_KIND_MIN_SIZE:
- *Kind = Attribute::MinSize;
- return false;
+ return Attribute::MinSize;
case bitc::ATTR_KIND_NAKED:
- *Kind = Attribute::Naked;
- return false;
+ return Attribute::Naked;
case bitc::ATTR_KIND_NEST:
- *Kind = Attribute::Nest;
- return false;
+ return Attribute::Nest;
case bitc::ATTR_KIND_NO_ALIAS:
- *Kind = Attribute::NoAlias;
- return false;
+ return Attribute::NoAlias;
case bitc::ATTR_KIND_NO_BUILTIN:
- *Kind = Attribute::NoBuiltin;
- return false;
+ return Attribute::NoBuiltin;
case bitc::ATTR_KIND_NO_CAPTURE:
- *Kind = Attribute::NoCapture;
- return false;
+ return Attribute::NoCapture;
case bitc::ATTR_KIND_NO_DUPLICATE:
- *Kind = Attribute::NoDuplicate;
- return false;
+ return Attribute::NoDuplicate;
case bitc::ATTR_KIND_NO_IMPLICIT_FLOAT:
- *Kind = Attribute::NoImplicitFloat;
- return false;
+ return Attribute::NoImplicitFloat;
case bitc::ATTR_KIND_NO_INLINE:
- *Kind = Attribute::NoInline;
- return false;
+ return Attribute::NoInline;
case bitc::ATTR_KIND_NON_LAZY_BIND:
- *Kind = Attribute::NonLazyBind;
- return false;
+ return Attribute::NonLazyBind;
case bitc::ATTR_KIND_NO_RED_ZONE:
- *Kind = Attribute::NoRedZone;
- return false;
+ return Attribute::NoRedZone;
case bitc::ATTR_KIND_NO_RETURN:
- *Kind = Attribute::NoReturn;
- return false;
+ return Attribute::NoReturn;
case bitc::ATTR_KIND_NO_UNWIND:
- *Kind = Attribute::NoUnwind;
- return false;
+ return Attribute::NoUnwind;
case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE:
- *Kind = Attribute::OptimizeForSize;
- return false;
+ return Attribute::OptimizeForSize;
+ case bitc::ATTR_KIND_OPTIMIZE_NONE:
+ return Attribute::OptimizeNone;
case bitc::ATTR_KIND_READ_NONE:
- *Kind = Attribute::ReadNone;
- return false;
+ return Attribute::ReadNone;
case bitc::ATTR_KIND_READ_ONLY:
- *Kind = Attribute::ReadOnly;
- return false;
+ return Attribute::ReadOnly;
case bitc::ATTR_KIND_RETURNED:
- *Kind = Attribute::Returned;
- return false;
+ return Attribute::Returned;
case bitc::ATTR_KIND_RETURNS_TWICE:
- *Kind = Attribute::ReturnsTwice;
- return false;
+ return Attribute::ReturnsTwice;
case bitc::ATTR_KIND_S_EXT:
- *Kind = Attribute::SExt;
- return false;
+ return Attribute::SExt;
case bitc::ATTR_KIND_STACK_ALIGNMENT:
- *Kind = Attribute::StackAlignment;
- return false;
+ return Attribute::StackAlignment;
case bitc::ATTR_KIND_STACK_PROTECT:
- *Kind = Attribute::StackProtect;
- return false;
+ return Attribute::StackProtect;
case bitc::ATTR_KIND_STACK_PROTECT_REQ:
- *Kind = Attribute::StackProtectReq;
- return false;
+ return Attribute::StackProtectReq;
case bitc::ATTR_KIND_STACK_PROTECT_STRONG:
- *Kind = Attribute::StackProtectStrong;
- return false;
+ return Attribute::StackProtectStrong;
case bitc::ATTR_KIND_STRUCT_RET:
- *Kind = Attribute::StructRet;
- return false;
+ return Attribute::StructRet;
case bitc::ATTR_KIND_SANITIZE_ADDRESS:
- *Kind = Attribute::SanitizeAddress;
- return false;
+ return Attribute::SanitizeAddress;
case bitc::ATTR_KIND_SANITIZE_THREAD:
- *Kind = Attribute::SanitizeThread;
- return false;
+ return Attribute::SanitizeThread;
case bitc::ATTR_KIND_SANITIZE_MEMORY:
- *Kind = Attribute::SanitizeMemory;
- return false;
+ return Attribute::SanitizeMemory;
case bitc::ATTR_KIND_UW_TABLE:
- *Kind = Attribute::UWTable;
- return false;
+ return Attribute::UWTable;
case bitc::ATTR_KIND_Z_EXT:
- *Kind = Attribute::ZExt;
- return false;
- default:
- std::string Buf;
- raw_string_ostream fmt(Buf);
- fmt << "Unknown attribute kind (" << Code << ")";
- fmt.flush();
- return Error(Buf.c_str());
+ return Attribute::ZExt;
}
}
-bool BitcodeReader::ParseAttributeGroupBlock() {
+error_code BitcodeReader::ParseAttrKind(uint64_t Code,
+ Attribute::AttrKind *Kind) {
+ *Kind = GetAttrFromCode(Code);
+ if (*Kind == Attribute::None)
+ return Error(InvalidValue);
+ return error_code::success();
+}
+
+error_code BitcodeReader::ParseAttributeGroupBlock() {
if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
if (!MAttributeGroups.empty())
- return Error("Multiple PARAMATTR_GROUP blocks found!");
+ return Error(InvalidMultipleBlocks);
SmallVector<uint64_t, 64> Record;
@@ -643,9 +615,9 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("Error at end of PARAMATTR_GROUP block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -658,7 +630,7 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
break;
case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
if (Record.size() < 3)
- return Error("Invalid ENTRY record");
+ return Error(InvalidRecord);
uint64_t GrpID = Record[0];
uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
@@ -667,14 +639,14 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
for (unsigned i = 2, e = Record.size(); i != e; ++i) {
if (Record[i] == 0) { // Enum attribute
Attribute::AttrKind Kind;
- if (ParseAttrKind(Record[++i], &Kind))
- return true;
+ if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+ return EC;
B.addAttribute(Kind);
} else if (Record[i] == 1) { // Align attribute
Attribute::AttrKind Kind;
- if (ParseAttrKind(Record[++i], &Kind))
- return true;
+ if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+ return EC;
if (Kind == Attribute::Alignment)
B.addAlignmentAttr(Record[++i]);
else
@@ -709,16 +681,16 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
}
}
-bool BitcodeReader::ParseTypeTable() {
+error_code BitcodeReader::ParseTypeTable() {
if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
return ParseTypeTableBody();
}
-bool BitcodeReader::ParseTypeTableBody() {
+error_code BitcodeReader::ParseTypeTableBody() {
if (!TypeList.empty())
- return Error("Multiple TYPE_BLOCKs found!");
+ return Error(InvalidMultipleBlocks);
SmallVector<uint64_t, 64> Record;
unsigned NumRecords = 0;
@@ -732,12 +704,11 @@ bool BitcodeReader::ParseTypeTableBody() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- Error("Error in the type table block");
- return true;
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
if (NumRecords != TypeList.size())
- return Error("Invalid type forward reference in TYPE_BLOCK");
- return false;
+ return Error(MalformedBlock);
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -747,12 +718,13 @@ bool BitcodeReader::ParseTypeTableBody() {
Record.clear();
Type *ResultTy = 0;
switch (Stream.readRecord(Entry.ID, Record)) {
- default: return Error("unknown type in type table");
+ default:
+ return Error(InvalidValue);
case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
// TYPE_CODE_NUMENTRY contains a count of the number of types in the
// type list. This allows us to reserve space.
if (Record.size() < 1)
- return Error("Invalid TYPE_CODE_NUMENTRY record");
+ return Error(InvalidRecord);
TypeList.resize(Record[0]);
continue;
case bitc::TYPE_CODE_VOID: // VOID
@@ -787,19 +759,20 @@ bool BitcodeReader::ParseTypeTableBody() {
break;
case bitc::TYPE_CODE_INTEGER: // INTEGER: [width]
if (Record.size() < 1)
- return Error("Invalid Integer type record");
+ return Error(InvalidRecord);
ResultTy = IntegerType::get(Context, Record[0]);
break;
case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
// [pointee type, address space]
if (Record.size() < 1)
- return Error("Invalid POINTER type record");
+ return Error(InvalidRecord);
unsigned AddressSpace = 0;
if (Record.size() == 2)
AddressSpace = Record[1];
ResultTy = getTypeByID(Record[0]);
- if (ResultTy == 0) return Error("invalid element type in pointer type");
+ if (ResultTy == 0)
+ return Error(InvalidType);
ResultTy = PointerType::get(ResultTy, AddressSpace);
break;
}
@@ -807,7 +780,7 @@ bool BitcodeReader::ParseTypeTableBody() {
// FIXME: attrid is dead, remove it in LLVM 4.0
// FUNCTION: [vararg, attrid, retty, paramty x N]
if (Record.size() < 3)
- return Error("Invalid FUNCTION type record");
+ return Error(InvalidRecord);
SmallVector<Type*, 8> ArgTys;
for (unsigned i = 3, e = Record.size(); i != e; ++i) {
if (Type *T = getTypeByID(Record[i]))
@@ -818,7 +791,7 @@ bool BitcodeReader::ParseTypeTableBody() {
ResultTy = getTypeByID(Record[2]);
if (ResultTy == 0 || ArgTys.size() < Record.size()-3)
- return Error("invalid type in function type");
+ return Error(InvalidType);
ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
break;
@@ -826,7 +799,7 @@ bool BitcodeReader::ParseTypeTableBody() {
case bitc::TYPE_CODE_FUNCTION: {
// FUNCTION: [vararg, retty, paramty x N]
if (Record.size() < 2)
- return Error("Invalid FUNCTION type record");
+ return Error(InvalidRecord);
SmallVector<Type*, 8> ArgTys;
for (unsigned i = 2, e = Record.size(); i != e; ++i) {
if (Type *T = getTypeByID(Record[i]))
@@ -837,14 +810,14 @@ bool BitcodeReader::ParseTypeTableBody() {
ResultTy = getTypeByID(Record[1]);
if (ResultTy == 0 || ArgTys.size() < Record.size()-2)
- return Error("invalid type in function type");
+ return Error(InvalidType);
ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
break;
}
case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N]
if (Record.size() < 1)
- return Error("Invalid STRUCT type record");
+ return Error(InvalidRecord);
SmallVector<Type*, 8> EltTys;
for (unsigned i = 1, e = Record.size(); i != e; ++i) {
if (Type *T = getTypeByID(Record[i]))
@@ -853,21 +826,21 @@ bool BitcodeReader::ParseTypeTableBody() {
break;
}
if (EltTys.size() != Record.size()-1)
- return Error("invalid type in struct type");
+ return Error(InvalidType);
ResultTy = StructType::get(Context, EltTys, Record[0]);
break;
}
case bitc::TYPE_CODE_STRUCT_NAME: // STRUCT_NAME: [strchr x N]
if (ConvertToString(Record, 0, TypeName))
- return Error("Invalid STRUCT_NAME record");
+ return Error(InvalidRecord);
continue;
case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N]
if (Record.size() < 1)
- return Error("Invalid STRUCT type record");
+ return Error(InvalidRecord);
if (NumRecords >= TypeList.size())
- return Error("invalid TYPE table");
+ return Error(InvalidTYPETable);
// Check to see if this was forward referenced, if so fill in the temp.
StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -886,17 +859,17 @@ bool BitcodeReader::ParseTypeTableBody() {
break;
}
if (EltTys.size() != Record.size()-1)
- return Error("invalid STRUCT type record");
+ return Error(InvalidRecord);
Res->setBody(EltTys, Record[0]);
ResultTy = Res;
break;
}
case bitc::TYPE_CODE_OPAQUE: { // OPAQUE: []
if (Record.size() != 1)
- return Error("Invalid OPAQUE type record");
+ return Error(InvalidRecord);
if (NumRecords >= TypeList.size())
- return Error("invalid TYPE table");
+ return Error(InvalidTYPETable);
// Check to see if this was forward referenced, if so fill in the temp.
StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -911,33 +884,33 @@ bool BitcodeReader::ParseTypeTableBody() {
}
case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty]
if (Record.size() < 2)
- return Error("Invalid ARRAY type record");
+ return Error(InvalidRecord);
if ((ResultTy = getTypeByID(Record[1])))
ResultTy = ArrayType::get(ResultTy, Record[0]);
else
- return Error("Invalid ARRAY type element");
+ return Error(InvalidType);
break;
case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty]
if (Record.size() < 2)
- return Error("Invalid VECTOR type record");
+ return Error(InvalidRecord);
if ((ResultTy = getTypeByID(Record[1])))
ResultTy = VectorType::get(ResultTy, Record[0]);
else
- return Error("Invalid ARRAY type element");
+ return Error(InvalidType);
break;
}
if (NumRecords >= TypeList.size())
- return Error("invalid TYPE table");
+ return Error(InvalidTYPETable);
assert(ResultTy && "Didn't read a type?");
assert(TypeList[NumRecords] == 0 && "Already read type?");
TypeList[NumRecords++] = ResultTy;
}
}
-bool BitcodeReader::ParseValueSymbolTable() {
+error_code BitcodeReader::ParseValueSymbolTable() {
if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
@@ -949,9 +922,9 @@ bool BitcodeReader::ParseValueSymbolTable() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("malformed value symbol table block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -964,10 +937,10 @@ bool BitcodeReader::ParseValueSymbolTable() {
break;
case bitc::VST_CODE_ENTRY: { // VST_ENTRY: [valueid, namechar x N]
if (ConvertToString(Record, 1, ValueName))
- return Error("Invalid VST_ENTRY record");
+ return Error(InvalidRecord);
unsigned ValueID = Record[0];
if (ValueID >= ValueList.size())
- return Error("Invalid Value ID in VST_ENTRY record");
+ return Error(InvalidRecord);
Value *V = ValueList[ValueID];
V->setName(StringRef(ValueName.data(), ValueName.size()));
@@ -976,10 +949,10 @@ bool BitcodeReader::ParseValueSymbolTable() {
}
case bitc::VST_CODE_BBENTRY: {
if (ConvertToString(Record, 1, ValueName))
- return Error("Invalid VST_BBENTRY record");
+ return Error(InvalidRecord);
BasicBlock *BB = getBasicBlock(Record[0]);
if (BB == 0)
- return Error("Invalid BB ID in VST_BBENTRY record");
+ return Error(InvalidRecord);
BB->setName(StringRef(ValueName.data(), ValueName.size()));
ValueName.clear();
@@ -989,11 +962,11 @@ bool BitcodeReader::ParseValueSymbolTable() {
}
}
-bool BitcodeReader::ParseMetadata() {
+error_code BitcodeReader::ParseMetadata() {
unsigned NextMDValueNo = MDValueList.size();
if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
@@ -1004,10 +977,9 @@ bool BitcodeReader::ParseMetadata() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- Error("malformed metadata block");
- return true;
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -1036,7 +1008,7 @@ bool BitcodeReader::ParseMetadata() {
for (unsigned i = 0; i != Size; ++i) {
MDNode *MD = dyn_cast<MDNode>(MDValueList.getValueFwdRef(Record[i]));
if (MD == 0)
- return Error("Malformed metadata record");
+ return Error(InvalidRecord);
NMD->addOperand(MD);
}
break;
@@ -1046,13 +1018,14 @@ bool BitcodeReader::ParseMetadata() {
// fall-through
case bitc::METADATA_NODE: {
if (Record.size() % 2 == 1)
- return Error("Invalid METADATA_NODE record");
+ return Error(InvalidRecord);
unsigned Size = Record.size();
SmallVector<Value*, 8> Elts;
for (unsigned i = 0; i != Size; i += 2) {
Type *Ty = getTypeByID(Record[i]);
- if (!Ty) return Error("Invalid METADATA_NODE record");
+ if (!Ty)
+ return Error(InvalidRecord);
if (Ty->isMetadataTy())
Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
else if (!Ty->isVoidTy())
@@ -1073,14 +1046,14 @@ bool BitcodeReader::ParseMetadata() {
}
case bitc::METADATA_KIND: {
if (Record.size() < 2)
- return Error("Invalid METADATA_KIND record");
+ return Error(InvalidRecord);
unsigned Kind = Record[0];
SmallString<8> Name(Record.begin()+1, Record.end());
unsigned NewKind = TheModule->getMDKindID(Name.str());
if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
- return Error("Conflicting METADATA_KIND records");
+ return Error(ConflictingMETADATA_KINDRecords);
break;
}
}
@@ -1100,12 +1073,14 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
/// ResolveGlobalAndAliasInits - Resolve all of the initializers for global
/// values and aliases that we can.
-bool BitcodeReader::ResolveGlobalAndAliasInits() {
+error_code BitcodeReader::ResolveGlobalAndAliasInits() {
std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
+ std::vector<std::pair<Function*, unsigned> > FunctionPrefixWorklist;
GlobalInitWorklist.swap(GlobalInits);
AliasInitWorklist.swap(AliasInits);
+ FunctionPrefixWorklist.swap(FunctionPrefixes);
while (!GlobalInitWorklist.empty()) {
unsigned ValID = GlobalInitWorklist.back().second;
@@ -1116,7 +1091,7 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
GlobalInitWorklist.back().first->setInitializer(C);
else
- return Error("Global variable initializer is not a constant!");
+ return Error(ExpectedConstant);
}
GlobalInitWorklist.pop_back();
}
@@ -1129,11 +1104,25 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
AliasInitWorklist.back().first->setAliasee(C);
else
- return Error("Alias initializer is not a constant!");
+ return Error(ExpectedConstant);
}
AliasInitWorklist.pop_back();
}
- return false;
+
+ while (!FunctionPrefixWorklist.empty()) {
+ unsigned ValID = FunctionPrefixWorklist.back().second;
+ if (ValID >= ValueList.size()) {
+ FunctionPrefixes.push_back(FunctionPrefixWorklist.back());
+ } else {
+ if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+ FunctionPrefixWorklist.back().first->setPrefixData(C);
+ else
+ return Error(ExpectedConstant);
+ }
+ FunctionPrefixWorklist.pop_back();
+ }
+
+ return error_code::success();
}
static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
@@ -1144,9 +1133,9 @@ static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
return APInt(TypeBits, Words);
}
-bool BitcodeReader::ParseConstants() {
+error_code BitcodeReader::ParseConstants() {
if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
@@ -1159,15 +1148,15 @@ bool BitcodeReader::ParseConstants() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("malformed block record in AST file");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
if (NextCstNo != ValueList.size())
- return Error("Invalid constant reference!");
+ return Error(InvalidConstantReference);
// Once all the constants have been read, go through and resolve forward
// references.
ValueList.ResolveConstantForwardRefs();
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -1184,9 +1173,9 @@ bool BitcodeReader::ParseConstants() {
break;
case bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid]
if (Record.empty())
- return Error("Malformed CST_SETTYPE record");
+ return Error(InvalidRecord);
if (Record[0] >= TypeList.size())
- return Error("Invalid Type ID in CST_SETTYPE record");
+ return Error(InvalidRecord);
CurTy = TypeList[Record[0]];
continue; // Skip the ValueList manipulation.
case bitc::CST_CODE_NULL: // NULL
@@ -1194,12 +1183,12 @@ bool BitcodeReader::ParseConstants() {
break;
case bitc::CST_CODE_INTEGER: // INTEGER: [intval]
if (!CurTy->isIntegerTy() || Record.empty())
- return Error("Invalid CST_INTEGER record");
+ return Error(InvalidRecord);
V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
break;
case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
if (!CurTy->isIntegerTy() || Record.empty())
- return Error("Invalid WIDE_INTEGER record");
+ return Error(InvalidRecord);
APInt VInt = ReadWideAPInt(Record,
cast<IntegerType>(CurTy)->getBitWidth());
@@ -1209,7 +1198,7 @@ bool BitcodeReader::ParseConstants() {
}
case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
if (Record.empty())
- return Error("Invalid FLOAT record");
+ return Error(InvalidRecord);
if (CurTy->isHalfTy())
V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf,
APInt(16, (uint16_t)Record[0])));
@@ -1239,7 +1228,7 @@ bool BitcodeReader::ParseConstants() {
case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
if (Record.empty())
- return Error("Invalid CST_AGGREGATE record");
+ return Error(InvalidRecord);
unsigned Size = Record.size();
SmallVector<Constant*, 16> Elts;
@@ -1267,7 +1256,7 @@ bool BitcodeReader::ParseConstants() {
case bitc::CST_CODE_STRING: // STRING: [values]
case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
if (Record.empty())
- return Error("Invalid CST_STRING record");
+ return Error(InvalidRecord);
SmallString<16> Elts(Record.begin(), Record.end());
V = ConstantDataArray::getString(Context, Elts,
@@ -1276,7 +1265,7 @@ bool BitcodeReader::ParseConstants() {
}
case bitc::CST_CODE_DATA: {// DATA: [n x value]
if (Record.empty())
- return Error("Invalid CST_DATA record");
+ return Error(InvalidRecord);
Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
unsigned Size = Record.size();
@@ -1321,13 +1310,14 @@ bool BitcodeReader::ParseConstants() {
else
V = ConstantDataArray::get(Context, Elts);
} else {
- return Error("Unknown element type in CE_DATA");
+ return Error(InvalidTypeForValue);
}
break;
}
case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval]
- if (Record.size() < 3) return Error("Invalid CE_BINOP record");
+ if (Record.size() < 3)
+ return Error(InvalidRecord);
int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
if (Opc < 0) {
V = UndefValue::get(CurTy); // Unknown binop.
@@ -1357,25 +1347,30 @@ bool BitcodeReader::ParseConstants() {
break;
}
case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval]
- if (Record.size() < 3) return Error("Invalid CE_CAST record");
+ if (Record.size() < 3)
+ return Error(InvalidRecord);
int Opc = GetDecodedCastOpcode(Record[0]);
if (Opc < 0) {
V = UndefValue::get(CurTy); // Unknown cast.
} else {
Type *OpTy = getTypeByID(Record[1]);
- if (!OpTy) return Error("Invalid CE_CAST record");
+ if (!OpTy)
+ return Error(InvalidRecord);
Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
- V = ConstantExpr::getCast(Opc, Op, CurTy);
+ V = UpgradeBitCastExpr(Opc, Op, CurTy);
+ if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy);
}
break;
}
case bitc::CST_CODE_CE_INBOUNDS_GEP:
case bitc::CST_CODE_CE_GEP: { // CE_GEP: [n x operands]
- if (Record.size() & 1) return Error("Invalid CE_GEP record");
+ if (Record.size() & 1)
+ return Error(InvalidRecord);
SmallVector<Constant*, 16> Elts;
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
Type *ElTy = getTypeByID(Record[i]);
- if (!ElTy) return Error("Invalid CE_GEP record");
+ if (!ElTy)
+ return Error(InvalidRecord);
Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
}
ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
@@ -1384,19 +1379,31 @@ bool BitcodeReader::ParseConstants() {
bitc::CST_CODE_CE_INBOUNDS_GEP);
break;
}
- case bitc::CST_CODE_CE_SELECT: // CE_SELECT: [opval#, opval#, opval#]
- if (Record.size() < 3) return Error("Invalid CE_SELECT record");
- V = ConstantExpr::getSelect(
- ValueList.getConstantFwdRef(Record[0],
- Type::getInt1Ty(Context)),
- ValueList.getConstantFwdRef(Record[1],CurTy),
- ValueList.getConstantFwdRef(Record[2],CurTy));
+ case bitc::CST_CODE_CE_SELECT: { // CE_SELECT: [opval#, opval#, opval#]
+ if (Record.size() < 3)
+ return Error(InvalidRecord);
+
+ Type *SelectorTy = Type::getInt1Ty(Context);
+
+ // If CurTy is a vector of length n, then Record[0] must be a <n x i1>
+ // vector. Otherwise, it must be a single bit.
+ if (VectorType *VTy = dyn_cast<VectorType>(CurTy))
+ SelectorTy = VectorType::get(Type::getInt1Ty(Context),
+ VTy->getNumElements());
+
+ V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
+ SelectorTy),
+ ValueList.getConstantFwdRef(Record[1],CurTy),
+ ValueList.getConstantFwdRef(Record[2],CurTy));
break;
+ }
case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval]
- if (Record.size() < 3) return Error("Invalid CE_EXTRACTELT record");
+ if (Record.size() < 3)
+ return Error(InvalidRecord);
VectorType *OpTy =
dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
- if (OpTy == 0) return Error("Invalid CE_EXTRACTELT record");
+ if (OpTy == 0)
+ return Error(InvalidRecord);
Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[2],
Type::getInt32Ty(Context));
@@ -1406,7 +1413,7 @@ bool BitcodeReader::ParseConstants() {
case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval]
VectorType *OpTy = dyn_cast<VectorType>(CurTy);
if (Record.size() < 3 || OpTy == 0)
- return Error("Invalid CE_INSERTELT record");
+ return Error(InvalidRecord);
Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
OpTy->getElementType());
@@ -1418,7 +1425,7 @@ bool BitcodeReader::ParseConstants() {
case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
VectorType *OpTy = dyn_cast<VectorType>(CurTy);
if (Record.size() < 3 || OpTy == 0)
- return Error("Invalid CE_SHUFFLEVEC record");
+ return Error(InvalidRecord);
Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1432,7 +1439,7 @@ bool BitcodeReader::ParseConstants() {
VectorType *OpTy =
dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
if (Record.size() < 4 || RTy == 0 || OpTy == 0)
- return Error("Invalid CE_SHUFVEC_EX record");
+ return Error(InvalidRecord);
Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1442,9 +1449,11 @@ bool BitcodeReader::ParseConstants() {
break;
}
case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred]
- if (Record.size() < 4) return Error("Invalid CE_CMP record");
+ if (Record.size() < 4)
+ return Error(InvalidRecord);
Type *OpTy = getTypeByID(Record[0]);
- if (OpTy == 0) return Error("Invalid CE_CMP record");
+ if (OpTy == 0)
+ return Error(InvalidRecord);
Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
@@ -1457,16 +1466,17 @@ bool BitcodeReader::ParseConstants() {
// This maintains backward compatibility, pre-asm dialect keywords.
// FIXME: Remove with the 4.0 release.
case bitc::CST_CODE_INLINEASM_OLD: {
- if (Record.size() < 2) return Error("Invalid INLINEASM record");
+ if (Record.size() < 2)
+ return Error(InvalidRecord);
std::string AsmStr, ConstrStr;
bool HasSideEffects = Record[0] & 1;
bool IsAlignStack = Record[0] >> 1;
unsigned AsmStrSize = Record[1];
if (2+AsmStrSize >= Record.size())
- return Error("Invalid INLINEASM record");
+ return Error(InvalidRecord);
unsigned ConstStrSize = Record[2+AsmStrSize];
if (3+AsmStrSize+ConstStrSize > Record.size())
- return Error("Invalid INLINEASM record");
+ return Error(InvalidRecord);
for (unsigned i = 0; i != AsmStrSize; ++i)
AsmStr += (char)Record[2+i];
@@ -1480,17 +1490,18 @@ bool BitcodeReader::ParseConstants() {
// This version adds support for the asm dialect keywords (e.g.,
// inteldialect).
case bitc::CST_CODE_INLINEASM: {
- if (Record.size() < 2) return Error("Invalid INLINEASM record");
+ if (Record.size() < 2)
+ return Error(InvalidRecord);
std::string AsmStr, ConstrStr;
bool HasSideEffects = Record[0] & 1;
bool IsAlignStack = (Record[0] >> 1) & 1;
unsigned AsmDialect = Record[0] >> 2;
unsigned AsmStrSize = Record[1];
if (2+AsmStrSize >= Record.size())
- return Error("Invalid INLINEASM record");
+ return Error(InvalidRecord);
unsigned ConstStrSize = Record[2+AsmStrSize];
if (3+AsmStrSize+ConstStrSize > Record.size())
- return Error("Invalid INLINEASM record");
+ return Error(InvalidRecord);
for (unsigned i = 0; i != AsmStrSize; ++i)
AsmStr += (char)Record[2+i];
@@ -1503,12 +1514,15 @@ bool BitcodeReader::ParseConstants() {
break;
}
case bitc::CST_CODE_BLOCKADDRESS:{
- if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
+ if (Record.size() < 3)
+ return Error(InvalidRecord);
Type *FnTy = getTypeByID(Record[0]);
- if (FnTy == 0) return Error("Invalid CE_BLOCKADDRESS record");
+ if (FnTy == 0)
+ return Error(InvalidRecord);
Function *Fn =
dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
- if (Fn == 0) return Error("Invalid CE_BLOCKADDRESS record");
+ if (Fn == 0)
+ return Error(InvalidRecord);
// If the function is already parsed we can insert the block address right
// away.
@@ -1516,7 +1530,7 @@ bool BitcodeReader::ParseConstants() {
Function::iterator BBI = Fn->begin(), BBE = Fn->end();
for (size_t I = 0, E = Record[2]; I != E; ++I) {
if (BBI == BBE)
- return Error("Invalid blockaddress block #");
+ return Error(InvalidID);
++BBI;
}
V = BlockAddress::get(Fn, BBI);
@@ -1539,9 +1553,9 @@ bool BitcodeReader::ParseConstants() {
}
}
-bool BitcodeReader::ParseUseLists() {
+error_code BitcodeReader::ParseUseLists() {
if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
@@ -1552,9 +1566,9 @@ bool BitcodeReader::ParseUseLists() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("malformed use list block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -1568,7 +1582,7 @@ bool BitcodeReader::ParseUseLists() {
case bitc::USELIST_CODE_ENTRY: { // USELIST_CODE_ENTRY: TBD.
unsigned RecordLength = Record.size();
if (RecordLength < 1)
- return Error ("Invalid UseList reader!");
+ return Error(InvalidRecord);
UseListRecords.push_back(Record);
break;
}
@@ -1579,10 +1593,10 @@ bool BitcodeReader::ParseUseLists() {
/// RememberAndSkipFunctionBody - When we see the block for a function body,
/// remember where it is and then skip it. This lets us lazily deserialize the
/// functions.
-bool BitcodeReader::RememberAndSkipFunctionBody() {
+error_code BitcodeReader::RememberAndSkipFunctionBody() {
// Get the function we are talking about.
if (FunctionsWithBodies.empty())
- return Error("Insufficient function protos");
+ return Error(InsufficientFunctionProtos);
Function *Fn = FunctionsWithBodies.back();
FunctionsWithBodies.pop_back();
@@ -1593,15 +1607,15 @@ bool BitcodeReader::RememberAndSkipFunctionBody() {
// Skip over the function block for now.
if (Stream.SkipBlock())
- return Error("Malformed block record");
- return false;
+ return Error(InvalidRecord);
+ return error_code::success();
}
-bool BitcodeReader::GlobalCleanup() {
+error_code BitcodeReader::GlobalCleanup() {
// Patch the initializers for globals and aliases up.
ResolveGlobalAndAliasInits();
if (!GlobalInits.empty() || !AliasInits.empty())
- return Error("Malformed global initializer set");
+ return Error(MalformedGlobalInitializerSet);
// Look for intrinsic functions which need to be upgraded at some point
for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
@@ -1620,14 +1634,14 @@ bool BitcodeReader::GlobalCleanup() {
// want lazy deserialization.
std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
- return false;
+ return error_code::success();
}
-bool BitcodeReader::ParseModule(bool Resume) {
+error_code BitcodeReader::ParseModule(bool Resume) {
if (Resume)
Stream.JumpToBit(NextUnreadBit);
else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
std::vector<std::string> SectionTable;
@@ -1639,8 +1653,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
switch (Entry.Kind) {
case BitstreamEntry::Error:
- Error("malformed module block");
- return true;
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
return GlobalCleanup();
@@ -1648,49 +1661,51 @@ bool BitcodeReader::ParseModule(bool Resume) {
switch (Entry.ID) {
default: // Skip unknown content.
if (Stream.SkipBlock())
- return Error("Malformed block record");
+ return Error(InvalidRecord);
break;
case bitc::BLOCKINFO_BLOCK_ID:
if (Stream.ReadBlockInfoBlock())
- return Error("Malformed BlockInfoBlock");
+ return Error(MalformedBlock);
break;
case bitc::PARAMATTR_BLOCK_ID:
- if (ParseAttributeBlock())
- return true;
+ if (error_code EC = ParseAttributeBlock())
+ return EC;
break;
case bitc::PARAMATTR_GROUP_BLOCK_ID:
- if (ParseAttributeGroupBlock())
- return true;
+ if (error_code EC = ParseAttributeGroupBlock())
+ return EC;
break;
case bitc::TYPE_BLOCK_ID_NEW:
- if (ParseTypeTable())
- return true;
+ if (error_code EC = ParseTypeTable())
+ return EC;
break;
case bitc::VALUE_SYMTAB_BLOCK_ID:
- if (ParseValueSymbolTable())
- return true;
+ if (error_code EC = ParseValueSymbolTable())
+ return EC;
SeenValueSymbolTable = true;
break;
case bitc::CONSTANTS_BLOCK_ID:
- if (ParseConstants() || ResolveGlobalAndAliasInits())
- return true;
+ if (error_code EC = ParseConstants())
+ return EC;
+ if (error_code EC = ResolveGlobalAndAliasInits())
+ return EC;
break;
case bitc::METADATA_BLOCK_ID:
- if (ParseMetadata())
- return true;
+ if (error_code EC = ParseMetadata())
+ return EC;
break;
case bitc::FUNCTION_BLOCK_ID:
// If this is the first function body we've seen, reverse the
// FunctionsWithBodies list.
if (!SeenFirstFunctionBody) {
std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end());
- if (GlobalCleanup())
- return true;
+ if (error_code EC = GlobalCleanup())
+ return EC;
SeenFirstFunctionBody = true;
}
- if (RememberAndSkipFunctionBody())
- return true;
+ if (error_code EC = RememberAndSkipFunctionBody())
+ return EC;
// For streaming bitcode, suspend parsing when we reach the function
// bodies. Subsequent materialization calls will resume it when
// necessary. For streaming, the function bodies must be at the end of
@@ -1699,12 +1714,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
// just finish the parse now.
if (LazyStreamer && SeenValueSymbolTable) {
NextUnreadBit = Stream.GetCurrentBitNo();
- return false;
+ return error_code::success();
}
break;
case bitc::USELIST_BLOCK_ID:
- if (ParseUseLists())
- return true;
+ if (error_code EC = ParseUseLists())
+ return EC;
break;
}
continue;
@@ -1720,11 +1735,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
default: break; // Default behavior, ignore unknown content.
case bitc::MODULE_CODE_VERSION: { // VERSION: [version#]
if (Record.size() < 1)
- return Error("Malformed MODULE_CODE_VERSION");
+ return Error(InvalidRecord);
// Only version #0 and #1 are supported so far.
unsigned module_version = Record[0];
switch (module_version) {
- default: return Error("Unknown bitstream version!");
+ default:
+ return Error(InvalidValue);
case 0:
UseRelativeIDs = false;
break;
@@ -1737,21 +1753,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_TRIPLE record");
+ return Error(InvalidRecord);
TheModule->setTargetTriple(S);
break;
}
case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_DATALAYOUT record");
+ return Error(InvalidRecord);
TheModule->setDataLayout(S);
break;
}
case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_ASM record");
+ return Error(InvalidRecord);
TheModule->setModuleInlineAsm(S);
break;
}
@@ -1759,21 +1775,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
// FIXME: Remove in 4.0.
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_DEPLIB record");
+ return Error(InvalidRecord);
// Ignore value.
break;
}
case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_SECTIONNAME record");
+ return Error(InvalidRecord);
SectionTable.push_back(S);
break;
}
case bitc::MODULE_CODE_GCNAME: { // SECTIONNAME: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_GCNAME record");
+ return Error(InvalidRecord);
GCTable.push_back(S);
break;
}
@@ -1782,11 +1798,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
// unnamed_addr]
case bitc::MODULE_CODE_GLOBALVAR: {
if (Record.size() < 6)
- return Error("Invalid MODULE_CODE_GLOBALVAR record");
+ return Error(InvalidRecord);
Type *Ty = getTypeByID(Record[0]);
- if (!Ty) return Error("Invalid MODULE_CODE_GLOBALVAR record");
+ if (!Ty)
+ return Error(InvalidRecord);
if (!Ty->isPointerTy())
- return Error("Global not a pointer type!");
+ return Error(InvalidTypeForValue);
unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
Ty = cast<PointerType>(Ty)->getElementType();
@@ -1796,7 +1813,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
std::string Section;
if (Record[5]) {
if (Record[5]-1 >= SectionTable.size())
- return Error("Invalid section ID");
+ return Error(InvalidID);
Section = SectionTable[Record[5]-1];
}
GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
@@ -1835,15 +1852,16 @@ bool BitcodeReader::ParseModule(bool Resume) {
// alignment, section, visibility, gc, unnamed_addr]
case bitc::MODULE_CODE_FUNCTION: {
if (Record.size() < 8)
- return Error("Invalid MODULE_CODE_FUNCTION record");
+ return Error(InvalidRecord);
Type *Ty = getTypeByID(Record[0]);
- if (!Ty) return Error("Invalid MODULE_CODE_FUNCTION record");
+ if (!Ty)
+ return Error(InvalidRecord);
if (!Ty->isPointerTy())
- return Error("Function not a pointer type!");
+ return Error(InvalidTypeForValue);
FunctionType *FTy =
dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
if (!FTy)
- return Error("Function not a pointer to function type!");
+ return Error(InvalidTypeForValue);
Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
"", TheModule);
@@ -1856,19 +1874,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
Func->setAlignment((1 << Record[5]) >> 1);
if (Record[6]) {
if (Record[6]-1 >= SectionTable.size())
- return Error("Invalid section ID");
+ return Error(InvalidID);
Func->setSection(SectionTable[Record[6]-1]);
}
Func->setVisibility(GetDecodedVisibility(Record[7]));
if (Record.size() > 8 && Record[8]) {
      if (Record[8]-1 >= GCTable.size())
- return Error("Invalid GC ID");
+ return Error(InvalidID);
Func->setGC(GCTable[Record[8]-1].c_str());
}
bool UnnamedAddr = false;
if (Record.size() > 9)
UnnamedAddr = Record[9];
Func->setUnnamedAddr(UnnamedAddr);
+ if (Record.size() > 10 && Record[10] != 0)
+ FunctionPrefixes.push_back(std::make_pair(Func, Record[10]-1));
ValueList.push_back(Func);
// If this is a function with a body, remember the prototype we are
@@ -1883,11 +1903,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
// ALIAS: [alias type, aliasee val#, linkage, visibility]
case bitc::MODULE_CODE_ALIAS: {
if (Record.size() < 3)
- return Error("Invalid MODULE_ALIAS record");
+ return Error(InvalidRecord);
Type *Ty = getTypeByID(Record[0]);
- if (!Ty) return Error("Invalid MODULE_ALIAS record");
+ if (!Ty)
+ return Error(InvalidRecord);
if (!Ty->isPointerTy())
- return Error("Function not a pointer type!");
+ return Error(InvalidTypeForValue);
GlobalAlias *NewGA = new GlobalAlias(Ty, GetDecodedLinkage(Record[2]),
"", 0, TheModule);
@@ -1902,7 +1923,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
case bitc::MODULE_CODE_PURGEVALS:
// Trim down the value list to the specified size.
if (Record.size() < 1 || Record[0] > ValueList.size())
- return Error("Invalid MODULE_PURGEVALS record");
+ return Error(InvalidRecord);
ValueList.shrinkTo(Record[0]);
break;
}
@@ -1910,10 +1931,11 @@ bool BitcodeReader::ParseModule(bool Resume) {
}
}
-bool BitcodeReader::ParseBitcodeInto(Module *M) {
+error_code BitcodeReader::ParseBitcodeInto(Module *M) {
TheModule = 0;
- if (InitStream()) return true;
+ if (error_code EC = InitStream())
+ return EC;
// Sniff for the signature.
if (Stream.Read(8) != 'B' ||
@@ -1922,42 +1944,42 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
Stream.Read(4) != 0xC ||
Stream.Read(4) != 0xE ||
Stream.Read(4) != 0xD)
- return Error("Invalid bitcode signature");
+ return Error(InvalidBitcodeSignature);
// We expect a number of well-defined blocks, though we don't necessarily
// need to understand them all.
while (1) {
if (Stream.AtEndOfStream())
- return false;
+ return error_code::success();
BitstreamEntry Entry =
Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
switch (Entry.Kind) {
case BitstreamEntry::Error:
- Error("malformed module file");
- return true;
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::SubBlock:
switch (Entry.ID) {
case bitc::BLOCKINFO_BLOCK_ID:
if (Stream.ReadBlockInfoBlock())
- return Error("Malformed BlockInfoBlock");
+ return Error(MalformedBlock);
break;
case bitc::MODULE_BLOCK_ID:
// Reject multiple MODULE_BLOCK's in a single bitstream.
if (TheModule)
- return Error("Multiple MODULE_BLOCKs in same stream");
+ return Error(InvalidMultipleBlocks);
TheModule = M;
- if (ParseModule(false))
- return true;
- if (LazyStreamer) return false;
+ if (error_code EC = ParseModule(false))
+ return EC;
+ if (LazyStreamer)
+ return error_code::success();
break;
default:
if (Stream.SkipBlock())
- return Error("Malformed block record");
+ return Error(InvalidRecord);
break;
}
continue;
@@ -1970,16 +1992,16 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 &&
Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
Stream.AtEndOfStream())
- return false;
+ return error_code::success();
- return Error("Invalid record at top-level");
+ return Error(InvalidRecord);
}
}
}
-bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
+error_code BitcodeReader::ParseModuleTriple(std::string &Triple) {
if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
@@ -1990,9 +2012,9 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("malformed module block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -2004,7 +2026,7 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
- return Error("Invalid MODULE_CODE_TRIPLE record");
+ return Error(InvalidRecord);
Triple = S;
break;
}
@@ -2013,8 +2035,9 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
}
}
-bool BitcodeReader::ParseTriple(std::string &Triple) {
- if (InitStream()) return true;
+error_code BitcodeReader::ParseTriple(std::string &Triple) {
+ if (error_code EC = InitStream())
+ return EC;
// Sniff for the signature.
if (Stream.Read(8) != 'B' ||
@@ -2023,7 +2046,7 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
Stream.Read(4) != 0xC ||
Stream.Read(4) != 0xE ||
Stream.Read(4) != 0xD)
- return Error("Invalid bitcode signature");
+ return Error(InvalidBitcodeSignature);
// We expect a number of well-defined blocks, though we don't necessarily
// need to understand them all.
@@ -2032,20 +2055,17 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
switch (Entry.Kind) {
case BitstreamEntry::Error:
- Error("malformed module file");
- return true;
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::SubBlock:
if (Entry.ID == bitc::MODULE_BLOCK_ID)
return ParseModuleTriple(Triple);
// Ignore other sub-blocks.
- if (Stream.SkipBlock()) {
- Error("malformed block record in AST file");
- return true;
- }
+ if (Stream.SkipBlock())
+ return Error(MalformedBlock);
continue;
case BitstreamEntry::Record:
@@ -2056,9 +2076,9 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
}
/// ParseMetadataAttachment - Parse metadata attachments.
-bool BitcodeReader::ParseMetadataAttachment() {
+error_code BitcodeReader::ParseMetadataAttachment() {
if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
SmallVector<uint64_t, 64> Record;
while (1) {
@@ -2067,9 +2087,9 @@ bool BitcodeReader::ParseMetadataAttachment() {
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
- return Error("malformed metadata block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
- return false;
+ return error_code::success();
case BitstreamEntry::Record:
// The interesting case.
break;
@@ -2083,16 +2103,18 @@ bool BitcodeReader::ParseMetadataAttachment() {
case bitc::METADATA_ATTACHMENT: {
unsigned RecordLength = Record.size();
if (Record.empty() || (RecordLength - 1) % 2 == 1)
- return Error ("Invalid METADATA_ATTACHMENT reader!");
+ return Error(InvalidRecord);
Instruction *Inst = InstructionList[Record[0]];
for (unsigned i = 1; i != RecordLength; i = i+2) {
unsigned Kind = Record[i];
DenseMap<unsigned, unsigned>::iterator I =
MDKindMap.find(Kind);
if (I == MDKindMap.end())
- return Error("Invalid metadata kind ID");
+ return Error(InvalidID);
Value *Node = MDValueList.getValueFwdRef(Record[i+1]);
Inst->setMetadata(I->second, cast<MDNode>(Node));
+ if (I->second == LLVMContext::MD_tbaa)
+ InstsWithTBAATag.push_back(Inst);
}
break;
}
@@ -2101,9 +2123,9 @@ bool BitcodeReader::ParseMetadataAttachment() {
}
/// ParseFunctionBody - Lazily parse the specified function body block.
-bool BitcodeReader::ParseFunctionBody(Function *F) {
+error_code BitcodeReader::ParseFunctionBody(Function *F) {
if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
- return Error("Malformed block record");
+ return Error(InvalidRecord);
InstructionList.clear();
unsigned ModuleValueListSize = ValueList.size();
@@ -2126,7 +2148,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
switch (Entry.Kind) {
case BitstreamEntry::Error:
- return Error("Bitcode error in function block");
+ return Error(MalformedBlock);
case BitstreamEntry::EndBlock:
goto OutOfRecordLoop;
@@ -2134,20 +2156,24 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
switch (Entry.ID) {
default: // Skip unknown content.
if (Stream.SkipBlock())
- return Error("Malformed block record");
+ return Error(InvalidRecord);
break;
case bitc::CONSTANTS_BLOCK_ID:
- if (ParseConstants()) return true;
+ if (error_code EC = ParseConstants())
+ return EC;
NextValueNo = ValueList.size();
break;
case bitc::VALUE_SYMTAB_BLOCK_ID:
- if (ParseValueSymbolTable()) return true;
+ if (error_code EC = ParseValueSymbolTable())
+ return EC;
break;
case bitc::METADATA_ATTACHMENT_ID:
- if (ParseMetadataAttachment()) return true;
+ if (error_code EC = ParseMetadataAttachment())
+ return EC;
break;
case bitc::METADATA_BLOCK_ID:
- if (ParseMetadata()) return true;
+ if (error_code EC = ParseMetadata())
+ return EC;
break;
}
continue;
@@ -2163,10 +2189,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned BitCode = Stream.readRecord(Entry.ID, Record);
switch (BitCode) {
default: // Default behavior: reject
- return Error("Unknown instruction");
+ return Error(InvalidValue);
case bitc::FUNC_CODE_DECLAREBLOCKS: // DECLAREBLOCKS: [nblocks]
if (Record.size() < 1 || Record[0] == 0)
- return Error("Invalid DECLAREBLOCKS record");
+ return Error(InvalidRecord);
// Create all the basic blocks for the function.
FunctionBBs.resize(Record[0]);
for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
@@ -2186,7 +2212,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
!FunctionBBs[CurBBNo-1]->empty())
I = &FunctionBBs[CurBBNo-1]->back();
- if (I == 0) return Error("Invalid DEBUG_LOC_AGAIN record");
+ if (I == 0)
+ return Error(InvalidRecord);
I->setDebugLoc(LastLoc);
I = 0;
continue;
@@ -2199,7 +2226,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
!FunctionBBs[CurBBNo-1]->empty())
I = &FunctionBBs[CurBBNo-1]->back();
if (I == 0 || Record.size() < 4)
- return Error("Invalid FUNC_CODE_DEBUG_LOC record");
+ return Error(InvalidRecord);
unsigned Line = Record[0], Col = Record[1];
unsigned ScopeID = Record[2], IAID = Record[3];
@@ -2219,10 +2246,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
OpNum+1 > Record.size())
- return Error("Invalid BINOP record");
+ return Error(InvalidRecord);
int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
- if (Opc == -1) return Error("Invalid BINOP record");
+ if (Opc == -1)
+ return Error(InvalidRecord);
I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
InstructionList.push_back(I);
if (OpNum < Record.size()) {
@@ -2264,13 +2292,21 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
OpNum+2 != Record.size())
- return Error("Invalid CAST record");
+ return Error(InvalidRecord);
Type *ResTy = getTypeByID(Record[OpNum]);
int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
if (Opc == -1 || ResTy == 0)
- return Error("Invalid CAST record");
- I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+ return Error(InvalidRecord);
+ Instruction *Temp = 0;
+ if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) {
+ if (Temp) {
+ InstructionList.push_back(Temp);
+ CurBB->getInstList().push_back(Temp);
+ }
+ } else {
+ I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+ }
InstructionList.push_back(I);
break;
}
@@ -2279,13 +2315,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *BasePtr;
if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
- return Error("Invalid GEP record");
+ return Error(InvalidRecord);
SmallVector<Value*, 16> GEPIdx;
while (OpNum != Record.size()) {
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op))
- return Error("Invalid GEP record");
+ return Error(InvalidRecord);
GEPIdx.push_back(Op);
}
@@ -2301,14 +2337,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Agg;
if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
- return Error("Invalid EXTRACTVAL record");
+ return Error(InvalidRecord);
SmallVector<unsigned, 4> EXTRACTVALIdx;
for (unsigned RecSize = Record.size();
OpNum != RecSize; ++OpNum) {
uint64_t Index = Record[OpNum];
if ((unsigned)Index != Index)
- return Error("Invalid EXTRACTVAL index");
+ return Error(InvalidValue);
EXTRACTVALIdx.push_back((unsigned)Index);
}
@@ -2322,17 +2358,17 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Agg;
if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
- return Error("Invalid INSERTVAL record");
+ return Error(InvalidRecord);
Value *Val;
if (getValueTypePair(Record, OpNum, NextValueNo, Val))
- return Error("Invalid INSERTVAL record");
+ return Error(InvalidRecord);
SmallVector<unsigned, 4> INSERTVALIdx;
for (unsigned RecSize = Record.size();
OpNum != RecSize; ++OpNum) {
uint64_t Index = Record[OpNum];
if ((unsigned)Index != Index)
- return Error("Invalid INSERTVAL index");
+ return Error(InvalidValue);
INSERTVALIdx.push_back((unsigned)Index);
}
@@ -2349,7 +2385,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
- return Error("Invalid SELECT record");
+ return Error(InvalidRecord);
I = SelectInst::Create(Cond, TrueVal, FalseVal);
InstructionList.push_back(I);
@@ -2364,18 +2400,18 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
getValueTypePair(Record, OpNum, NextValueNo, Cond))
- return Error("Invalid SELECT record");
+ return Error(InvalidRecord);
// select condition can be either i1 or [N x i1]
if (VectorType* vector_type =
dyn_cast<VectorType>(Cond->getType())) {
// expect <n x i1>
if (vector_type->getElementType() != Type::getInt1Ty(Context))
- return Error("Invalid SELECT condition type");
+ return Error(InvalidTypeForValue);
} else {
// expect i1
if (Cond->getType() != Type::getInt1Ty(Context))
- return Error("Invalid SELECT condition type");
+ return Error(InvalidTypeForValue);
}
I = SelectInst::Create(Cond, TrueVal, FalseVal);
@@ -2388,7 +2424,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Vec, *Idx;
if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
- return Error("Invalid EXTRACTELT record");
+ return Error(InvalidRecord);
I = ExtractElementInst::Create(Vec, Idx);
InstructionList.push_back(I);
break;
@@ -2401,7 +2437,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
popValue(Record, OpNum, NextValueNo,
cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
- return Error("Invalid INSERTELT record");
+ return Error(InvalidRecord);
I = InsertElementInst::Create(Vec, Elt, Idx);
InstructionList.push_back(I);
break;
@@ -2412,10 +2448,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Vec1, *Vec2, *Mask;
if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
- return Error("Invalid SHUFFLEVEC record");
+ return Error(InvalidRecord);
if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
- return Error("Invalid SHUFFLEVEC record");
+ return Error(InvalidRecord);
I = new ShuffleVectorInst(Vec1, Vec2, Mask);
InstructionList.push_back(I);
break;
@@ -2433,7 +2469,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
OpNum+1 != Record.size())
- return Error("Invalid CMP record");
+ return Error(InvalidRecord);
if (LHS->getType()->isFPOrFPVectorTy())
I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
@@ -2455,9 +2491,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Op = NULL;
if (getValueTypePair(Record, OpNum, NextValueNo, Op))
- return Error("Invalid RET record");
+ return Error(InvalidRecord);
if (OpNum != Record.size())
- return Error("Invalid RET record");
+ return Error(InvalidRecord);
I = ReturnInst::Create(Context, Op);
InstructionList.push_back(I);
@@ -2465,10 +2501,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
}
case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
if (Record.size() != 1 && Record.size() != 3)
- return Error("Invalid BR record");
+ return Error(InvalidRecord);
BasicBlock *TrueDest = getBasicBlock(Record[0]);
if (TrueDest == 0)
- return Error("Invalid BR record");
+ return Error(InvalidRecord);
if (Record.size() == 1) {
I = BranchInst::Create(TrueDest);
@@ -2479,7 +2515,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Cond = getValue(Record, 2, NextValueNo,
Type::getInt1Ty(Context));
if (FalseDest == 0 || Cond == 0)
- return Error("Invalid BR record");
+ return Error(InvalidRecord);
I = BranchInst::Create(TrueDest, FalseDest, Cond);
InstructionList.push_back(I);
}
@@ -2488,7 +2524,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...]
// Check magic
if ((Record[0] >> 16) == SWITCH_INST_MAGIC) {
- // New SwitchInst format with case ranges.
+ // "New" SwitchInst format with case ranges. The changes to write this
+ // format were reverted but we still recognize bitcode that uses it.
+ // Hopefully someday we will have support for case ranges and can use
+ // this format again.
Type *OpTy = getTypeByID(Record[1]);
unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth();
@@ -2496,7 +2535,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
BasicBlock *Default = getBasicBlock(Record[3]);
if (OpTy == 0 || Cond == 0 || Default == 0)
- return Error("Invalid SWITCH record");
+ return Error(InvalidRecord);
unsigned NumCases = Record[4];
@@ -2505,7 +2544,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned CurIdx = 5;
for (unsigned i = 0; i != NumCases; ++i) {
- IntegersSubsetToBB CaseBuilder;
+ SmallVector<ConstantInt*, 1> CaseVals;
unsigned NumItems = Record[CurIdx++];
for (unsigned ci = 0; ci != NumItems; ++ci) {
bool isSingleNumber = Record[CurIdx++];
@@ -2525,20 +2564,22 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
APInt High =
ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
ValueBitWidth);
-
- CaseBuilder.add(IntItem::fromType(OpTy, Low),
- IntItem::fromType(OpTy, High));
CurIdx += ActiveWords;
+
+ // FIXME: It is not clear whether values in the range should be
+ // compared as signed or unsigned values. The partially
+ // implemented changes that used this format in the past used
+ // unsigned comparisons.
+ for ( ; Low.ule(High); ++Low)
+ CaseVals.push_back(ConstantInt::get(Context, Low));
} else
- CaseBuilder.add(IntItem::fromType(OpTy, Low));
+ CaseVals.push_back(ConstantInt::get(Context, Low));
}
BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]);
- IntegersSubset Case = CaseBuilder.getCase();
- SI->addCase(Case, DestBB);
+ for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(),
+ cve = CaseVals.end(); cvi != cve; ++cvi)
+ SI->addCase(*cvi, DestBB);
}
- uint16_t Hash = SI->hash();
- if (Hash != (Record[0] & 0xFFFF))
- return Error("Invalid SWITCH record");
I = SI;
break;
}
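The loop in this hunk flattens each inclusive [Low, High] range from the retired case-ranges encoding into one switch case per value, comparing unsigned as the FIXME notes. A small standalone sketch of that expansion, using a hypothetical helper that is not part of the patch and assuming the usual LLVM APInt API:

#include "llvm/ADT/APInt.h"
#include <vector>
using namespace llvm;

// Expand an inclusive [Low, High] range into individual case values,
// e.g. [2, 5] produces 2, 3, 4, 5. Comparison is unsigned (ule), matching
// the reader code above.
static void expandCaseRange(APInt Low, const APInt &High,
                            std::vector<APInt> &CaseVals) {
  for (; Low.ule(High); ++Low)
    CaseVals.push_back(Low);
}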
@@ -2546,12 +2587,12 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
// Old SwitchInst format without case ranges.
if (Record.size() < 3 || (Record.size() & 1) == 0)
- return Error("Invalid SWITCH record");
+ return Error(InvalidRecord);
Type *OpTy = getTypeByID(Record[0]);
Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
BasicBlock *Default = getBasicBlock(Record[2]);
if (OpTy == 0 || Cond == 0 || Default == 0)
- return Error("Invalid SWITCH record");
+ return Error(InvalidRecord);
unsigned NumCases = (Record.size()-3)/2;
SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
InstructionList.push_back(SI);
@@ -2561,7 +2602,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
if (CaseVal == 0 || DestBB == 0) {
delete SI;
- return Error("Invalid SWITCH record!");
+ return Error(InvalidRecord);
}
SI->addCase(CaseVal, DestBB);
}
@@ -2570,11 +2611,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
}
case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
if (Record.size() < 2)
- return Error("Invalid INDIRECTBR record");
+ return Error(InvalidRecord);
Type *OpTy = getTypeByID(Record[0]);
Value *Address = getValue(Record, 1, NextValueNo, OpTy);
if (OpTy == 0 || Address == 0)
- return Error("Invalid INDIRECTBR record");
+ return Error(InvalidRecord);
unsigned NumDests = Record.size()-2;
IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
InstructionList.push_back(IBI);
@@ -2583,7 +2624,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
IBI->addDestination(DestBB);
} else {
delete IBI;
- return Error("Invalid INDIRECTBR record!");
+ return Error(InvalidRecord);
}
}
I = IBI;
@@ -2592,7 +2633,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_INVOKE: {
// INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
- if (Record.size() < 4) return Error("Invalid INVOKE record");
+ if (Record.size() < 4)
+ return Error(InvalidRecord);
AttributeSet PAL = getAttributes(Record[0]);
unsigned CCInfo = Record[1];
BasicBlock *NormalBB = getBasicBlock(Record[2]);
@@ -2601,7 +2643,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 4;
Value *Callee;
if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
- return Error("Invalid INVOKE record");
+ return Error(InvalidRecord);
PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
FunctionType *FTy = !CalleeTy ? 0 :
@@ -2610,24 +2652,25 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
// Check that the right number of fixed parameters are here.
if (FTy == 0 || NormalBB == 0 || UnwindBB == 0 ||
Record.size() < OpNum+FTy->getNumParams())
- return Error("Invalid INVOKE record");
+ return Error(InvalidRecord);
SmallVector<Value*, 16> Ops;
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
Ops.push_back(getValue(Record, OpNum, NextValueNo,
FTy->getParamType(i)));
- if (Ops.back() == 0) return Error("Invalid INVOKE record");
+ if (Ops.back() == 0)
+ return Error(InvalidRecord);
}
if (!FTy->isVarArg()) {
if (Record.size() != OpNum)
- return Error("Invalid INVOKE record");
+ return Error(InvalidRecord);
} else {
// Read type/value pairs for varargs params.
while (OpNum != Record.size()) {
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op))
- return Error("Invalid INVOKE record");
+ return Error(InvalidRecord);
Ops.push_back(Op);
}
}
@@ -2643,7 +2686,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned Idx = 0;
Value *Val = 0;
if (getValueTypePair(Record, Idx, NextValueNo, Val))
- return Error("Invalid RESUME record");
+ return Error(InvalidRecord);
I = ResumeInst::Create(Val);
InstructionList.push_back(I);
break;
@@ -2654,9 +2697,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
break;
case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
if (Record.size() < 1 || ((Record.size()-1)&1))
- return Error("Invalid PHI record");
+ return Error(InvalidRecord);
Type *Ty = getTypeByID(Record[0]);
- if (!Ty) return Error("Invalid PHI record");
+ if (!Ty)
+ return Error(InvalidRecord);
PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
InstructionList.push_back(PN);
@@ -2671,7 +2715,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
else
V = getValue(Record, 1+i, NextValueNo, Ty);
BasicBlock *BB = getBasicBlock(Record[2+i]);
- if (!V || !BB) return Error("Invalid PHI record");
+ if (!V || !BB)
+ return Error(InvalidRecord);
PN->addIncoming(V, BB);
}
I = PN;
@@ -2682,12 +2727,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
// LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
unsigned Idx = 0;
if (Record.size() < 4)
- return Error("Invalid LANDINGPAD record");
+ return Error(InvalidRecord);
Type *Ty = getTypeByID(Record[Idx++]);
- if (!Ty) return Error("Invalid LANDINGPAD record");
+ if (!Ty)
+ return Error(InvalidRecord);
Value *PersFn = 0;
if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
- return Error("Invalid LANDINGPAD record");
+ return Error(InvalidRecord);
bool IsCleanup = !!Record[Idx++];
unsigned NumClauses = Record[Idx++];
@@ -2700,7 +2746,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
delete LP;
- return Error("Invalid LANDINGPAD record");
+ return Error(InvalidRecord);
}
assert((CT != LandingPadInst::Catch ||
@@ -2719,13 +2765,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
if (Record.size() != 4)
- return Error("Invalid ALLOCA record");
+ return Error(InvalidRecord);
PointerType *Ty =
dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
Type *OpTy = getTypeByID(Record[1]);
Value *Size = getFnValueByID(Record[2], OpTy);
unsigned Align = Record[3];
- if (!Ty || !Size) return Error("Invalid ALLOCA record");
+ if (!Ty || !Size)
+ return Error(InvalidRecord);
I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
InstructionList.push_back(I);
break;
@@ -2735,7 +2782,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
OpNum+2 != Record.size())
- return Error("Invalid LOAD record");
+ return Error(InvalidRecord);
I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
InstructionList.push_back(I);
@@ -2747,15 +2794,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
OpNum+4 != Record.size())
- return Error("Invalid LOADATOMIC record");
+ return Error(InvalidRecord);
AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
if (Ordering == NotAtomic || Ordering == Release ||
Ordering == AcquireRelease)
- return Error("Invalid LOADATOMIC record");
+ return Error(InvalidRecord);
if (Ordering != NotAtomic && Record[OpNum] == 0)
- return Error("Invalid LOADATOMIC record");
+ return Error(InvalidRecord);
SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
@@ -2770,7 +2817,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+2 != Record.size())
- return Error("Invalid STORE record");
+ return Error(InvalidRecord);
I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
InstructionList.push_back(I);
@@ -2784,15 +2831,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+4 != Record.size())
- return Error("Invalid STOREATOMIC record");
+ return Error(InvalidRecord);
AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
if (Ordering == NotAtomic || Ordering == Acquire ||
Ordering == AcquireRelease)
- return Error("Invalid STOREATOMIC record");
+ return Error(InvalidRecord);
SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
if (Ordering != NotAtomic && Record[OpNum] == 0)
- return Error("Invalid STOREATOMIC record");
+ return Error(InvalidRecord);
I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
Ordering, SynchScope);
@@ -2809,10 +2856,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), New) ||
OpNum+3 != Record.size())
- return Error("Invalid CMPXCHG record");
+ return Error(InvalidRecord);
AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+1]);
if (Ordering == NotAtomic || Ordering == Unordered)
- return Error("Invalid CMPXCHG record");
+ return Error(InvalidRecord);
SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
I = new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, SynchScope);
cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
@@ -2827,14 +2874,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+4 != Record.size())
- return Error("Invalid ATOMICRMW record");
+ return Error(InvalidRecord);
AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
if (Operation < AtomicRMWInst::FIRST_BINOP ||
Operation > AtomicRMWInst::LAST_BINOP)
- return Error("Invalid ATOMICRMW record");
+ return Error(InvalidRecord);
AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
if (Ordering == NotAtomic || Ordering == Unordered)
- return Error("Invalid ATOMICRMW record");
+ return Error(InvalidRecord);
SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
@@ -2843,11 +2890,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
}
case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
if (2 != Record.size())
- return Error("Invalid FENCE record");
+ return Error(InvalidRecord);
AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
if (Ordering == NotAtomic || Ordering == Unordered ||
Ordering == Monotonic)
- return Error("Invalid FENCE record");
+ return Error(InvalidRecord);
SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
I = new FenceInst(Context, Ordering, SynchScope);
InstructionList.push_back(I);
@@ -2856,7 +2903,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_CALL: {
// CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
if (Record.size() < 3)
- return Error("Invalid CALL record");
+ return Error(InvalidRecord);
AttributeSet PAL = getAttributes(Record[0]);
unsigned CCInfo = Record[1];
@@ -2864,13 +2911,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 2;
Value *Callee;
if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
- return Error("Invalid CALL record");
+ return Error(InvalidRecord);
PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
FunctionType *FTy = 0;
if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
- return Error("Invalid CALL record");
+ return Error(InvalidRecord);
SmallVector<Value*, 16> Args;
// Read the fixed params.
@@ -2880,18 +2927,19 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
else
Args.push_back(getValue(Record, OpNum, NextValueNo,
FTy->getParamType(i)));
- if (Args.back() == 0) return Error("Invalid CALL record");
+ if (Args.back() == 0)
+ return Error(InvalidRecord);
}
// Read type/value pairs for varargs params.
if (!FTy->isVarArg()) {
if (OpNum != Record.size())
- return Error("Invalid CALL record");
+ return Error(InvalidRecord);
} else {
while (OpNum != Record.size()) {
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op))
- return Error("Invalid CALL record");
+ return Error(InvalidRecord);
Args.push_back(Op);
}
}
@@ -2906,12 +2954,12 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
}
case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
if (Record.size() < 3)
- return Error("Invalid VAARG record");
+ return Error(InvalidRecord);
Type *OpTy = getTypeByID(Record[0]);
Value *Op = getValue(Record, 1, NextValueNo, OpTy);
Type *ResTy = getTypeByID(Record[2]);
if (!OpTy || !Op || !ResTy)
- return Error("Invalid VAARG record");
+ return Error(InvalidRecord);
I = new VAArgInst(Op, ResTy);
InstructionList.push_back(I);
break;
@@ -2922,7 +2970,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
// this file.
if (CurBB == 0) {
delete I;
- return Error("Invalid instruction with no BB");
+ return Error(InvalidInstructionWithNoBB);
}
CurBB->getInstList().push_back(I);
@@ -2949,7 +2997,7 @@ OutOfRecordLoop:
delete A;
}
}
- return Error("Never resolved value found in function!");
+ return Error(NeverResolvedValueFoundInFunction);
}
}
@@ -2965,7 +3013,7 @@ OutOfRecordLoop:
for (unsigned i = 0, e = RefList.size(); i != e; ++i) {
unsigned BlockIdx = RefList[i].first;
if (BlockIdx >= FunctionBBs.size())
- return Error("Invalid blockaddress block #");
+ return Error(InvalidID);
GlobalVariable *FwdRef = RefList[i].second;
FwdRef->replaceAllUsesWith(BlockAddress::get(F, FunctionBBs[BlockIdx]));
@@ -2979,20 +3027,21 @@ OutOfRecordLoop:
ValueList.shrinkTo(ModuleValueListSize);
MDValueList.shrinkTo(ModuleMDValueListSize);
std::vector<BasicBlock*>().swap(FunctionBBs);
- return false;
+ return error_code::success();
}
-/// FindFunctionInStream - Find the function body in the bitcode stream
-bool BitcodeReader::FindFunctionInStream(Function *F,
+/// Find the function body in the bitcode stream
+error_code BitcodeReader::FindFunctionInStream(Function *F,
DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator) {
while (DeferredFunctionInfoIterator->second == 0) {
if (Stream.AtEndOfStream())
- return Error("Could not find Function in stream");
+ return Error(CouldNotFindFunctionInStream);
// ParseModule will parse the next body in the stream and set its
// position in the DeferredFunctionInfo map.
- if (ParseModule(true)) return true;
+ if (error_code EC = ParseModule(true))
+ return EC;
}
- return false;
+ return error_code::success();
}
//===----------------------------------------------------------------------===//
@@ -3008,25 +3057,25 @@ bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
return false;
}
-bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
+error_code BitcodeReader::Materialize(GlobalValue *GV) {
Function *F = dyn_cast<Function>(GV);
// If it's not a function or is already material, ignore the request.
- if (!F || !F->isMaterializable()) return false;
+ if (!F || !F->isMaterializable())
+ return error_code::success();
DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
// If its position is recorded as 0, its body is somewhere in the stream
// but we haven't seen it yet.
- if (DFII->second == 0)
- if (LazyStreamer && FindFunctionInStream(F, DFII)) return true;
+ if (DFII->second == 0 && LazyStreamer)
+ if (error_code EC = FindFunctionInStream(F, DFII))
+ return EC;
// Move the bit stream to the saved position of the deferred function body.
Stream.JumpToBit(DFII->second);
- if (ParseFunctionBody(F)) {
- if (ErrInfo) *ErrInfo = ErrorString;
- return true;
- }
+ if (error_code EC = ParseFunctionBody(F))
+ return EC;
// Upgrade any old intrinsic calls in the function.
for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(),
@@ -3040,7 +3089,7 @@ bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
}
}
- return false;
+ return error_code::success();
}
bool BitcodeReader::isDematerializable(const GlobalValue *GV) const {
@@ -3063,17 +3112,18 @@ void BitcodeReader::Dematerialize(GlobalValue *GV) {
}
-bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
+error_code BitcodeReader::MaterializeModule(Module *M) {
assert(M == TheModule &&
"Can only Materialize the Module this BitcodeReader is attached to.");
// Iterate over the module, deserializing any functions that are still on
// disk.
for (Module::iterator F = TheModule->begin(), E = TheModule->end();
- F != E; ++F)
- if (F->isMaterializable() &&
- Materialize(F, ErrInfo))
- return true;
-
+ F != E; ++F) {
+ if (F->isMaterializable()) {
+ if (error_code EC = Materialize(F))
+ return EC;
+ }
+ }
// At this point, if there are any function bodies, the current bit is
// pointing to the END_BLOCK record after them. Now make sure the rest
// of the bits in the module have been read.
@@ -3099,38 +3149,43 @@ bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
}
std::vector<std::pair<Function*, Function*> >().swap(UpgradedIntrinsics);
- return false;
+ for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+ UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
+ UpgradeDebugInfo(*M);
+ return error_code::success();
}
-bool BitcodeReader::InitStream() {
- if (LazyStreamer) return InitLazyStream();
+error_code BitcodeReader::InitStream() {
+ if (LazyStreamer)
+ return InitLazyStream();
return InitStreamFromBuffer();
}
-bool BitcodeReader::InitStreamFromBuffer() {
+error_code BitcodeReader::InitStreamFromBuffer() {
const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
if (Buffer->getBufferSize() & 3) {
if (!isRawBitcode(BufPtr, BufEnd) && !isBitcodeWrapper(BufPtr, BufEnd))
- return Error("Invalid bitcode signature");
+ return Error(InvalidBitcodeSignature);
else
- return Error("Bitcode stream should be a multiple of 4 bytes in length");
+ return Error(BitcodeStreamInvalidSize);
}
// If we have a wrapper header, parse it and ignore the non-bc file contents.
// The magic number is 0x0B17C0DE stored in little endian.
if (isBitcodeWrapper(BufPtr, BufEnd))
if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true))
- return Error("Invalid bitcode wrapper header");
+ return Error(InvalidBitcodeWrapperHeader);
StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
Stream.init(*StreamFile);
- return false;
+ return error_code::success();
}
-bool BitcodeReader::InitLazyStream() {
+error_code BitcodeReader::InitLazyStream() {
// Check and strip off the bitcode wrapper; BitstreamReader expects never to
// see it.
StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
@@ -3139,10 +3194,10 @@ bool BitcodeReader::InitLazyStream() {
unsigned char buf[16];
if (Bytes->readBytes(0, 16, buf) == -1)
- return Error("Bitcode stream must be at least 16 bytes in length");
+ return Error(BitcodeStreamInvalidSize);
if (!isBitcode(buf, buf + 16))
- return Error("Invalid bitcode signature");
+ return Error(InvalidBitcodeSignature);
if (isBitcodeWrapper(buf, buf + 4)) {
const unsigned char *bitcodeStart = buf;
@@ -3151,7 +3206,64 @@ bool BitcodeReader::InitLazyStream() {
Bytes->dropLeadingBytes(bitcodeStart - buf);
Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart);
}
- return false;
+ return error_code::success();
+}
+
+namespace {
+class BitcodeErrorCategoryType : public _do_message {
+ const char *name() const LLVM_OVERRIDE {
+ return "llvm.bitcode";
+ }
+ std::string message(int IE) const LLVM_OVERRIDE {
+ BitcodeReader::ErrorType E = static_cast<BitcodeReader::ErrorType>(IE);
+ switch (E) {
+ case BitcodeReader::BitcodeStreamInvalidSize:
+ return "Bitcode stream length should be >= 16 bytes and a multiple of 4";
+ case BitcodeReader::ConflictingMETADATA_KINDRecords:
+ return "Conflicting METADATA_KIND records";
+ case BitcodeReader::CouldNotFindFunctionInStream:
+ return "Could not find function in stream";
+ case BitcodeReader::ExpectedConstant:
+ return "Expected a constant";
+ case BitcodeReader::InsufficientFunctionProtos:
+ return "Insufficient function protos";
+ case BitcodeReader::InvalidBitcodeSignature:
+ return "Invalid bitcode signature";
+ case BitcodeReader::InvalidBitcodeWrapperHeader:
+ return "Invalid bitcode wrapper header";
+ case BitcodeReader::InvalidConstantReference:
+      return "Invalid constant reference";
+ case BitcodeReader::InvalidID:
+ return "Invalid ID";
+ case BitcodeReader::InvalidInstructionWithNoBB:
+ return "Invalid instruction with no BB";
+ case BitcodeReader::InvalidRecord:
+ return "Invalid record";
+ case BitcodeReader::InvalidTypeForValue:
+ return "Invalid type for value";
+ case BitcodeReader::InvalidTYPETable:
+ return "Invalid TYPE table";
+ case BitcodeReader::InvalidType:
+ return "Invalid type";
+ case BitcodeReader::MalformedBlock:
+ return "Malformed block";
+ case BitcodeReader::MalformedGlobalInitializerSet:
+ return "Malformed global initializer set";
+ case BitcodeReader::InvalidMultipleBlocks:
+ return "Invalid multiple blocks";
+ case BitcodeReader::NeverResolvedValueFoundInFunction:
+ return "Never resolved value found in function";
+ case BitcodeReader::InvalidValue:
+ return "Invalid value";
+ }
+ llvm_unreachable("Unknown error type!");
+ }
+};
+}
+
+const error_category &BitcodeReader::BitcodeErrorCategory() {
+ static BitcodeErrorCategoryType O;
+ return O;
}
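The category added here is what turns the raw enum values into human-readable messages when EC.message() is called, as the getLazyBitcodeModule/getStreamedBitcodeModule callers further down do. A minimal standalone sketch of the same pattern using C++11 <system_error>; the patch itself derives from LLVM's _do_message adaptor in llvm/Support/system_error.h rather than std::error_category:

#include <string>
#include <system_error>

namespace {
class DemoErrorCategory : public std::error_category {
  const char *name() const noexcept override { return "demo.bitcode"; }
  std::string message(int IE) const override {
    switch (IE) {
    case 1: return "Invalid record";
    case 2: return "Malformed block";
    }
    return "Unknown error";
  }
};
}

// One static instance per category; an error_code stores only the integer
// value plus a pointer to this category object.
static const std::error_category &demoCategory() {
  static DemoErrorCategory Category;
  return Category;
}

static std::error_code makeDemoError(int E) {
  return std::error_code(E, demoCategory());
}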
//===----------------------------------------------------------------------===//
@@ -3166,9 +3278,9 @@ Module *llvm::getLazyBitcodeModule(MemoryBuffer *Buffer,
Module *M = new Module(Buffer->getBufferIdentifier(), Context);
BitcodeReader *R = new BitcodeReader(Buffer, Context);
M->setMaterializer(R);
- if (R->ParseBitcodeInto(M)) {
+ if (error_code EC = R->ParseBitcodeInto(M)) {
if (ErrMsg)
- *ErrMsg = R->getErrorString();
+ *ErrMsg = EC.message();
delete M; // Also deletes R.
return 0;
@@ -3189,9 +3301,9 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name,
Module *M = new Module(name, Context);
BitcodeReader *R = new BitcodeReader(streamer, Context);
M->setMaterializer(R);
- if (R->ParseBitcodeInto(M)) {
+ if (error_code EC = R->ParseBitcodeInto(M)) {
if (ErrMsg)
- *ErrMsg = R->getErrorString();
+ *ErrMsg = EC.message();
delete M; // Also deletes R.
return 0;
}
@@ -3230,9 +3342,9 @@ std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer,
R->setBufferOwned(false);
std::string Triple("");
- if (R->ParseTriple(Triple))
+ if (error_code EC = R->ParseTriple(Triple))
if (ErrMsg)
- *ErrMsg = R->getErrorString();
+ *ErrMsg = EC.message();
delete R;
return Triple;
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index b095447..c5d345b 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -21,6 +21,7 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/system_error.h"
#include "llvm/Support/ValueHandle.h"
#include <vector>
@@ -132,8 +133,6 @@ class BitcodeReader : public GVMaterializer {
uint64_t NextUnreadBit;
bool SeenValueSymbolTable;
- const char *ErrorString;
-
std::vector<Type*> TypeList;
BitcodeReaderValueList ValueList;
BitcodeReaderMDValueList MDValueList;
@@ -142,6 +141,9 @@ class BitcodeReader : public GVMaterializer {
std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
+ std::vector<std::pair<Function*, unsigned> > FunctionPrefixes;
+
+ SmallVector<Instruction*, 64> InstsWithTBAATag;
/// MAttributes - The set of attributes by index. Index zero in the
/// file is for null, and is thus not represented here. As such all indices
@@ -191,17 +193,46 @@ class BitcodeReader : public GVMaterializer {
/// not need this flag.
bool UseRelativeIDs;
+ static const error_category &BitcodeErrorCategory();
+
public:
+ enum ErrorType {
+ BitcodeStreamInvalidSize,
+ ConflictingMETADATA_KINDRecords,
+ CouldNotFindFunctionInStream,
+ ExpectedConstant,
+ InsufficientFunctionProtos,
+ InvalidBitcodeSignature,
+ InvalidBitcodeWrapperHeader,
+ InvalidConstantReference,
+ InvalidID, // A read identifier is not found in the table it should be in.
+ InvalidInstructionWithNoBB,
+ InvalidRecord, // A read record doesn't have the expected size or structure
+ InvalidTypeForValue, // Type read OK, but is invalid for its use
+ InvalidTYPETable,
+ InvalidType, // We were unable to read a type
+ MalformedBlock, // We are unable to advance in the stream.
+ MalformedGlobalInitializerSet,
+ InvalidMultipleBlocks, // We found multiple blocks of a kind that should
+ // have only one
+ NeverResolvedValueFoundInFunction,
+ InvalidValue // Invalid version, inst number, attr number, etc
+ };
+
+ error_code Error(ErrorType E) {
+ return error_code(E, BitcodeErrorCategory());
+ }
+
explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
: Context(C), TheModule(0), Buffer(buffer), BufferOwned(false),
LazyStreamer(0), NextUnreadBit(0), SeenValueSymbolTable(false),
- ErrorString(0), ValueList(C), MDValueList(C),
+ ValueList(C), MDValueList(C),
SeenFirstFunctionBody(false), UseRelativeIDs(false) {
}
explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
: Context(C), TheModule(0), Buffer(0), BufferOwned(false),
LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false),
- ErrorString(0), ValueList(C), MDValueList(C),
+ ValueList(C), MDValueList(C),
SeenFirstFunctionBody(false), UseRelativeIDs(false) {
}
~BitcodeReader() {
@@ -218,23 +249,17 @@ public:
virtual bool isMaterializable(const GlobalValue *GV) const;
virtual bool isDematerializable(const GlobalValue *GV) const;
- virtual bool Materialize(GlobalValue *GV, std::string *ErrInfo = 0);
- virtual bool MaterializeModule(Module *M, std::string *ErrInfo = 0);
+ virtual error_code Materialize(GlobalValue *GV);
+ virtual error_code MaterializeModule(Module *M);
virtual void Dematerialize(GlobalValue *GV);
- bool Error(const char *Str) {
- ErrorString = Str;
- return true;
- }
- const char *getErrorString() const { return ErrorString; }
-
/// @brief Main interface to parsing a bitcode buffer.
  /// @returns an error_code describing the failure, if any.
- bool ParseBitcodeInto(Module *M);
+ error_code ParseBitcodeInto(Module *M);
/// @brief Cheap mechanism to just extract module triple
  /// @returns an error_code describing the failure, if any.
- bool ParseTriple(std::string &Triple);
+ error_code ParseTriple(std::string &Triple);
static uint64_t decodeSignRotatedValue(uint64_t V);
@@ -321,27 +346,27 @@ private:
return getFnValueByID(ValNo, Ty);
}
- bool ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
- bool ParseModule(bool Resume);
- bool ParseAttributeBlock();
- bool ParseAttributeGroupBlock();
- bool ParseTypeTable();
- bool ParseTypeTableBody();
-
- bool ParseValueSymbolTable();
- bool ParseConstants();
- bool RememberAndSkipFunctionBody();
- bool ParseFunctionBody(Function *F);
- bool GlobalCleanup();
- bool ResolveGlobalAndAliasInits();
- bool ParseMetadata();
- bool ParseMetadataAttachment();
- bool ParseModuleTriple(std::string &Triple);
- bool ParseUseLists();
- bool InitStream();
- bool InitStreamFromBuffer();
- bool InitLazyStream();
- bool FindFunctionInStream(Function *F,
+ error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
+ error_code ParseModule(bool Resume);
+ error_code ParseAttributeBlock();
+ error_code ParseAttributeGroupBlock();
+ error_code ParseTypeTable();
+ error_code ParseTypeTableBody();
+
+ error_code ParseValueSymbolTable();
+ error_code ParseConstants();
+ error_code RememberAndSkipFunctionBody();
+ error_code ParseFunctionBody(Function *F);
+ error_code GlobalCleanup();
+ error_code ResolveGlobalAndAliasInits();
+ error_code ParseMetadata();
+ error_code ParseMetadataAttachment();
+ error_code ParseModuleTriple(std::string &Triple);
+ error_code ParseUseLists();
+ error_code InitStream();
+ error_code InitStreamFromBuffer();
+ error_code InitLazyStream();
+ error_code FindFunctionInStream(Function *F,
DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator);
};
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 311c233..4cfc6bd 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -60,10 +60,7 @@ enum {
FUNCTION_INST_CAST_ABBREV,
FUNCTION_INST_RET_VOID_ABBREV,
FUNCTION_INST_RET_VAL_ABBREV,
- FUNCTION_INST_UNREACHABLE_ABBREV,
-
- // SwitchInst Magic
- SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
+ FUNCTION_INST_UNREACHABLE_ABBREV
};
static unsigned GetEncodedCastOpcode(unsigned Opcode) {
@@ -81,6 +78,7 @@ static unsigned GetEncodedCastOpcode(unsigned Opcode) {
case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
case Instruction::BitCast : return bitc::CAST_BITCAST;
+ case Instruction::AddrSpaceCast: return bitc::CAST_ADDRSPACECAST;
}
}
@@ -205,6 +203,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_NO_UNWIND;
case Attribute::OptimizeForSize:
return bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE;
+ case Attribute::OptimizeNone:
+ return bitc::ATTR_KIND_OPTIMIZE_NONE;
case Attribute::ReadNone:
return bitc::ATTR_KIND_READ_NONE;
case Attribute::ReadOnly:
@@ -490,7 +490,6 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) {
case GlobalValue::AvailableExternallyLinkage: return 12;
case GlobalValue::LinkerPrivateLinkage: return 13;
case GlobalValue::LinkerPrivateWeakLinkage: return 14;
- case GlobalValue::LinkOnceODRAutoHideLinkage: return 15;
}
llvm_unreachable("Invalid linkage");
}
@@ -607,7 +606,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
// GLOBALVAR: [type, isconst, initid,
// linkage, alignment, section, visibility, threadlocal,
- // unnamed_addr]
+ // unnamed_addr, externally_initialized]
Vals.push_back(VE.getTypeID(GV->getType()));
Vals.push_back(GV->isConstant());
Vals.push_back(GV->isDeclaration() ? 0 :
@@ -633,7 +632,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
// Emit the function proto information.
for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
// FUNCTION: [type, callingconv, isproto, linkage, paramattrs, alignment,
- // section, visibility, gc, unnamed_addr]
+ // section, visibility, gc, unnamed_addr, prefix]
Vals.push_back(VE.getTypeID(F->getType()));
Vals.push_back(F->getCallingConv());
Vals.push_back(F->isDeclaration());
@@ -644,6 +643,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
Vals.push_back(getEncodedVisibility(F));
Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0);
Vals.push_back(F->hasUnnamedAddr());
+ Vals.push_back(F->hasPrefixData() ? (VE.getValueID(F->getPrefixData()) + 1)
+ : 0);
unsigned AbbrevToUse = 0;
Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
@@ -863,34 +864,6 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
Vals.push_back((-V << 1) | 1);
}
-static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
- unsigned &Code, unsigned &AbbrevToUse, const APInt &Val,
- bool EmitSizeForWideNumbers = false
- ) {
- if (Val.getBitWidth() <= 64) {
- uint64_t V = Val.getSExtValue();
- emitSignedInt64(Vals, V);
- Code = bitc::CST_CODE_INTEGER;
- AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
- } else {
- // Wide integers, > 64 bits in size.
- // We have an arbitrary precision integer value to write whose
- // bit width is > 64. However, in canonical unsigned integer
- // format it is likely that the high bits are going to be zero.
- // So, we only write the number of active words.
- unsigned NWords = Val.getActiveWords();
-
- if (EmitSizeForWideNumbers)
- Vals.push_back(NWords);
-
- const uint64_t *RawWords = Val.getRawData();
- for (unsigned i = 0; i != NWords; ++i) {
- emitSignedInt64(Vals, RawWords[i]);
- }
- Code = bitc::CST_CODE_WIDE_INTEGER;
- }
-}
-
static void WriteConstants(unsigned FirstVal, unsigned LastVal,
const ValueEnumerator &VE,
BitstreamWriter &Stream, bool isGlobal) {
@@ -974,7 +947,23 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
} else if (isa<UndefValue>(C)) {
Code = bitc::CST_CODE_UNDEF;
} else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
- EmitAPInt(Record, Code, AbbrevToUse, IV->getValue());
+ if (IV->getBitWidth() <= 64) {
+ uint64_t V = IV->getSExtValue();
+ emitSignedInt64(Record, V);
+ Code = bitc::CST_CODE_INTEGER;
+ AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+ } else { // Wide integers, > 64 bits in size.
+ // We have an arbitrary precision integer value to write whose
+ // bit width is > 64. However, in canonical unsigned integer
+ // format it is likely that the high bits are going to be zero.
+ // So, we only write the number of active words.
+ unsigned NWords = IV->getValue().getActiveWords();
+ const uint64_t *RawWords = IV->getValue().getRawData();
+ for (unsigned i = 0; i != NWords; ++i) {
+ emitSignedInt64(Record, RawWords[i]);
+ }
+ Code = bitc::CST_CODE_WIDE_INTEGER;
+ }
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
Code = bitc::CST_CODE_FLOAT;
Type *Ty = CFP->getType();
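For integers wider than 64 bits, the writer above emits only the APInt's active words (all-zero high words are dropped), each run through emitSignedInt64's sign-rotated encoding. A standalone sketch under those assumptions; the demo helpers mirror, but are not, the functions in this file:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>
using namespace llvm;

// Sign-rotate a 64-bit value: a non-negative V becomes 2*V, a negative V
// becomes 2*|V|+1, so small magnitudes stay small when VBR-encoded.
static void demoEmitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
  if ((int64_t)V >= 0)
    Vals.push_back(V << 1);
  else
    Vals.push_back((-V << 1) | 1);
}

// Emit only the active 64-bit words of a wide APInt.
static void demoEmitWideAPInt(const APInt &Val,
                              SmallVectorImpl<uint64_t> &Vals) {
  unsigned NWords = Val.getActiveWords();
  const uint64_t *Raw = Val.getRawData();
  for (unsigned i = 0; i != NWords; ++i)
    demoEmitSignedInt64(Vals, Raw[i]);
}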
@@ -1182,13 +1171,6 @@ static void pushValue(const Value *V, unsigned InstID,
Vals.push_back(InstID - ValID);
}
-static void pushValue64(const Value *V, unsigned InstID,
- SmallVectorImpl<uint64_t> &Vals,
- ValueEnumerator &VE) {
- uint64_t ValID = VE.getValueID(V);
- Vals.push_back(InstID - ValID);
-}
-
static void pushValueSigned(const Value *V, unsigned InstID,
SmallVectorImpl<uint64_t> &Vals,
ValueEnumerator &VE) {
@@ -1312,63 +1294,16 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
break;
case Instruction::Switch:
{
- // Redefine Vals, since here we need to use 64 bit values
- // explicitly to store large APInt numbers.
- SmallVector<uint64_t, 128> Vals64;
-
Code = bitc::FUNC_CODE_INST_SWITCH;
const SwitchInst &SI = cast<SwitchInst>(I);
-
- uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16);
- Vals64.push_back(SwitchRecordHeader);
-
- Vals64.push_back(VE.getTypeID(SI.getCondition()->getType()));
- pushValue64(SI.getCondition(), InstID, Vals64, VE);
- Vals64.push_back(VE.getValueID(SI.getDefaultDest()));
- Vals64.push_back(SI.getNumCases());
+ Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
+ pushValue(SI.getCondition(), InstID, Vals, VE);
+ Vals.push_back(VE.getValueID(SI.getDefaultDest()));
for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
i != e; ++i) {
- const IntegersSubset& CaseRanges = i.getCaseValueEx();
- unsigned Code, Abbrev; // will unused.
-
- if (CaseRanges.isSingleNumber()) {
- Vals64.push_back(1/*NumItems = 1*/);
- Vals64.push_back(true/*IsSingleNumber = true*/);
- EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true);
- } else {
-
- Vals64.push_back(CaseRanges.getNumItems());
-
- if (CaseRanges.isSingleNumbersOnly()) {
- for (unsigned ri = 0, rn = CaseRanges.getNumItems();
- ri != rn; ++ri) {
-
- Vals64.push_back(true/*IsSingleNumber = true*/);
-
- EmitAPInt(Vals64, Code, Abbrev,
- CaseRanges.getSingleNumber(ri), true);
- }
- } else
- for (unsigned ri = 0, rn = CaseRanges.getNumItems();
- ri != rn; ++ri) {
- IntegersSubset::Range r = CaseRanges.getItem(ri);
- bool IsSingleNumber = CaseRanges.isSingleNumber(ri);
-
- Vals64.push_back(IsSingleNumber);
-
- EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true);
- if (!IsSingleNumber)
- EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true);
- }
- }
- Vals64.push_back(VE.getValueID(i.getCaseSuccessor()));
+ Vals.push_back(VE.getValueID(i.getCaseValue()));
+ Vals.push_back(VE.getValueID(i.getCaseSuccessor()));
}
-
- Stream.EmitRecord(Code, Vals64, AbbrevToUse);
-
- // Also do expected action - clear external Vals collection:
- Vals.clear();
- return;
}
break;
case Instruction::IndirectBr:
@@ -1930,6 +1865,8 @@ static void WriteModuleUseLists(const Module *M, ValueEnumerator &VE,
WriteUseList(FI, VE, Stream);
if (!FI->isDeclaration())
WriteFunctionUseList(FI, VE, Stream);
+ if (FI->hasPrefixData())
+ WriteUseList(FI->getPrefixData(), VE, Stream);
}
// Write the aliases.
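For reference, the simplified FUNC_CODE_INST_SWITCH record above goes back to one (case value, successor) pair per case, matching a plain switch such as this illustrative 3.4-era IR:

  switch i32 %x, label %otherwise [
    i32 0, label %case0
    i32 1, label %case1
  ]

Each case contributes the value ID of its ConstantInt plus the ID of its successor block; the removed IntegersSubset/case-range encoding is no longer emitted.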
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 8bac6da..a164104 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -60,6 +60,11 @@ ValueEnumerator::ValueEnumerator(const Module *M) {
I != E; ++I)
EnumerateValue(I->getAliasee());
+ // Enumerate the prefix data constants.
+ for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+ if (I->hasPrefixData())
+ EnumerateValue(I->getPrefixData());
+
// Insert constants and metadata that are named at module level into the slot
// pool so that the module symbol table can refer to them...
EnumerateValueSymbolTable(M->getValueSymbolTable());
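The prefix data handled above (enumerated here, written out by the bitcode writer and AsmPrinter) is the constant attached to a function definition, e.g. in 3.4-era IR (the constant is chosen arbitrarily for illustration):

  define void @f() prefix i32 123 {
    ret void
  }

Enumerating it up front ensures the constant already has a value ID when the MODULE_CODE_FUNCTION record and the use lists refer to it.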
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 4d9aebc..7fbf123 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory(Bitcode)
add_subdirectory(Transforms)
add_subdirectory(Linker)
add_subdirectory(Analysis)
+add_subdirectory(LTO)
add_subdirectory(MC)
add_subdirectory(Object)
add_subdirectory(Option)
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index e079707..2ee7767 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -782,7 +782,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
if (MI == CriticalPathMI) {
CriticalPathSU = CriticalPathStep(CriticalPathSU);
CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : 0;
- } else {
+ } else if (CriticalPathSet.any()) {
ExcludeRegs = &CriticalPathSet;
}
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index ca08b5b..1600c67 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -320,6 +320,7 @@ static const Value *getNoopInput(const Value *V,
static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
SmallVectorImpl<unsigned> &RetIndices,
SmallVectorImpl<unsigned> &CallIndices,
+ bool AllowDifferingSizes,
const TargetLoweringBase &TLI) {
// Trace the sub-value needed by the return value as far back up the graph as
@@ -350,7 +351,8 @@ static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
// all the bits that are needed by the "ret" have been provided by the "tail
// call". FIXME: with sufficiently cunning bit-tracking, we could look through
// extensions too.
- if (BitsProvided < BitsRequired)
+ if (BitsProvided < BitsRequired ||
+ (!AllowDifferingSizes && BitsProvided != BitsRequired))
return false;
return true;
@@ -382,9 +384,8 @@ static bool indexReallyValid(CompositeType *T, unsigned Idx) {
/// function again on a finished iterator will repeatedly return
/// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty
/// aggregate or a non-aggregate
-static bool
-advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
- SmallVectorImpl<unsigned> &Path) {
+static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
// First march back up the tree until we can successfully increment one of the
// coordinates in Path.
while (!Path.empty() && !indexReallyValid(SubTypes.back(), Path.back() + 1)) {
@@ -454,8 +455,8 @@ static bool firstRealType(Type *Next,
/// Set the iterator data-structures to the next non-empty, non-aggregate
/// subtype.
-bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
- SmallVectorImpl<unsigned> &Path) {
+static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
do {
if (!advanceToNextLeafType(SubTypes, Path))
return false;
@@ -509,6 +510,13 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
return false;
}
+ return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, TLI);
+}
+
+bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
+ const Instruction *I,
+ const ReturnInst *Ret,
+ const TargetLoweringBase &TLI) {
// If the block ends with a void return or unreachable, it doesn't matter
// what the call's return type is.
if (!Ret || Ret->getNumOperands() == 0) return true;
@@ -517,19 +525,38 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
// return type is.
if (isa<UndefValue>(Ret->getOperand(0))) return true;
- // Conservatively require the attributes of the call to match those of
- // the return. Ignore noalias because it doesn't affect the call sequence.
- const Function *F = ExitBB->getParent();
- AttributeSet CallerAttrs = F->getAttributes();
- if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
- removeAttribute(Attribute::NoAlias) !=
- AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
- removeAttribute(Attribute::NoAlias))
- return false;
+ // Make sure the attributes attached to each return are compatible.
+ AttrBuilder CallerAttrs(F->getAttributes(),
+ AttributeSet::ReturnIndex);
+ AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
+ AttributeSet::ReturnIndex);
+
+ // Noalias is completely benign as far as calling convention goes; it
+ // shouldn't affect whether the call is a tail call.
+ CallerAttrs = CallerAttrs.removeAttribute(Attribute::NoAlias);
+ CalleeAttrs = CalleeAttrs.removeAttribute(Attribute::NoAlias);
+
+ bool AllowDifferingSizes = true;
+ if (CallerAttrs.contains(Attribute::ZExt)) {
+ if (!CalleeAttrs.contains(Attribute::ZExt))
+ return false;
+
+ AllowDifferingSizes = false;
+ CallerAttrs.removeAttribute(Attribute::ZExt);
+ CalleeAttrs.removeAttribute(Attribute::ZExt);
+ } else if (CallerAttrs.contains(Attribute::SExt)) {
+ if (!CalleeAttrs.contains(Attribute::SExt))
+ return false;
+
+ AllowDifferingSizes = false;
+ CallerAttrs.removeAttribute(Attribute::SExt);
+ CalleeAttrs.removeAttribute(Attribute::SExt);
+ }
- // It's not safe to eliminate the sign / zero extension of the return value.
- if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
- CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ // If they're still different, there's some facet we don't understand
+ // (currently only "inreg", but in future who knows). It may be OK but the
+ // only safe option is to reject the tail call.
+ if (CallerAttrs != CalleeAttrs)
return false;
const Value *RetVal = Ret->getOperand(0), *CallVal = I;
@@ -571,7 +598,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
// Finally, we can check whether the value produced by the tail call at this
// index is compatible with the value we return.
- if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath, TLI))
+ if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath,
+ AllowDifferingSizes, TLI))
return false;
CallEmpty = !nextRealType(CallSubTypes, CallPath);
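A minimal sketch of what the attribute check added above permits (illustrative IR, not from the patch): a zeroext or signext return stays tail-call eligible only if the callee's return carries the same extension attribute and, with AllowDifferingSizes disabled, the bit widths match exactly:

  declare zeroext i8 @callee()

  define zeroext i8 @caller() {
  entry:
    %v = tail call zeroext i8 @callee()
    ret i8 %v
  }

Dropping zeroext from either side, or letting the provided and required bit counts differ, makes returnTypeIsEligibleForTailCall return false.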
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 188047d..5d82dd9 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -47,13 +47,18 @@ ARMException::ARMException(AsmPrinter *A)
ARMException::~ARMException() {}
+ARMTargetStreamer &ARMException::getTargetStreamer() {
+ MCTargetStreamer &TS = Asm->OutStreamer.getTargetStreamer();
+ return static_cast<ARMTargetStreamer &>(TS);
+}
+
void ARMException::EndModule() {
}
/// BeginFunction - Gather pre-function exception information. Assumes it's
/// being emitted immediately after the function entry point.
void ARMException::BeginFunction(const MachineFunction *MF) {
- Asm->OutStreamer.EmitFnStart();
+ getTargetStreamer().emitFnStart();
if (Asm->MF->getFunction()->needsUnwindTableEntry())
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
Asm->getFunctionNumber()));
@@ -62,8 +67,9 @@ void ARMException::BeginFunction(const MachineFunction *MF) {
/// EndFunction - Gather and emit post-function exception information.
///
void ARMException::EndFunction() {
+ ARMTargetStreamer &ATS = getTargetStreamer();
if (!Asm->MF->getFunction()->needsUnwindTableEntry())
- Asm->OutStreamer.EmitCantUnwind();
+ ATS.emitCantUnwind();
else {
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
Asm->getFunctionNumber()));
@@ -76,13 +82,13 @@ void ARMException::EndFunction() {
// Emit references to personality.
if (const Function * Personality =
MMI->getPersonalities()[MMI->getPersonalityIndex()]) {
- MCSymbol *PerSym = Asm->Mang->getSymbol(Personality);
+ MCSymbol *PerSym = Asm->getSymbol(Personality);
Asm->OutStreamer.EmitSymbolAttribute(PerSym, MCSA_Global);
- Asm->OutStreamer.EmitPersonality(PerSym);
+ ATS.emitPersonality(PerSym);
}
// Emit .handlerdata directive.
- Asm->OutStreamer.EmitHandlerData();
+ ATS.emitHandlerData();
// Emit actual exception table
EmitExceptionTable();
@@ -90,7 +96,7 @@ void ARMException::EndFunction() {
}
}
- Asm->OutStreamer.EmitFnEnd();
+ ATS.emitFnEnd();
}
void ARMException::EmitTypeInfos(unsigned TTypeEncoding) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 12c3574..308b0e0 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -48,6 +48,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
using namespace llvm;
static const char *const DWARFGroupName = "DWARF Emission";
@@ -94,7 +95,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
: MachineFunctionPass(ID),
- TM(tm), MAI(tm.getMCAsmInfo()),
+ TM(tm), MAI(tm.getMCAsmInfo()), MII(tm.getInstrInfo()),
OutContext(Streamer.getContext()),
OutStreamer(Streamer),
LastMI(0), LastFn(0), Counter(~0U), SetCounter(0) {
@@ -164,7 +165,7 @@ bool AsmPrinter::doInitialization(Module &M) {
OutStreamer.InitStreamer();
- Mang = new Mangler(OutContext, &TM);
+ Mang = new Mangler(&TM);
// Allow the target to emit any magic that it wants at the start of the file.
EmitStartOfAsmFile(M);
@@ -212,12 +213,12 @@ bool AsmPrinter::doInitialization(Module &M) {
llvm_unreachable("Unknown exception type.");
}
-void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
- switch ((GlobalValue::LinkageTypes)Linkage) {
+void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
+ GlobalValue::LinkageTypes Linkage = GV->getLinkage();
+ switch (Linkage) {
case GlobalValue::CommonLinkage:
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
- case GlobalValue::LinkOnceODRAutoHideLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::LinkerPrivateWeakLinkage:
@@ -225,8 +226,19 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
// .globl _foo
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
- if ((GlobalValue::LinkageTypes)Linkage !=
- GlobalValue::LinkOnceODRAutoHideLinkage)
+ bool CanBeHidden = false;
+
+ if (Linkage == GlobalValue::LinkOnceODRLinkage) {
+ if (GV->hasUnnamedAddr()) {
+ CanBeHidden = true;
+ } else {
+ GlobalStatus GS;
+ if (!GlobalStatus::analyzeGlobal(GV, GS) && !GS.IsCompared)
+ CanBeHidden = true;
+ }
+ }
+
+ if (!CanBeHidden)
// .weak_definition _foo
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
else
@@ -239,7 +251,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
// .weak _foo
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
}
- break;
+ return;
case GlobalValue::DLLExportLinkage:
case GlobalValue::AppendingLinkage:
// FIXME: appending linkage variables should go into a section of
@@ -248,16 +260,23 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
// If external or appending, declare as a global symbol.
// .globl _foo
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
- break;
+ return;
case GlobalValue::PrivateLinkage:
case GlobalValue::InternalLinkage:
case GlobalValue::LinkerPrivateLinkage:
- break;
- default:
- llvm_unreachable("Unknown linkage type!");
+ return;
+ case GlobalValue::AvailableExternallyLinkage:
+ llvm_unreachable("Should never emit this");
+ case GlobalValue::DLLImportLinkage:
+ case GlobalValue::ExternalWeakLinkage:
+ llvm_unreachable("Don't know how to emit these");
}
+ llvm_unreachable("Unknown linkage type!");
}
+MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
+ return getObjFileLowering().getSymbol(*Mang, GV);
+}
/// EmitGlobalVariable - Emit the specified global variable to the .s file.
void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
@@ -273,7 +292,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
}
}
- MCSymbol *GVSym = Mang->getSymbol(GV);
+ MCSymbol *GVSym = getSymbol(GV);
EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
if (!GV->hasInitializer()) // External globals require no extra code.
@@ -284,13 +303,16 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
- const DataLayout *TD = TM.getDataLayout();
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ const DataLayout *DL = TM.getDataLayout();
+ uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
// If the alignment is specified, we *must* obey it. Overaligning a global
// with a specified alignment is a prompt way to break globals emitted to
// sections and expected to be contiguous (e.g. ObjC metadata).
- unsigned AlignLog = getGVAlignmentLog2(GV, *TD);
+ unsigned AlignLog = getGVAlignmentLog2(GV, *DL);
+
+ if (DD)
+ DD->setSymbolSize(GVSym, Size);
// Handle common and BSS local symbols (.lcomm).
if (GVKind.isCommon() || GVKind.isBSSLocal()) {
@@ -388,14 +410,14 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer.SwitchSection(TLVSect);
// Emit the linkage here.
- EmitLinkage(GV->getLinkage(), GVSym);
+ EmitLinkage(GV, GVSym);
OutStreamer.EmitLabel(GVSym);
// Three pointers in size:
// - __tlv_bootstrap - used to make sure support exists
// - spare pointer, used when mapped by the runtime
// - pointer to mangled symbol above with initializer
- unsigned PtrSize = TD->getPointerSizeInBits()/8;
+ unsigned PtrSize = DL->getPointerTypeSize(GV->getType());
OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
PtrSize);
OutStreamer.EmitIntValue(0, PtrSize);
@@ -407,7 +429,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer.SwitchSection(TheSection);
- EmitLinkage(GV->getLinkage(), GVSym);
+ EmitLinkage(GV, GVSym);
EmitAlignment(AlignLog, GV);
OutStreamer.EmitLabel(GVSym);
@@ -433,7 +455,7 @@ void AsmPrinter::EmitFunctionHeader() {
OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
EmitVisibility(CurrentFnSym, F->getVisibility());
- EmitLinkage(F->getLinkage(), CurrentFnSym);
+ EmitLinkage(F, CurrentFnSym);
EmitAlignment(MF->getAlignment(), F);
if (MAI->hasDotTypeDotSizeDirective())
@@ -459,16 +481,6 @@ void AsmPrinter::EmitFunctionHeader() {
OutStreamer.EmitLabel(DeadBlockSyms[i]);
}
- // Add some workaround for linkonce linkage on Cygwin\MinGW.
- if (MAI->getLinkOnceDirective() != 0 &&
- (F->hasLinkOnceLinkage() || F->hasWeakLinkage())) {
- // FIXME: What is this?
- MCSymbol *FakeStub =
- OutContext.GetOrCreateSymbol(Twine("Lllvm$workaround$fake$stub$")+
- CurrentFnSym->getName());
- OutStreamer.EmitLabel(FakeStub);
- }
-
// Emit pre-function debug and/or EH information.
if (DE) {
NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled);
@@ -478,6 +490,10 @@ void AsmPrinter::EmitFunctionHeader() {
NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
DD->beginFunction(MF);
}
+
+ // Emit the prefix data.
+ if (F->hasPrefixData())
+ EmitGlobalConstant(F->getPrefixData());
}
/// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
@@ -530,11 +546,11 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
/// emitImplicitDef - This method emits the specified machine instruction
/// that is an implicit def.
-static void emitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
+void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
unsigned RegNo = MI->getOperand(0).getReg();
- AP.OutStreamer.AddComment(Twine("implicit-def: ") +
- AP.TM.getRegisterInfo()->getName(RegNo));
- AP.OutStreamer.AddBlankLine();
+ OutStreamer.AddComment(Twine("implicit-def: ") +
+ TM.getRegisterInfo()->getName(RegNo));
+ OutStreamer.AddBlankLine();
}
static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
@@ -646,7 +662,7 @@ bool AsmPrinter::needsRelocationsForDwarfStringPool() const {
}
void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
- MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+ const MCSymbol *Label = MI.getOperand(0).getMCSymbol();
if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
return;
@@ -657,12 +673,12 @@ void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
if (MMI->getCompactUnwindEncoding() != 0)
OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
- MachineModuleInfo &MMI = MF->getMMI();
- std::vector<MCCFIInstruction> Instructions = MMI.getFrameInstructions();
+ const MachineModuleInfo &MMI = MF->getMMI();
+ const std::vector<MCCFIInstruction> &Instrs = MMI.getFrameInstructions();
bool FoundOne = false;
(void)FoundOne;
- for (std::vector<MCCFIInstruction>::iterator I = Instructions.begin(),
- E = Instructions.end(); I != E; ++I) {
+ for (std::vector<MCCFIInstruction>::const_iterator I = Instrs.begin(),
+ E = Instrs.end(); I != E; ++I) {
if (I->getLabel() == Label) {
emitCFIInstruction(*I);
FoundOne = true;
@@ -724,7 +740,7 @@ void AsmPrinter::EmitFunctionBody() {
}
break;
case TargetOpcode::IMPLICIT_DEF:
- if (isVerbose()) emitImplicitDef(II, *this);
+ if (isVerbose()) emitImplicitDef(II);
break;
case TargetOpcode::KILL:
if (isVerbose()) emitKill(II, *this);
@@ -877,7 +893,7 @@ bool AsmPrinter::doFinalization(Module &M) {
if (V == GlobalValue::DefaultVisibility)
continue;
- MCSymbol *Name = Mang->getSymbol(&F);
+ MCSymbol *Name = getSymbol(&F);
EmitVisibility(Name, V, false);
}
@@ -887,6 +903,9 @@ bool AsmPrinter::doFinalization(Module &M) {
if (!ModuleFlags.empty())
getObjFileLowering().emitModuleFlags(OutStreamer, ModuleFlags, Mang, TM);
+ // Make sure we wrote out everything we need.
+ OutStreamer.Flush();
+
// Finalize debug and EH information.
if (DE) {
{
@@ -914,12 +933,12 @@ bool AsmPrinter::doFinalization(Module &M) {
for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
if (!I->hasExternalWeakLinkage()) continue;
- OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+ OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference);
}
for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
if (!I->hasExternalWeakLinkage()) continue;
- OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+ OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference);
}
}
@@ -927,14 +946,19 @@ bool AsmPrinter::doFinalization(Module &M) {
OutStreamer.AddBlankLine();
for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
I != E; ++I) {
- MCSymbol *Name = Mang->getSymbol(I);
+ MCSymbol *Name = getSymbol(I);
const GlobalValue *GV = I->getAliasedGlobal();
- MCSymbol *Target = Mang->getSymbol(GV);
+ if (GV->isDeclaration()) {
+ report_fatal_error(Name->getName() +
+ ": Target doesn't support aliases to declarations");
+ }
+
+ MCSymbol *Target = getSymbol(GV);
if (I->hasExternalLinkage() || !MAI->getWeakRefDirective())
OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
- else if (I->hasWeakLinkage())
+ else if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
else
assert(I->hasLocalLinkage() && "Invalid alias linkage");
@@ -953,6 +977,9 @@ bool AsmPrinter::doFinalization(Module &M) {
if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*--I))
MP->finishAssembly(*this);
+ // Emit llvm.ident metadata in an '.ident' directive.
+ EmitModuleIdents(M);
+
// If we don't have any trampolines, then we don't require stack memory
// to be executable. Some targets have a directive to declare this.
Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
@@ -976,7 +1003,7 @@ bool AsmPrinter::doFinalization(Module &M) {
void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
this->MF = &MF;
// Get the function symbol.
- CurrentFnSym = Mang->getSymbol(MF.getFunction());
+ CurrentFnSym = getSymbol(MF.getFunction());
CurrentFnSymForSize = CurrentFnSym;
if (isVerbose())
@@ -1283,16 +1310,10 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
const GlobalValue *GV =
dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
if (GV && getObjFileLowering().shouldEmitUsedDirectiveFor(GV, Mang))
- OutStreamer.EmitSymbolAttribute(Mang->getSymbol(GV), MCSA_NoDeadStrip);
+ OutStreamer.EmitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
}
}
-typedef std::pair<unsigned, Constant*> Structor;
-
-static bool priority_order(const Structor& lhs, const Structor& rhs) {
- return lhs.first < rhs.first;
-}
-
/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
/// priority.
void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
@@ -1309,6 +1330,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
!isa<PointerType>(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr).
// Gather the structors in a form that's convenient for sorting by priority.
+ typedef std::pair<unsigned, Constant *> Structor;
SmallVector<Structor, 8> Structors;
for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i));
@@ -1322,9 +1344,9 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
}
// Emit the function pointers in the target-specific order
- const DataLayout *TD = TM.getDataLayout();
- unsigned Align = Log2_32(TD->getPointerPrefAlignment());
- std::stable_sort(Structors.begin(), Structors.end(), priority_order);
+ const DataLayout *DL = TM.getDataLayout();
+ unsigned Align = Log2_32(DL->getPointerPrefAlignment());
+ std::stable_sort(Structors.begin(), Structors.end(), less_first());
for (unsigned i = 0, e = Structors.size(); i != e; ++i) {
const MCSection *OutputSection =
(isCtor ?
@@ -1337,6 +1359,21 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
}
}
+void AsmPrinter::EmitModuleIdents(Module &M) {
+ if (!MAI->hasIdentDirective())
+ return;
+
+ if (const NamedMDNode *NMD = M.getNamedMetadata("llvm.ident")) {
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *N = NMD->getOperand(i);
+ assert(N->getNumOperands() == 1 &&
+ "llvm.ident metadata entry can have only one operand");
+ const MDString *S = cast<MDString>(N->getOperand(0));
+ OutStreamer.EmitIdent(S->getString());
+ }
+ }
+}
+
//===--------------------------------------------------------------------===//
// Emission and print routines
//
@@ -1402,12 +1439,12 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
OutContext);
if (!MAI->hasSetDirective())
- OutStreamer.EmitValue(Diff, 4);
+ OutStreamer.EmitValue(Diff, Size);
else {
// Otherwise, emit with .set (aka assignment).
MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
OutStreamer.EmitAssignment(SetLabel, Diff);
- OutStreamer.EmitSymbolValue(SetLabel, 4);
+ OutStreamer.EmitSymbolValue(SetLabel, Size);
}
}
@@ -1415,9 +1452,9 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
/// where the size in bytes of the directive is specified by Size and Label
/// specifies the label. This implicitly uses .set if it is available.
void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
- unsigned Size)
+ unsigned Size, bool IsSectionRelative)
const {
- if (MAI->needsDwarfSectionOffsetDirective() && Size == 4) { // secrel32 ONLY works for 32bits.
+ if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) {
OutStreamer.EmitCOFFSecRel32(Label);
return;
}
@@ -1468,7 +1505,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
- return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+ return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
@@ -1498,10 +1535,10 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
report_fatal_error(OS.str());
}
case Instruction::GetElementPtr: {
- const DataLayout &TD = *AP.TM.getDataLayout();
+ const DataLayout &DL = *AP.TM.getDataLayout();
// Generate a symbolic expression for the byte address
- APInt OffsetAI(TD.getPointerSizeInBits(), 0);
- cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+ APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
+ cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
if (!OffsetAI)
@@ -1522,17 +1559,17 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
return lowerConstant(CE->getOperand(0), AP);
case Instruction::IntToPtr: {
- const DataLayout &TD = *AP.TM.getDataLayout();
+ const DataLayout &DL = *AP.TM.getDataLayout();
// Handle casts to pointers by changing them into casts to the appropriate
// integer type. This promotes constant folding and simplifies this code.
Constant *Op = CE->getOperand(0);
- Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+ Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
false/*ZExt*/);
return lowerConstant(Op, AP);
}
case Instruction::PtrToInt: {
- const DataLayout &TD = *AP.TM.getDataLayout();
+ const DataLayout &DL = *AP.TM.getDataLayout();
// Support only foldable casts to/from pointers that can be eliminated by
// changing the pointer to the appropriately sized integer type.
Constant *Op = CE->getOperand(0);
@@ -1542,13 +1579,13 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
// We can emit the pointer value into this slot if the slot is an
// integer slot equal to the size of the pointer.
- if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+ if (DL.getTypeAllocSize(Ty) == DL.getTypeAllocSize(Op->getType()))
return OpExpr;
// Otherwise the pointer is smaller than the resultant integer, mask off
// the high bits so we are sure to get a proper truncation if the input is
// a constant expr.
- unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+ unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
}
@@ -1699,9 +1736,9 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
}
}
- const DataLayout &TD = *AP.TM.getDataLayout();
- unsigned Size = TD.getTypeAllocSize(CDS->getType());
- unsigned EmittedSize = TD.getTypeAllocSize(CDS->getType()->getElementType()) *
+ const DataLayout &DL = *AP.TM.getDataLayout();
+ unsigned Size = DL.getTypeAllocSize(CDS->getType());
+ unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
CDS->getNumElements();
if (unsigned Padding = Size - EmittedSize)
AP.OutStreamer.EmitZeros(Padding);
@@ -1727,9 +1764,9 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
emitGlobalConstantImpl(CV->getOperand(i), AP);
- const DataLayout &TD = *AP.TM.getDataLayout();
- unsigned Size = TD.getTypeAllocSize(CV->getType());
- unsigned EmittedSize = TD.getTypeAllocSize(CV->getType()->getElementType()) *
+ const DataLayout &DL = *AP.TM.getDataLayout();
+ unsigned Size = DL.getTypeAllocSize(CV->getType());
+ unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
CV->getType()->getNumElements();
if (unsigned Padding = Size - EmittedSize)
AP.OutStreamer.EmitZeros(Padding);
@@ -1737,15 +1774,15 @@ static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
// Print the fields in successive locations. Pad to align if needed!
- const DataLayout *TD = AP.TM.getDataLayout();
- unsigned Size = TD->getTypeAllocSize(CS->getType());
- const StructLayout *Layout = TD->getStructLayout(CS->getType());
+ const DataLayout *DL = AP.TM.getDataLayout();
+ unsigned Size = DL->getTypeAllocSize(CS->getType());
+ const StructLayout *Layout = DL->getStructLayout(CS->getType());
uint64_t SizeSoFar = 0;
for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
const Constant *Field = CS->getOperand(i);
// Check if padding is needed and insert one or more 0s.
- uint64_t FieldSize = TD->getTypeAllocSize(Field->getType());
+ uint64_t FieldSize = DL->getTypeAllocSize(Field->getType());
uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
- Layout->getElementOffset(i)) - FieldSize;
SizeSoFar += FieldSize + PadSize;
@@ -1802,13 +1839,13 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
}
// Emit the tail padding for the long double.
- const DataLayout &TD = *AP.TM.getDataLayout();
- AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) -
- TD.getTypeStoreSize(CFP->getType()));
+ const DataLayout &DL = *AP.TM.getDataLayout();
+ AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
+ DL.getTypeStoreSize(CFP->getType()));
}
static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
- const DataLayout *TD = AP.TM.getDataLayout();
+ const DataLayout *DL = AP.TM.getDataLayout();
unsigned BitWidth = CI->getBitWidth();
// Copy the value as we may massage the layout for constants whose bit width
@@ -1825,7 +1862,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
// Big endian:
// * Record the extra bits to emit.
// * Realign the raw data to emit the chunks of 64-bits.
- if (TD->isBigEndian()) {
+ if (DL->isBigEndian()) {
// Basically the structure of the raw data is a chunk of 64-bits cells:
// 0 1 BitWidth / 64
// [chunk1][chunk2] ... [chunkN].
@@ -1846,7 +1883,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
// quantities at a time.
const uint64_t *RawData = Realigned.getRawData();
for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
- uint64_t Val = TD->isBigEndian() ? RawData[e - i - 1] : RawData[i];
+ uint64_t Val = DL->isBigEndian() ? RawData[e - i - 1] : RawData[i];
AP.OutStreamer.EmitIntValue(Val, 8);
}
@@ -1864,8 +1901,8 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
}
static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
- const DataLayout *TD = AP.TM.getDataLayout();
- uint64_t Size = TD->getTypeAllocSize(CV->getType());
+ const DataLayout *DL = AP.TM.getDataLayout();
+ uint64_t Size = DL->getTypeAllocSize(CV->getType());
if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
return AP.OutStreamer.EmitZeros(Size);
@@ -1913,7 +1950,7 @@ static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
// If the constant expression's size is greater than 64-bits, then we have
// to emit the value in chunks. Try to constant fold the value and emit it
// that way.
- Constant *New = ConstantFoldConstantExpression(CE, TD);
+ Constant *New = ConstantFoldConstantExpression(CE, DL);
if (New && New != CE)
return emitGlobalConstantImpl(New, AP);
}
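The EmitModuleIdents hook added above walks the llvm.ident named metadata and emits one .ident directive per entry; in 3.4-era textual IR that metadata has the following shape (the string is an illustrative example):

  !llvm.ident = !{!0}
  !0 = metadata !{metadata !"clang version 3.4"}

Each operand must be a node with a single MDString, which is what the assert in EmitModuleIdents enforces.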
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index c141d60..b92f49c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -185,5 +185,11 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
case MCCFIInstruction::OpOffset:
OutStreamer.EmitCFIOffset(Inst.getRegister(), Inst.getOffset());
break;
+ case MCCFIInstruction::OpRegister:
+ OutStreamer.EmitCFIRegister(Inst.getRegister(), Inst.getRegister2());
+ break;
+ case MCCFIInstruction::OpWindowSave:
+ OutStreamer.EmitCFIWindowSave();
+ break;
}
}
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index d8e9c95..4f927f6 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -123,7 +123,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
TM.getTargetCPU(),
TM.getTargetFeatureString()));
OwningPtr<MCTargetAsmParser>
- TAP(TM.getTarget().createMCAsmParser(*STI, *Parser));
+ TAP(TM.getTarget().createMCAsmParser(*STI, *Parser, *MII));
if (!TAP)
report_fatal_error("Inline asm not supported by this streamer because"
" we don't have an asm parser for this target\n");
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 65e7bee..be484a6 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMAsmPrinter
AsmPrinterDwarf.cpp
AsmPrinterInlineAsm.cpp
DIE.cpp
+ DIEHash.cpp
DwarfAccelTable.cpp
DwarfCFIException.cpp
DwarfCompileUnit.cpp
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index ab03861..6944428 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -34,8 +34,10 @@ using namespace llvm;
/// Profile - Used to gather unique data for the abbreviation folding set.
///
void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
- ID.AddInteger(Attribute);
- ID.AddInteger(Form);
+ // Explicitly cast to an integer type for which FoldingSetNodeID has
+ // overloads. Otherwise MSVC 2010 thinks this call is ambiguous.
+ ID.AddInteger(unsigned(Attribute));
+ ID.AddInteger(unsigned(Form));
}
//===----------------------------------------------------------------------===//
@@ -45,7 +47,7 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
/// Profile - Used to gather unique data for the abbreviation folding set.
///
void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
- ID.AddInteger(Tag);
+ ID.AddInteger(unsigned(Tag));
ID.AddInteger(ChildrenFlag);
// For each attribute description.
@@ -112,17 +114,25 @@ DIE::~DIE() {
/// Climb up the parent chain to get the compile unit DIE to which this DIE
/// belongs.
-DIE *DIE::getCompileUnit() {
- DIE *p = this;
+const DIE *DIE::getCompileUnit() const {
+ const DIE *Cu = getCompileUnitOrNull();
+ assert(Cu && "We should not have orphaned DIEs.");
+ return Cu;
+}
+
+/// Climb up the parent chain to get the compile unit DIE this DIE belongs
+/// to. Return NULL if DIE is not added to an owner yet.
+const DIE *DIE::getCompileUnitOrNull() const {
+ const DIE *p = this;
while (p) {
if (p->getTag() == dwarf::DW_TAG_compile_unit)
return p;
p = p->getParent();
}
- llvm_unreachable("We should not have orphaned DIEs.");
+ return NULL;
}
-DIEValue *DIE::findAttribute(unsigned Attribute) {
+DIEValue *DIE::findAttribute(uint16_t Attribute) {
const SmallVectorImpl<DIEValue *> &Values = getValues();
const DIEAbbrev &Abbrevs = getAbbrev();
@@ -199,14 +209,14 @@ void DIEValue::dump() const {
/// EmitValue - Emit integer of appropriate size.
///
-void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+void DIEInteger::EmitValue(AsmPrinter *Asm, dwarf::Form Form) const {
unsigned Size = ~0U;
switch (Form) {
case dwarf::DW_FORM_flag_present:
// Emit something to keep the lines and comments in sync.
// FIXME: Is there a better way to do this?
if (Asm->OutStreamer.hasRawTextSupport())
- Asm->OutStreamer.EmitRawText(StringRef(""));
+ Asm->OutStreamer.EmitRawText("");
return;
case dwarf::DW_FORM_flag: // Fall thru
case dwarf::DW_FORM_ref1: // Fall thru
@@ -231,7 +241,7 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
/// SizeOf - Determine size of integer value in bytes.
///
-unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEInteger::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_flag_present: return 0;
case dwarf::DW_FORM_flag: // Fall thru
@@ -266,13 +276,13 @@ void DIEInteger::print(raw_ostream &O) const {
/// EmitValue - Emit expression value.
///
-void DIEExpr::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEExpr::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
AP->OutStreamer.EmitValue(Expr, SizeOf(AP, Form));
}
/// SizeOf - Determine size of expression value in bytes.
///
-unsigned DIEExpr::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEExpr::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_sec_offset) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
@@ -292,13 +302,16 @@ void DIEExpr::print(raw_ostream &O) const {
/// EmitValue - Emit label value.
///
-void DIELabel::EmitValue(AsmPrinter *AP, unsigned Form) const {
- AP->EmitLabelReference(Label, SizeOf(AP, Form));
+void DIELabel::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
+ AP->EmitLabelReference(Label, SizeOf(AP, Form),
+ Form == dwarf::DW_FORM_strp ||
+ Form == dwarf::DW_FORM_sec_offset ||
+ Form == dwarf::DW_FORM_ref_addr);
}
/// SizeOf - Determine size of label value in bytes.
///
-unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIELabel::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_sec_offset) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
@@ -317,13 +330,13 @@ void DIELabel::print(raw_ostream &O) const {
/// EmitValue - Emit delta value.
///
-void DIEDelta::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEDelta::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
AP->EmitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
}
/// SizeOf - Determine size of delta value in bytes.
///
-unsigned DIEDelta::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEDelta::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
return AP->getDataLayout().getPointerSize();
@@ -341,13 +354,13 @@ void DIEDelta::print(raw_ostream &O) const {
/// EmitValue - Emit string value.
///
-void DIEString::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEString::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
Access->EmitValue(AP, Form);
}
/// SizeOf - Determine size of delta value in bytes.
///
-unsigned DIEString::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEString::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
return Access->SizeOf(AP, Form);
}
@@ -364,7 +377,7 @@ void DIEString::print(raw_ostream &O) const {
/// EmitValue - Emit debug information entry offset.
///
-void DIEEntry::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
AP->EmitInt32(Entry->getOffset());
}
@@ -402,7 +415,7 @@ unsigned DIEBlock::ComputeSize(AsmPrinter *AP) {
/// EmitValue - Emit block data.
///
-void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+void DIEBlock::EmitValue(AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
default: llvm_unreachable("Improper form for block");
case dwarf::DW_FORM_block1: Asm->EmitInt8(Size); break;
@@ -418,7 +431,7 @@ void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
/// SizeOf - Determine size of block data in bytes.
///
-unsigned DIEBlock::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEBlock::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index bfd7d1d..f4fa326 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -33,17 +33,17 @@ namespace llvm {
class DIEAbbrevData {
/// Attribute - Dwarf attribute code.
///
- uint16_t Attribute;
+ dwarf::Attribute Attribute;
/// Form - Dwarf form code.
///
- uint16_t Form;
+ dwarf::Form Form;
public:
- DIEAbbrevData(uint16_t A, uint16_t F) : Attribute(A), Form(F) {}
+ DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) : Attribute(A), Form(F) {}
// Accessors.
- uint16_t getAttribute() const { return Attribute; }
- uint16_t getForm() const { return Form; }
+ dwarf::Attribute getAttribute() const { return Attribute; }
+ dwarf::Form getForm() const { return Form; }
/// Profile - Used to gather unique data for the abbreviation folding set.
///
@@ -56,7 +56,7 @@ namespace llvm {
class DIEAbbrev : public FoldingSetNode {
/// Tag - Dwarf tag code.
///
- uint16_t Tag;
+ dwarf::Tag Tag;
/// ChildrenFlag - Dwarf children flag.
///
@@ -71,20 +71,19 @@ namespace llvm {
SmallVector<DIEAbbrevData, 12> Data;
public:
- DIEAbbrev(uint16_t T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
+ DIEAbbrev(dwarf::Tag T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
// Accessors.
- uint16_t getTag() const { return Tag; }
+ dwarf::Tag getTag() const { return Tag; }
unsigned getNumber() const { return Number; }
uint16_t getChildrenFlag() const { return ChildrenFlag; }
const SmallVectorImpl<DIEAbbrevData> &getData() const { return Data; }
- void setTag(uint16_t T) { Tag = T; }
void setChildrenFlag(uint16_t CF) { ChildrenFlag = CF; }
void setNumber(unsigned N) { Number = N; }
/// AddAttribute - Adds another set of attribute information to the
/// abbreviation.
- void AddAttribute(uint16_t Attribute, uint16_t Form) {
+ void AddAttribute(dwarf::Attribute Attribute, dwarf::Form Form) {
Data.push_back(DIEAbbrevData(Attribute, Form));
}
@@ -131,19 +130,17 @@ namespace llvm {
///
SmallVector<DIEValue*, 12> Values;
-#ifndef NDEBUG
- // Private data for print()
- mutable unsigned IndentCount;
-#endif
public:
explicit DIE(unsigned Tag)
- : Offset(0), Size(0), Abbrev(Tag, dwarf::DW_CHILDREN_no), Parent(0) {}
+ : Offset(0), Size(0), Abbrev((dwarf::Tag)Tag, dwarf::DW_CHILDREN_no),
+ Parent(0) {}
virtual ~DIE();
// Accessors.
DIEAbbrev &getAbbrev() { return Abbrev; }
+ const DIEAbbrev &getAbbrev() const { return Abbrev; }
unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
- unsigned getTag() const { return Abbrev.getTag(); }
+ dwarf::Tag getTag() const { return Abbrev.getTag(); }
unsigned getOffset() const { return Offset; }
unsigned getSize() const { return Size; }
const std::vector<DIE *> &getChildren() const { return Children; }
@@ -151,14 +148,17 @@ namespace llvm {
DIE *getParent() const { return Parent; }
/// Climb up the parent chain to get the compile unit DIE this DIE belongs
/// to.
- DIE *getCompileUnit();
- void setTag(unsigned Tag) { Abbrev.setTag(Tag); }
+ const DIE *getCompileUnit() const;
+ /// Similar to getCompileUnit, returns null when DIE is not added to an
+ /// owner yet.
+ const DIE *getCompileUnitOrNull() const;
void setOffset(unsigned O) { Offset = O; }
void setSize(unsigned S) { Size = S; }
/// addValue - Add a value and attributes to a DIE.
///
- void addValue(unsigned Attribute, unsigned Form, DIEValue *Value) {
+ void addValue(dwarf::Attribute Attribute, dwarf::Form Form,
+ DIEValue *Value) {
Abbrev.AddAttribute(Attribute, Form);
Values.push_back(Value);
}
@@ -166,10 +166,7 @@ namespace llvm {
/// addChild - Add a child to the DIE.
///
void addChild(DIE *Child) {
- if (Child->getParent()) {
- assert (Child->getParent() == this && "Unexpected DIE Parent!");
- return;
- }
+ assert(!Child->getParent());
Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
Children.push_back(Child);
Child->Parent = this;
@@ -177,7 +174,7 @@ namespace llvm {
/// findAttribute - Find a value in the DIE with the attribute given, returns NULL
/// if no such attribute exists.
- DIEValue *findAttribute(unsigned Attribute);
+ DIEValue *findAttribute(uint16_t Attribute);
#ifndef NDEBUG
void print(raw_ostream &O, unsigned IndentCount = 0) const;
@@ -213,11 +210,11 @@ namespace llvm {
/// EmitValue - Emit value via the Dwarf writer.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const = 0;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const = 0;
/// SizeOf - Return the size of a value in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const = 0;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const = 0;
#ifndef NDEBUG
virtual void print(raw_ostream &O) const = 0;
@@ -235,7 +232,7 @@ namespace llvm {
/// BestForm - Choose the best form for integer.
///
- static unsigned BestForm(bool IsSigned, uint64_t Int) {
+ static dwarf::Form BestForm(bool IsSigned, uint64_t Int) {
if (IsSigned) {
const int64_t SignedInt = Int;
if ((char)Int == SignedInt) return dwarf::DW_FORM_data1;
@@ -251,13 +248,13 @@ namespace llvm {
/// EmitValue - Emit integer of appropriate size.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
uint64_t getValue() const { return Integer; }
/// SizeOf - Determine size of integer value in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
// Implement isa/cast/dyncast.
static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
@@ -277,7 +274,7 @@ namespace llvm {
/// EmitValue - Emit expression value.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
/// getValue - Get MCExpr.
///
@@ -285,7 +282,7 @@ namespace llvm {
/// SizeOf - Determine size of expression value in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
// Implement isa/cast/dyncast.
static bool classof(const DIEValue *E) { return E->getType() == isExpr; }
@@ -305,7 +302,7 @@ namespace llvm {
/// EmitValue - Emit label value.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
/// getValue - Get MCSymbol.
///
@@ -313,7 +310,7 @@ namespace llvm {
/// SizeOf - Determine size of label value in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
// Implement isa/cast/dyncast.
static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
@@ -335,11 +332,11 @@ namespace llvm {
/// EmitValue - Emit delta value.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
/// SizeOf - Determine size of delta value in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
// Implement isa/cast/dyncast.
static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
@@ -365,11 +362,11 @@ namespace llvm {
/// EmitValue - Emit delta value.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
/// SizeOf - Determine size of delta value in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
// Implement isa/cast/dyncast.
static bool classof(const DIEValue *D) { return D->getType() == isString; }
@@ -394,13 +391,13 @@ namespace llvm {
/// EmitValue - Emit debug information entry offset.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
/// SizeOf - Determine size of debug information entry in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const {
- return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP) :
- sizeof(int32_t);
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
+ return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP)
+ : sizeof(int32_t);
}
/// Returns size of a ref_addr entry.
@@ -420,9 +417,7 @@ namespace llvm {
class DIEBlock : public DIEValue, public DIE {
unsigned Size; // Size in bytes excluding size header.
public:
- DIEBlock()
- : DIEValue(isBlock), DIE(0), Size(0) {}
- virtual ~DIEBlock() {}
+ DIEBlock() : DIEValue(isBlock), DIE(0), Size(0) {}
/// ComputeSize - calculate the size of the block.
///
@@ -430,7 +425,7 @@ namespace llvm {
/// BestForm - Choose the best form for data.
///
- unsigned BestForm() const {
+ dwarf::Form BestForm() const {
if ((unsigned char)Size == Size) return dwarf::DW_FORM_block1;
if ((unsigned short)Size == Size) return dwarf::DW_FORM_block2;
if ((unsigned int)Size == Size) return dwarf::DW_FORM_block4;
@@ -439,11 +434,11 @@ namespace llvm {
/// EmitValue - Emit block data.
///
- virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+ virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
/// SizeOf - Determine size of block data in bytes.
///
- virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+ virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
// Implement isa/cast/dyncast.
static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
new file mode 100644
index 0000000..95eca90
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -0,0 +1,507 @@
+//===-- llvm/CodeGen/DIEHash.cpp - Dwarf Hashing Framework ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+
+#include "DIEHash.h"
+
+#include "DIE.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// \brief Grabs the string in whichever attribute is passed in and returns
+/// a reference to it.
+static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
+ const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+ const DIEAbbrev &Abbrevs = Die.getAbbrev();
+
+ // Iterate through all the attributes until we find the one we're
+ // looking for; if we can't find it, return an empty string.
+ for (size_t i = 0; i < Values.size(); ++i) {
+ if (Abbrevs.getData()[i].getAttribute() == Attr) {
+ DIEValue *V = Values[i];
+ assert(isa<DIEString>(V) && "String requested. Not a string.");
+ DIEString *S = cast<DIEString>(V);
+ return S->getString();
+ }
+ }
+ return StringRef("");
+}
+
+/// \brief Adds the string in \p Str to the hash. This also hashes
+/// a trailing NULL with the string.
+void DIEHash::addString(StringRef Str) {
+ DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
+ Hash.update(Str);
+ Hash.update(makeArrayRef((uint8_t)'\0'));
+}
+
+// FIXME: The LEB128 routines are copied and only slightly modified out of
+// LEB128.h.
+
+/// \brief Adds the unsigned in \p Value to the hash encoded as a ULEB128.
+void DIEHash::addULEB128(uint64_t Value) {
+ DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ if (Value != 0)
+ Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+ Hash.update(Byte);
+ } while (Value != 0);
+}
+
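+/// \brief Adds the signed in \p Value to the hash encoded as an SLEB128.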
+void DIEHash::addSLEB128(int64_t Value) {
+ DEBUG(dbgs() << "Adding SLEB128 " << Value << " to hash.\n");
+ bool More;
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
+ ((Value == -1) && ((Byte & 0x40) != 0))));
+ if (More)
+ Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+ Hash.update(Byte);
+ } while (More);
+}
+
+/// \brief Including \p Parent adds the context of Parent to the hash.
+void DIEHash::addParentContext(const DIE &Parent) {
+
+ DEBUG(dbgs() << "Adding parent context to hash...\n");
+
+ // [7.27.2] For each surrounding type or namespace beginning with the
+ // outermost such construct...
+ SmallVector<const DIE *, 1> Parents;
+ const DIE *Cur = &Parent;
+ while (Cur->getTag() != dwarf::DW_TAG_compile_unit) {
+ Parents.push_back(Cur);
+ Cur = Cur->getParent();
+ }
+
+ // Reverse iterate over our list to go from the outermost construct to the
+ // innermost.
+ for (SmallVectorImpl<const DIE *>::reverse_iterator I = Parents.rbegin(),
+ E = Parents.rend();
+ I != E; ++I) {
+ const DIE &Die = **I;
+
+ // ... Append the letter "C" to the sequence...
+ addULEB128('C');
+
+ // ... Followed by the DWARF tag of the construct...
+ addULEB128(Die.getTag());
+
+ // ... Then the name, taken from the DW_AT_name attribute.
+ StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
+ DEBUG(dbgs() << "... adding context: " << Name << "\n");
+ if (!Name.empty())
+ addString(Name);
+ }
+}
+
+// Collect all of the attributes for a particular DIE in single structure.
+void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
+ const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+ const DIEAbbrev &Abbrevs = Die.getAbbrev();
+
+#define COLLECT_ATTR(NAME) \
+ case dwarf::NAME: \
+ Attrs.NAME.Val = Values[i]; \
+ Attrs.NAME.Desc = &Abbrevs.getData()[i]; \
+ break
+
+ for (size_t i = 0, e = Values.size(); i != e; ++i) {
+ DEBUG(dbgs() << "Attribute: "
+ << dwarf::AttributeString(Abbrevs.getData()[i].getAttribute())
+ << " added.\n");
+ switch (Abbrevs.getData()[i].getAttribute()) {
+ COLLECT_ATTR(DW_AT_name);
+ COLLECT_ATTR(DW_AT_accessibility);
+ COLLECT_ATTR(DW_AT_address_class);
+ COLLECT_ATTR(DW_AT_allocated);
+ COLLECT_ATTR(DW_AT_artificial);
+ COLLECT_ATTR(DW_AT_associated);
+ COLLECT_ATTR(DW_AT_binary_scale);
+ COLLECT_ATTR(DW_AT_bit_offset);
+ COLLECT_ATTR(DW_AT_bit_size);
+ COLLECT_ATTR(DW_AT_bit_stride);
+ COLLECT_ATTR(DW_AT_byte_size);
+ COLLECT_ATTR(DW_AT_byte_stride);
+ COLLECT_ATTR(DW_AT_const_expr);
+ COLLECT_ATTR(DW_AT_const_value);
+ COLLECT_ATTR(DW_AT_containing_type);
+ COLLECT_ATTR(DW_AT_count);
+ COLLECT_ATTR(DW_AT_data_bit_offset);
+ COLLECT_ATTR(DW_AT_data_location);
+ COLLECT_ATTR(DW_AT_data_member_location);
+ COLLECT_ATTR(DW_AT_decimal_scale);
+ COLLECT_ATTR(DW_AT_decimal_sign);
+ COLLECT_ATTR(DW_AT_default_value);
+ COLLECT_ATTR(DW_AT_digit_count);
+ COLLECT_ATTR(DW_AT_discr);
+ COLLECT_ATTR(DW_AT_discr_list);
+ COLLECT_ATTR(DW_AT_discr_value);
+ COLLECT_ATTR(DW_AT_encoding);
+ COLLECT_ATTR(DW_AT_enum_class);
+ COLLECT_ATTR(DW_AT_endianity);
+ COLLECT_ATTR(DW_AT_explicit);
+ COLLECT_ATTR(DW_AT_is_optional);
+ COLLECT_ATTR(DW_AT_location);
+ COLLECT_ATTR(DW_AT_lower_bound);
+ COLLECT_ATTR(DW_AT_mutable);
+ COLLECT_ATTR(DW_AT_ordering);
+ COLLECT_ATTR(DW_AT_picture_string);
+ COLLECT_ATTR(DW_AT_prototyped);
+ COLLECT_ATTR(DW_AT_small);
+ COLLECT_ATTR(DW_AT_segment);
+ COLLECT_ATTR(DW_AT_string_length);
+ COLLECT_ATTR(DW_AT_threads_scaled);
+ COLLECT_ATTR(DW_AT_upper_bound);
+ COLLECT_ATTR(DW_AT_use_location);
+ COLLECT_ATTR(DW_AT_use_UTF8);
+ COLLECT_ATTR(DW_AT_variable_parameter);
+ COLLECT_ATTR(DW_AT_virtuality);
+ COLLECT_ATTR(DW_AT_visibility);
+ COLLECT_ATTR(DW_AT_vtable_elem_location);
+ COLLECT_ATTR(DW_AT_type);
+ default:
+ break;
+ }
+ }
+}
+
+void DIEHash::hashShallowTypeReference(dwarf::Attribute Attribute,
+ const DIE &Entry, StringRef Name) {
+ // append the letter 'N'
+ addULEB128('N');
+
+ // the DWARF attribute code (DW_AT_type or DW_AT_friend),
+ addULEB128(Attribute);
+
+ // the context of the tag,
+ if (const DIE *Parent = Entry.getParent())
+ addParentContext(*Parent);
+
+ // the letter 'E',
+ addULEB128('E');
+
+ // and the name of the type.
+ addString(Name);
+
+ // Currently DW_TAG_friend is not used by Clang, but if that ever changes,
+ // here's the relevant spec text to implement:
+ //
+ // For DW_TAG_friend, if the referenced entry is the DW_TAG_subprogram,
+ // the context is omitted and the name to be used is the ABI-specific name
+ // of the subprogram (e.g., the mangled linker name).
+}
+
+void DIEHash::hashRepeatedTypeReference(dwarf::Attribute Attribute,
+ unsigned DieNumber) {
+ // a) If T is in the list of [previously hashed types], use the letter
+ // 'R' as the marker
+ addULEB128('R');
+
+ addULEB128(Attribute);
+
+ // and use the unsigned LEB128 encoding of [the index of T in the
+ // list] as the attribute value;
+ addULEB128(DieNumber);
+}
+
+void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+ const DIE &Entry) {
+ assert(Tag != dwarf::DW_TAG_friend && "No current LLVM clients emit friend "
+ "tags. Add support here when there's "
+ "a use case");
+ // Step 5
+ // If the tag in Step 3 is one of [the below tags]
+ if ((Tag == dwarf::DW_TAG_pointer_type ||
+ Tag == dwarf::DW_TAG_reference_type ||
+ Tag == dwarf::DW_TAG_rvalue_reference_type ||
+ Tag == dwarf::DW_TAG_ptr_to_member_type) &&
+ // and the referenced type (via the [below attributes])
+ // FIXME: This seems overly restrictive, and causes hash mismatches when
+ // there's a decl/def difference in the containing type of a
+ // ptr_to_member_type, but it's what DWARF says, for some reason.
+ Attribute == dwarf::DW_AT_type) {
+ // ... has a DW_AT_name attribute,
+ StringRef Name = getDIEStringAttr(Entry, dwarf::DW_AT_name);
+ if (!Name.empty()) {
+ hashShallowTypeReference(Attribute, Entry, Name);
+ return;
+ }
+ }
+
+ unsigned &DieNumber = Numbering[&Entry];
+ if (DieNumber) {
+ hashRepeatedTypeReference(Attribute, DieNumber);
+ return;
+ }
+
+ // otherwise, b) use the letter 'T' as the marker, ...
+ addULEB128('T');
+
+ addULEB128(Attribute);
+
+ // ... process the type T recursively by performing Steps 2 through 7, and
+ // use the result as the attribute value.
+ DieNumber = Numbering.size();
+ computeHash(Entry);
+}
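
In short, a type-valued attribute is hashed in one of three ways (schematic, hypothetical operands):

//   'N' DW_AT_type <parent context of Foo> 'E' "Foo"   ; named pointee, hashed by name only
//   'R' DW_AT_type ULEB128(index)                      ; already-visited type, back-reference
//   'T' DW_AT_type <recursive hash of the referenced DIE>
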
+
+// Hash an individual attribute \p Attr based on the type of attribute and
+// the form.
+void DIEHash::hashAttribute(AttrEntry Attr, dwarf::Tag Tag) {
+ const DIEValue *Value = Attr.Val;
+ const DIEAbbrevData *Desc = Attr.Desc;
+ dwarf::Attribute Attribute = Desc->getAttribute();
+
+ // 7.27 Step 3
+ // ... An attribute that refers to another type entry T is processed as
+ // follows:
+ if (const DIEEntry *EntryAttr = dyn_cast<DIEEntry>(Value)) {
+ hashDIEEntry(Attribute, Tag, *EntryAttr->getEntry());
+ return;
+ }
+
+ // Other attribute values use the letter 'A' as the marker, ...
+ addULEB128('A');
+
+ addULEB128(Attribute);
+
+ // ... and the value consists of the form code (encoded as an unsigned LEB128
+ // value) followed by the encoding of the value according to the form code. To
+ // ensure reproducibility of the signature, the set of forms used in the
+ // signature computation is limited to the following: DW_FORM_sdata,
+ // DW_FORM_flag, DW_FORM_string, and DW_FORM_block.
+ switch (Desc->getForm()) {
+ case dwarf::DW_FORM_string:
+ llvm_unreachable(
+ "Add support for DW_FORM_string if we ever start emitting them again");
+ case dwarf::DW_FORM_GNU_str_index:
+ case dwarf::DW_FORM_strp:
+ addULEB128(dwarf::DW_FORM_string);
+ addString(cast<DIEString>(Value)->getString());
+ break;
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_udata:
+ addULEB128(dwarf::DW_FORM_sdata);
+ addSLEB128((int64_t)cast<DIEInteger>(Value)->getValue());
+ break;
+ default:
+ llvm_unreachable("Add support for additional forms");
+ }
+}
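
Concretely, the form normalization above means an integer attribute stored as DW_FORM_data4 hashes exactly as if it were DW_FORM_sdata, and a DW_FORM_strp name hashes as DW_FORM_string; with hypothetical values:

//   DW_AT_byte_size, DW_FORM_data4, 8
//     -> 'A' DW_AT_byte_size DW_FORM_sdata SLEB128(8)
//   DW_AT_name, DW_FORM_strp, "S"
//     -> 'A' DW_AT_name DW_FORM_string "S" '\0'
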
+
+// Go through the attributes from \p Attrs in the order specified in 7.27.4
+// and hash them.
+void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) {
+#define ADD_ATTR(ATTR) \
+ { \
+ if (ATTR.Val != 0) \
+ hashAttribute(ATTR, Tag); \
+ }
+
+ ADD_ATTR(Attrs.DW_AT_name);
+ ADD_ATTR(Attrs.DW_AT_accessibility);
+ ADD_ATTR(Attrs.DW_AT_address_class);
+ ADD_ATTR(Attrs.DW_AT_allocated);
+ ADD_ATTR(Attrs.DW_AT_artificial);
+ ADD_ATTR(Attrs.DW_AT_associated);
+ ADD_ATTR(Attrs.DW_AT_binary_scale);
+ ADD_ATTR(Attrs.DW_AT_bit_offset);
+ ADD_ATTR(Attrs.DW_AT_bit_size);
+ ADD_ATTR(Attrs.DW_AT_bit_stride);
+ ADD_ATTR(Attrs.DW_AT_byte_size);
+ ADD_ATTR(Attrs.DW_AT_byte_stride);
+ ADD_ATTR(Attrs.DW_AT_const_expr);
+ ADD_ATTR(Attrs.DW_AT_const_value);
+ ADD_ATTR(Attrs.DW_AT_containing_type);
+ ADD_ATTR(Attrs.DW_AT_count);
+ ADD_ATTR(Attrs.DW_AT_data_bit_offset);
+ ADD_ATTR(Attrs.DW_AT_data_location);
+ ADD_ATTR(Attrs.DW_AT_data_member_location);
+ ADD_ATTR(Attrs.DW_AT_decimal_scale);
+ ADD_ATTR(Attrs.DW_AT_decimal_sign);
+ ADD_ATTR(Attrs.DW_AT_default_value);
+ ADD_ATTR(Attrs.DW_AT_digit_count);
+ ADD_ATTR(Attrs.DW_AT_discr);
+ ADD_ATTR(Attrs.DW_AT_discr_list);
+ ADD_ATTR(Attrs.DW_AT_discr_value);
+ ADD_ATTR(Attrs.DW_AT_encoding);
+ ADD_ATTR(Attrs.DW_AT_enum_class);
+ ADD_ATTR(Attrs.DW_AT_endianity);
+ ADD_ATTR(Attrs.DW_AT_explicit);
+ ADD_ATTR(Attrs.DW_AT_is_optional);
+ ADD_ATTR(Attrs.DW_AT_location);
+ ADD_ATTR(Attrs.DW_AT_lower_bound);
+ ADD_ATTR(Attrs.DW_AT_mutable);
+ ADD_ATTR(Attrs.DW_AT_ordering);
+ ADD_ATTR(Attrs.DW_AT_picture_string);
+ ADD_ATTR(Attrs.DW_AT_prototyped);
+ ADD_ATTR(Attrs.DW_AT_small);
+ ADD_ATTR(Attrs.DW_AT_segment);
+ ADD_ATTR(Attrs.DW_AT_string_length);
+ ADD_ATTR(Attrs.DW_AT_threads_scaled);
+ ADD_ATTR(Attrs.DW_AT_upper_bound);
+ ADD_ATTR(Attrs.DW_AT_use_location);
+ ADD_ATTR(Attrs.DW_AT_use_UTF8);
+ ADD_ATTR(Attrs.DW_AT_variable_parameter);
+ ADD_ATTR(Attrs.DW_AT_virtuality);
+ ADD_ATTR(Attrs.DW_AT_visibility);
+ ADD_ATTR(Attrs.DW_AT_vtable_elem_location);
+ ADD_ATTR(Attrs.DW_AT_type);
+
+ // FIXME: Add the extended attributes.
+}
+
+// Add all of the attributes for \p Die to the hash.
+void DIEHash::addAttributes(const DIE &Die) {
+ DIEAttrs Attrs = {};
+ collectAttributes(Die, Attrs);
+ hashAttributes(Attrs, Die.getTag());
+}
+
+void DIEHash::hashNestedType(const DIE &Die, StringRef Name) {
+ // 7.27 Step 7
+ // ... append the letter 'S',
+ addULEB128('S');
+
+ // the tag of C,
+ addULEB128(Die.getTag());
+
+ // and the name.
+ addString(Name);
+}
+
+// Compute the hash of a DIE. This is based on the type signature computation
+// given in section 7.27 of the DWARF4 standard. It is the md5 hash of a
+// flattened description of the DIE.
+void DIEHash::computeHash(const DIE &Die) {
+ // Append the letter 'D', followed by the DWARF tag of the DIE.
+ addULEB128('D');
+ addULEB128(Die.getTag());
+
+ // Add each of the attributes of the DIE.
+ addAttributes(Die);
+
+ // Then hash each of the children of the DIE.
+ for (std::vector<DIE *>::const_iterator I = Die.getChildren().begin(),
+ E = Die.getChildren().end();
+ I != E; ++I) {
+ // 7.27 Step 7
+ // If C is a nested type entry or a member function entry, ...
+ if (isType((*I)->getTag()) || (*I)->getTag() == dwarf::DW_TAG_subprogram) {
+ StringRef Name = getDIEStringAttr(**I, dwarf::DW_AT_name);
+ // ... and has a DW_AT_name attribute
+ if (!Name.empty()) {
+ hashNestedType(**I, Name);
+ continue;
+ }
+ }
+ computeHash(**I);
+ }
+
+ // Following the last child (or if there are no children), append a zero byte.
+ Hash.update(makeArrayRef((uint8_t)'\0'));
+}
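
Putting the pieces together, for a hypothetical struct S { int i; }; at file scope the flattened stream hashed by computeHash looks roughly like this (schematic only; attribute order follows the fixed list in hashAttributes, and the exact attributes depend on what the DIE carries):

//   'D' DW_TAG_structure_type
//     'A' DW_AT_name      DW_FORM_string "S"
//     'A' DW_AT_byte_size DW_FORM_sdata  4
//     'D' DW_TAG_member
//       'A' DW_AT_name                 DW_FORM_string "i"
//       'A' DW_AT_data_member_location DW_FORM_sdata  0
//       'T' DW_AT_type                 ; first visit, recurse into "int"
//       'D' DW_TAG_base_type
//         'A' DW_AT_name      DW_FORM_string "int"
//         'A' DW_AT_byte_size DW_FORM_sdata  4
//         'A' DW_AT_encoding  DW_FORM_sdata  DW_ATE_signed
//         0                   ; base type has no children
//       0                     ; member has no children
//     0                       ; end of the struct's children
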
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE
+/// with the exception that we are hashing only the context and the name of the
+/// type.
+uint64_t DIEHash::computeDIEODRSignature(const DIE &Die) {
+
+ // Add the contexts to the hash. We won't be computing the ODR hash for
+ // function local types so it's safe to use the generic context hashing
+ // algorithm here.
+ // FIXME: If we figure out how to account for linkage in some way we could
+ // actually do this with a slight modification to the parent hash algorithm.
+ if (const DIE *Parent = Die.getParent())
+ addParentContext(*Parent);
+
+ // Add the current DIE information.
+
+ // Add the DWARF tag of the DIE.
+ addULEB128(Die.getTag());
+
+ // Add the name of the type to the hash.
+ addString(getDIEStringAttr(Die, dwarf::DW_AT_name));
+
+ // Now get the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so swap bytes
+ // appropriately.
+ return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
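
The final step, reading the low 8 bytes of the 16-byte MD5 result as a little-endian 64-bit value, can also be written portably without the LLVM support types (standalone sketch under that assumption):

#include <cstddef>
#include <cstdint>

// Build the 64-bit signature from digest bytes 8..15, treating byte 8 as the
// least significant; this matches reading those bytes as a little-endian
// integer.
static uint64_t signatureFromMD5(const uint8_t Digest[16]) {
  uint64_t Sig = 0;
  for (size_t i = 0; i != 8; ++i)
    Sig |= uint64_t(Digest[8 + i]) << (8 * i);
  return Sig;
}
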
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of the full CU and all top level CU entities.
+// TODO: Initialize the type chain at 0 instead of 1 for CU signatures.
+uint64_t DIEHash::computeCUSignature(const DIE &Die) {
+ Numbering.clear();
+ Numbering[&Die] = 1;
+
+ // Hash the DIE.
+ computeHash(Die);
+
+ // Now return the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so swap bytes
+ // appropriately.
+ return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of additional forms not specifically called out in the
+/// standard.
+uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
+ Numbering.clear();
+ Numbering[&Die] = 1;
+
+ if (const DIE *Parent = Die.getParent())
+ addParentContext(*Parent);
+
+ // Hash the DIE.
+ computeHash(Die);
+
+ // Now return the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so swap bytes
+ // appropriately.
+ return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
new file mode 100644
index 0000000..f0c4ef9
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -0,0 +1,147 @@
+//===-- llvm/CodeGen/DIEHash.h - Dwarf Hashing Framework -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/MD5.h"
+
+namespace llvm {
+
+class CompileUnit;
+
+/// \brief An object that hashes DIEs and can add the resulting hash
+/// attributes onto a DIE.
+class DIEHash {
+ // The entry for a particular attribute.
+ struct AttrEntry {
+ const DIEValue *Val;
+ const DIEAbbrevData *Desc;
+ };
+
+ // Collection of all attributes used in hashing a particular DIE.
+ struct DIEAttrs {
+ AttrEntry DW_AT_name;
+ AttrEntry DW_AT_accessibility;
+ AttrEntry DW_AT_address_class;
+ AttrEntry DW_AT_allocated;
+ AttrEntry DW_AT_artificial;
+ AttrEntry DW_AT_associated;
+ AttrEntry DW_AT_binary_scale;
+ AttrEntry DW_AT_bit_offset;
+ AttrEntry DW_AT_bit_size;
+ AttrEntry DW_AT_bit_stride;
+ AttrEntry DW_AT_byte_size;
+ AttrEntry DW_AT_byte_stride;
+ AttrEntry DW_AT_const_expr;
+ AttrEntry DW_AT_const_value;
+ AttrEntry DW_AT_containing_type;
+ AttrEntry DW_AT_count;
+ AttrEntry DW_AT_data_bit_offset;
+ AttrEntry DW_AT_data_location;
+ AttrEntry DW_AT_data_member_location;
+ AttrEntry DW_AT_decimal_scale;
+ AttrEntry DW_AT_decimal_sign;
+ AttrEntry DW_AT_default_value;
+ AttrEntry DW_AT_digit_count;
+ AttrEntry DW_AT_discr;
+ AttrEntry DW_AT_discr_list;
+ AttrEntry DW_AT_discr_value;
+ AttrEntry DW_AT_encoding;
+ AttrEntry DW_AT_enum_class;
+ AttrEntry DW_AT_endianity;
+ AttrEntry DW_AT_explicit;
+ AttrEntry DW_AT_is_optional;
+ AttrEntry DW_AT_location;
+ AttrEntry DW_AT_lower_bound;
+ AttrEntry DW_AT_mutable;
+ AttrEntry DW_AT_ordering;
+ AttrEntry DW_AT_picture_string;
+ AttrEntry DW_AT_prototyped;
+ AttrEntry DW_AT_small;
+ AttrEntry DW_AT_segment;
+ AttrEntry DW_AT_string_length;
+ AttrEntry DW_AT_threads_scaled;
+ AttrEntry DW_AT_upper_bound;
+ AttrEntry DW_AT_use_location;
+ AttrEntry DW_AT_use_UTF8;
+ AttrEntry DW_AT_variable_parameter;
+ AttrEntry DW_AT_virtuality;
+ AttrEntry DW_AT_visibility;
+ AttrEntry DW_AT_vtable_elem_location;
+ AttrEntry DW_AT_type;
+
+ // Insert any additional ones here...
+ };
+
+public:
+ /// \brief Computes the ODR signature.
+ uint64_t computeDIEODRSignature(const DIE &Die);
+
+ /// \brief Computes the CU signature.
+ uint64_t computeCUSignature(const DIE &Die);
+
+ /// \brief Computes the type signature.
+ uint64_t computeTypeSignature(const DIE &Die);
+
+ // Helper routines to process parts of a DIE.
+private:
+ /// \brief Adds the parent context of \p Die to the hash.
+ void addParentContext(const DIE &Die);
+
+ /// \brief Adds the attributes of \p Die to the hash.
+ void addAttributes(const DIE &Die);
+
+ /// \brief Computes the full DWARF4 7.27 hash of the DIE.
+ void computeHash(const DIE &Die);
+
+ // Routines that add DIEValues to the hash.
+private:
+ /// \brief Encodes and adds \p Value to the hash as a ULEB128.
+ void addULEB128(uint64_t Value);
+
+ /// \brief Encodes and adds \p Value to the hash as a SLEB128.
+ void addSLEB128(int64_t Value);
+
+ /// \brief Adds \p Str to the hash and includes a NULL byte.
+ void addString(StringRef Str);
+
+ /// \brief Collects the attributes of DIE \p Die into the \p Attrs
+ /// structure.
+ void collectAttributes(const DIE &Die, DIEAttrs &Attrs);
+
+ /// \brief Hashes the attributes in \p Attrs in order.
+ void hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag);
+
+ /// \brief Hashes an individual attribute.
+ void hashAttribute(AttrEntry Attr, dwarf::Tag Tag);
+
+ /// \brief Hashes an attribute that refers to another DIE.
+ void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+ const DIE &Entry);
+
+ /// \brief Hashes a reference to a named type in a way that is
+ /// independent of whether that type is described by a declaration or a
+ /// definition.
+ void hashShallowTypeReference(dwarf::Attribute Attribute, const DIE &Entry,
+ StringRef Name);
+
+ /// \brief Hashes a reference to a previously referenced type DIE.
+ void hashRepeatedTypeReference(dwarf::Attribute Attribute, unsigned DieNumber);
+
+ void hashNestedType(const DIE &Die, StringRef Name);
+
+private:
+ MD5 Hash;
+ DenseMap<const DIE *, unsigned> Numbering;
+};
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index a82a149..689aeda 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -24,27 +24,14 @@
using namespace llvm;
-const char *DwarfAccelTable::Atom::AtomTypeString(enum AtomType AT) {
- switch (AT) {
- case eAtomTypeNULL: return "eAtomTypeNULL";
- case eAtomTypeDIEOffset: return "eAtomTypeDIEOffset";
- case eAtomTypeCUOffset: return "eAtomTypeCUOffset";
- case eAtomTypeTag: return "eAtomTypeTag";
- case eAtomTypeNameFlags: return "eAtomTypeNameFlags";
- case eAtomTypeTypeFlags: return "eAtomTypeTypeFlags";
- }
- llvm_unreachable("invalid AtomType!");
-}
-
// The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
-DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList) :
- Header(8 + (atomList.size() * 4)),
- HeaderData(atomList),
- Entries(Allocator) { }
+DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList)
+ : Header(8 + (atomList.size() * 4)), HeaderData(atomList),
+ Entries(Allocator) {}
-DwarfAccelTable::~DwarfAccelTable() { }
+DwarfAccelTable::~DwarfAccelTable() {}
-void DwarfAccelTable::AddName(StringRef Name, DIE* die, char Flags) {
+void DwarfAccelTable::AddName(StringRef Name, DIE *die, char Flags) {
assert(Data.empty() && "Already finalized!");
// If the string is in the list already then add this die to the list
// otherwise add a new one.
@@ -59,13 +46,16 @@ void DwarfAccelTable::ComputeBucketCount(void) {
uniques[i] = Data[i]->HashValue;
array_pod_sort(uniques.begin(), uniques.end());
std::vector<uint32_t>::iterator p =
- std::unique(uniques.begin(), uniques.end());
+ std::unique(uniques.begin(), uniques.end());
uint32_t num = std::distance(uniques.begin(), p);
// Then compute the bucket size, minimum of 1 bucket.
- if (num > 1024) Header.bucket_count = num/4;
- if (num > 16) Header.bucket_count = num/2;
- else Header.bucket_count = num > 0 ? num : 1;
+ if (num > 1024)
+ Header.bucket_count = num / 4;
+ if (num > 16)
+ Header.bucket_count = num / 2;
+ else
+ Header.bucket_count = num > 0 ? num : 1;
Header.hashes_count = num;
}
@@ -78,13 +68,13 @@ static bool compareDIEs(const DwarfAccelTable::HashDataContents *A,
void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, StringRef Prefix) {
// Create the individual hash data outputs.
- for (StringMap<DataArray>::iterator
- EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+ for (StringMap<DataArray>::iterator EI = Entries.begin(), EE = Entries.end();
+ EI != EE; ++EI) {
// Unique the entries.
std::stable_sort(EI->second.begin(), EI->second.end(), compareDIEs);
EI->second.erase(std::unique(EI->second.begin(), EI->second.end()),
- EI->second.end());
+ EI->second.end());
HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second);
Data.push_back(Entry);
@@ -126,7 +116,7 @@ void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
Asm->EmitInt32(HeaderData.Atoms.size());
for (size_t i = 0; i < HeaderData.Atoms.size(); i++) {
Atom A = HeaderData.Atoms[i];
- Asm->OutStreamer.AddComment(Atom::AtomTypeString(A.type));
+ Asm->OutStreamer.AddComment(dwarf::AtomTypeString(A.type));
Asm->EmitInt16(A.type);
Asm->OutStreamer.AddComment(dwarf::FormEncodingString(A.form));
Asm->EmitInt16(A.form);
@@ -152,7 +142,8 @@ void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end(); HI != HE; ++HI) {
+ HE = Buckets[i].end();
+ HI != HE; ++HI) {
Asm->OutStreamer.AddComment("Hash in Bucket " + Twine(i));
Asm->EmitInt32((*HI)->HashValue);
}
@@ -166,13 +157,13 @@ void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
void DwarfAccelTable::EmitOffsets(AsmPrinter *Asm, MCSymbol *SecBegin) {
for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end(); HI != HE; ++HI) {
+ HE = Buckets[i].end();
+ HI != HE; ++HI) {
Asm->OutStreamer.AddComment("Offset in Bucket " + Twine(i));
MCContext &Context = Asm->OutStreamer.getContext();
- const MCExpr *Sub =
- MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create((*HI)->Sym, Context),
- MCSymbolRefExpr::Create(SecBegin, Context),
- Context);
+ const MCExpr *Sub = MCBinaryExpr::CreateSub(
+ MCSymbolRefExpr::Create((*HI)->Sym, Context),
+ MCSymbolRefExpr::Create(SecBegin, Context), Context);
Asm->OutStreamer.EmitValue(Sub, sizeof(uint32_t));
}
}
@@ -185,7 +176,8 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
uint64_t PrevHash = UINT64_MAX;
for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end(); HI != HE; ++HI) {
+ HE = Buckets[i].end();
+ HI != HE; ++HI) {
// Remember to emit the label for our offset.
Asm->OutStreamer.EmitLabel((*HI)->Sym);
Asm->OutStreamer.AddComment((*HI)->Str);
@@ -193,8 +185,9 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
D->getStringPoolSym());
Asm->OutStreamer.AddComment("Num DIEs");
Asm->EmitInt32((*HI)->Data.size());
- for (ArrayRef<HashDataContents*>::const_iterator
- DI = (*HI)->Data.begin(), DE = (*HI)->Data.end();
+ for (ArrayRef<HashDataContents *>::const_iterator
+ DI = (*HI)->Data.begin(),
+ DE = (*HI)->Data.end();
DI != DE; ++DI) {
// Emit the DIE offset
Asm->EmitInt32((*DI)->Die->getOffset());
@@ -214,8 +207,7 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
}
// Emit the entire data structure to the output file.
-void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin,
- DwarfUnits *D) {
+void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfUnits *D) {
// Emit the header.
EmitHeader(Asm);
@@ -239,11 +231,12 @@ void DwarfAccelTable::print(raw_ostream &O) {
HeaderData.print(O);
O << "Entries: \n";
- for (StringMap<DataArray>::const_iterator
- EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+ for (StringMap<DataArray>::const_iterator EI = Entries.begin(),
+ EE = Entries.end();
+ EI != EE; ++EI) {
O << "Name: " << EI->getKeyData() << "\n";
for (DataArray::const_iterator DI = EI->second.begin(),
- DE = EI->second.end();
+ DE = EI->second.end();
DI != DE; ++DI)
(*DI)->print(O);
}
@@ -251,14 +244,14 @@ void DwarfAccelTable::print(raw_ostream &O) {
O << "Buckets and Hashes: \n";
for (size_t i = 0, e = Buckets.size(); i < e; ++i)
for (HashList::const_iterator HI = Buckets[i].begin(),
- HE = Buckets[i].end(); HI != HE; ++HI)
+ HE = Buckets[i].end();
+ HI != HE; ++HI)
(*HI)->print(O);
O << "Data: \n";
- for (std::vector<HashData*>::const_iterator
- DI = Data.begin(), DE = Data.end(); DI != DE; ++DI)
- (*DI)->print(O);
-
-
+ for (std::vector<HashData *>::const_iterator DI = Data.begin(),
+ DE = Data.end();
+ DI != DE; ++DI)
+ (*DI)->print(O);
}
#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 3ef1dc5..7627313 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -67,11 +67,7 @@ class DwarfUnits;
class DwarfAccelTable {
- enum HashFunctionType {
- eHashFunctionDJB = 0u
- };
-
- static uint32_t HashDJB (StringRef Str) {
+ static uint32_t HashDJB(StringRef Str) {
uint32_t h = 5381;
for (unsigned i = 0, e = Str.size(); i != e; ++i)
h = ((h << 5) + h) + Str[i];
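
The hash above is the classic Bernstein (DJB2) string hash, h = h * 33 + c seeded with 5381; a standalone equivalent for reference (illustrative only, not part of the patch):

#include <cstdint>
#include <string>

// Same recurrence as HashDJB: fold each byte in with h = h * 33 + c,
// where (h << 5) + h is just h * 33.
static uint32_t djbHash(const std::string &Str) {
  uint32_t H = 5381;
  for (unsigned char C : Str)
    H = H * 33 + C;
  return H;
}
// e.g. djbHash("") == 5381 and djbHash("a") == 5381 * 33 + 'a' == 177670.
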
@@ -80,25 +76,25 @@ class DwarfAccelTable {
// Helper function to compute the number of buckets needed based on
// the number of unique hashes.
- void ComputeBucketCount (void);
+ void ComputeBucketCount(void);
struct TableHeader {
- uint32_t magic; // 'HASH' magic value to allow endian detection
- uint16_t version; // Version number.
- uint16_t hash_function; // The hash function enumeration that was used.
- uint32_t bucket_count; // The number of buckets in this hash table.
- uint32_t hashes_count; // The total number of unique hash values
- // and hash data offsets in this table.
- uint32_t header_data_len; // The bytes to skip to get to the hash
- // indexes (buckets) for correct alignment.
+ uint32_t magic; // 'HASH' magic value to allow endian detection
+ uint16_t version; // Version number.
+ uint16_t hash_function; // The hash function enumeration that was used.
+ uint32_t bucket_count; // The number of buckets in this hash table.
+ uint32_t hashes_count; // The total number of unique hash values
+ // and hash data offsets in this table.
+ uint32_t header_data_len; // The bytes to skip to get to the hash
+ // indexes (buckets) for correct alignment.
// Also written to disk is the implementation specific header data.
static const uint32_t MagicHash = 0x48415348;
- TableHeader (uint32_t data_len) :
- magic (MagicHash), version (1), hash_function (eHashFunctionDJB),
- bucket_count (0), hashes_count (0), header_data_len (data_len)
- {}
+ TableHeader(uint32_t data_len)
+ : magic(MagicHash), version(1),
+ hash_function(dwarf::DW_hash_function_djb), bucket_count(0),
+ hashes_count(0), header_data_len(data_len) {}
#ifndef NDEBUG
void print(raw_ostream &O) {
@@ -124,62 +120,38 @@ public:
// uint32_t die_offset_base
// uint32_t atom_count
// atom_count Atoms
- enum AtomType {
- eAtomTypeNULL = 0u,
- eAtomTypeDIEOffset = 1u, // DIE offset, check form for encoding
- eAtomTypeCUOffset = 2u, // DIE offset of the compiler unit header that
- // contains the item in question
- eAtomTypeTag = 3u, // DW_TAG_xxx value, should be encoded as
- // DW_FORM_data1 (if no tags exceed 255) or
- // DW_FORM_data2.
- eAtomTypeNameFlags = 4u, // Flags from enum NameFlags
- eAtomTypeTypeFlags = 5u // Flags from enum TypeFlags
- };
-
- enum TypeFlags {
- eTypeFlagClassMask = 0x0000000fu,
-
- // Always set for C++, only set for ObjC if this is the
- // @implementation for a class.
- eTypeFlagClassIsImplementation = ( 1u << 1 )
- };
// Make these public so that they can be used as a general interface to
// the class.
struct Atom {
- AtomType type; // enum AtomType
+ uint16_t type; // enum AtomType
uint16_t form; // DWARF DW_FORM_ defines
- Atom(AtomType type, uint16_t form) : type(type), form(form) {}
- static const char * AtomTypeString(enum AtomType);
+ Atom(uint16_t type, uint16_t form) : type(type), form(form) {}
#ifndef NDEBUG
void print(raw_ostream &O) {
- O << "Type: " << AtomTypeString(type) << "\n"
+ O << "Type: " << dwarf::AtomTypeString(type) << "\n"
<< "Form: " << dwarf::FormEncodingString(form) << "\n";
}
- void dump() {
- print(dbgs());
- }
+ void dump() { print(dbgs()); }
#endif
};
- private:
+private:
struct TableHeaderData {
uint32_t die_offset_base;
SmallVector<Atom, 1> Atoms;
TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
- : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) { }
+ : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {}
#ifndef NDEBUG
- void print (raw_ostream &O) {
+ void print(raw_ostream &O) {
O << "die_offset_base: " << die_offset_base << "\n";
for (size_t i = 0; i < Atoms.size(); i++)
Atoms[i].print(O);
}
- void dump() {
- print(dbgs());
- }
+ void dump() { print(dbgs()); }
#endif
};
@@ -193,37 +165,38 @@ public:
// HashData[hash_data_count]
public:
struct HashDataContents {
- DIE *Die; // Offsets
+ DIE *Die; // Offsets
char Flags; // Specific flags to output
- HashDataContents(DIE *D, char Flags) :
- Die(D),
- Flags(Flags) { }
- #ifndef NDEBUG
+ HashDataContents(DIE *D, char Flags) : Die(D), Flags(Flags) {}
+#ifndef NDEBUG
void print(raw_ostream &O) const {
O << " Offset: " << Die->getOffset() << "\n";
O << " Tag: " << dwarf::TagString(Die->getTag()) << "\n";
O << " Flags: " << Flags << "\n";
}
- #endif
+#endif
};
+
private:
struct HashData {
StringRef Str;
uint32_t HashValue;
MCSymbol *Sym;
- ArrayRef<HashDataContents*> Data; // offsets
- HashData(StringRef S, ArrayRef<HashDataContents*> Data)
- : Str(S), Data(Data) {
+ ArrayRef<HashDataContents *> Data; // offsets
+ HashData(StringRef S, ArrayRef<HashDataContents *> Data)
+ : Str(S), Data(Data) {
HashValue = DwarfAccelTable::HashDJB(S);
}
- #ifndef NDEBUG
+#ifndef NDEBUG
void print(raw_ostream &O) {
O << "Name: " << Str << "\n";
O << " Hash Value: " << format("0x%x", HashValue) << "\n";
- O << " Symbol: " ;
- if (Sym) Sym->print(O);
- else O << "<none>";
+ O << " Symbol: ";
+ if (Sym)
+ Sym->print(O);
+ else
+ O << "<none>";
O << "\n";
for (size_t i = 0; i < Data.size(); i++) {
O << " Offset: " << Data[i]->Die->getOffset() << "\n";
@@ -231,14 +204,12 @@ private:
O << " Flags: " << Data[i]->Flags << "\n";
}
}
- void dump() {
- print(dbgs());
- }
- #endif
+ void dump() { print(dbgs()); }
+#endif
};
- DwarfAccelTable(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
- void operator=(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+ DwarfAccelTable(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
+ void operator=(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
// Internal Functions
void EmitHeader(AsmPrinter *);
@@ -253,24 +224,24 @@ private:
// Output Variables
TableHeader Header;
TableHeaderData HeaderData;
- std::vector<HashData*> Data;
+ std::vector<HashData *> Data;
// String Data
- typedef std::vector<HashDataContents*> DataArray;
- typedef StringMap<DataArray, BumpPtrAllocator&> StringEntries;
+ typedef std::vector<HashDataContents *> DataArray;
+ typedef StringMap<DataArray, BumpPtrAllocator &> StringEntries;
StringEntries Entries;
// Buckets/Hashes/Offsets
- typedef std::vector<HashData*> HashList;
+ typedef std::vector<HashData *> HashList;
typedef std::vector<HashList> BucketList;
BucketList Buckets;
HashList Hashes;
// Public Implementation
- public:
+public:
DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>);
~DwarfAccelTable();
- void AddName(StringRef, DIE*, char = 0);
+ void AddName(StringRef, DIE *, char = 0);
void FinalizeTable(AsmPrinter *, StringRef);
void Emit(AsmPrinter *, MCSymbol *, DwarfUnits *);
#ifndef NDEBUG
@@ -278,6 +249,5 @@ private:
void dump() { print(dbgs()); }
#endif
};
-
}
#endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index fec5ced..8918f3d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -68,7 +68,7 @@ void DwarfCFIException::EndModule() {
for (size_t i = 0, e = Personalities.size(); i != e; ++i) {
if (!Personalities[i])
continue;
- MCSymbol *Sym = Asm->Mang->getSymbol(Personalities[i]);
+ MCSymbol *Sym = Asm->getSymbol(Personalities[i]);
TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym);
AtLeastOne = true;
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index df8ca17..a6ff953 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -22,8 +22,8 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -33,12 +33,12 @@
using namespace llvm;
/// CompileUnit - Compile unit constructor.
-CompileUnit::CompileUnit(unsigned UID, unsigned L, DIE *D, const MDNode *N,
+CompileUnit::CompileUnit(unsigned UID, DIE *D, DICompileUnit Node,
AsmPrinter *A, DwarfDebug *DW, DwarfUnits *DWU)
- : UniqueID(UID), Language(L), CUDie(D), Asm(A), DD(DW), DU(DWU),
- IndexTyDie(0), DebugInfoOffset(0) {
+ : UniqueID(UID), Node(Node), CUDie(D), Asm(A), DD(DW), DU(DWU),
+ IndexTyDie(0), DebugInfoOffset(0) {
DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
- insertDIE(N, D);
+ insertDIE(Node, D);
}
/// ~CompileUnit - Destructor for compile unit.
@@ -57,7 +57,7 @@ DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
/// getDefaultLowerBound - Return the default lower bound for an array. If the
/// DWARF version doesn't handle the language, return -1.
int64_t CompileUnit::getDefaultLowerBound() const {
- switch (Language) {
+ switch (getLanguage()) {
default:
break;
@@ -98,32 +98,71 @@ int64_t CompileUnit::getDefaultLowerBound() const {
return -1;
}
+/// Check whether the DIE for this MDNode can be shared across CUs.
+static bool isShareableAcrossCUs(DIDescriptor D) {
+ // When the MDNode can be part of the type system, the DIE can be
+ // shared across CUs.
+ return D.isType() ||
+ (D.isSubprogram() && !DISubprogram(D).isDefinition());
+}
+
+/// getDIE - Returns the debug information entry for the specified
+/// debug descriptor. We delegate the request to DwarfDebug
+/// when the DIE for this MDNode can be shared across CUs. The mappings
+/// will be kept in DwarfDebug for shareable DIEs.
+DIE *CompileUnit::getDIE(DIDescriptor D) const {
+ if (isShareableAcrossCUs(D))
+ return DD->getDIE(D);
+ return MDNodeToDieMap.lookup(D);
+}
+
+/// insertDIE - Insert DIE into the map. We delegate the request to DwarfDebug
+/// when the DIE for this MDNode can be shared across CUs. The mappings
+/// will be kept in DwarfDebug for shareable DIEs.
+void CompileUnit::insertDIE(DIDescriptor Desc, DIE *D) {
+ if (isShareableAcrossCUs(Desc)) {
+ DD->insertDIE(Desc, D);
+ return;
+ }
+ MDNodeToDieMap.insert(std::make_pair(Desc, D));
+}
+
/// addFlag - Add a flag that is true.
-void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
- if (!DD->useDarwinGDBCompat())
- Die->addValue(Attribute, dwarf::DW_FORM_flag_present,
- DIEIntegerOne);
+void CompileUnit::addFlag(DIE *Die, dwarf::Attribute Attribute) {
+ if (DD->getDwarfVersion() >= 4)
+ Die->addValue(Attribute, dwarf::DW_FORM_flag_present, DIEIntegerOne);
else
- addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
+ Die->addValue(Attribute, dwarf::DW_FORM_flag, DIEIntegerOne);
}
/// addUInt - Add an unsigned integer attribute data and value.
///
-void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
- unsigned Form, uint64_t Integer) {
- if (!Form) Form = DIEInteger::BestForm(false, Integer);
- DIEValue *Value = Integer == 1 ?
- DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
- Die->addValue(Attribute, Form, Value);
+void CompileUnit::addUInt(DIE *Die, dwarf::Attribute Attribute,
+ Optional<dwarf::Form> Form, uint64_t Integer) {
+ if (!Form)
+ Form = DIEInteger::BestForm(false, Integer);
+ DIEValue *Value = Integer == 1 ? DIEIntegerOne : new (DIEValueAllocator)
+ DIEInteger(Integer);
+ Die->addValue(Attribute, *Form, Value);
+}
+
+void CompileUnit::addUInt(DIEBlock *Block, dwarf::Form Form, uint64_t Integer) {
+ addUInt(Block, (dwarf::Attribute)0, Form, Integer);
}
 /// addSInt - Add a signed integer attribute data and value.
///
-void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
- unsigned Form, int64_t Integer) {
- if (!Form) Form = DIEInteger::BestForm(true, Integer);
+void CompileUnit::addSInt(DIE *Die, dwarf::Attribute Attribute,
+ Optional<dwarf::Form> Form, int64_t Integer) {
+ if (!Form)
+ Form = DIEInteger::BestForm(true, Integer);
DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
- Die->addValue(Attribute, Form, Value);
+ Die->addValue(Attribute, *Form, Value);
+}
+
+void CompileUnit::addSInt(DIEBlock *Die, Optional<dwarf::Form> Form,
+ int64_t Integer) {
+ addSInt(Die, (dwarf::Attribute)0, Form, Integer);
}
/// addString - Add a string attribute data and value. We always emit a
@@ -131,9 +170,10 @@ void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
/// more predictable sizes. In the case of split dwarf we emit an index
/// into another table which gets us the static offset into the string
/// table.
-void CompileUnit::addString(DIE *Die, unsigned Attribute, StringRef String) {
+void CompileUnit::addString(DIE *Die, dwarf::Attribute Attribute,
+ StringRef String) {
DIEValue *Value;
- unsigned Form;
+ dwarf::Form Form;
if (!DD->useSplitDwarf()) {
MCSymbol *Symb = DU->getStringPoolEntry(String);
if (Asm->needsRelocationsForDwarfStringPool())
@@ -154,7 +194,7 @@ void CompileUnit::addString(DIE *Die, unsigned Attribute, StringRef String) {
/// addLocalString - Add a string attribute data and value. This is guaranteed
/// to be in the local string pool instead of indirected.
-void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
+void CompileUnit::addLocalString(DIE *Die, dwarf::Attribute Attribute,
StringRef String) {
MCSymbol *Symb = DU->getStringPoolEntry(String);
DIEValue *Value;
@@ -169,25 +209,32 @@ void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
/// addExpr - Add a Dwarf expression attribute data and value.
///
-void CompileUnit::addExpr(DIE *Die, unsigned Attribute, unsigned Form,
- const MCExpr *Expr) {
+void CompileUnit::addExpr(DIEBlock *Die, dwarf::Form Form, const MCExpr *Expr) {
DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
- Die->addValue(Attribute, Form, Value);
+ Die->addValue((dwarf::Attribute)0, Form, Value);
}
/// addLabel - Add a Dwarf label attribute data and value.
///
-void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
- const MCSymbol *Label) {
+void CompileUnit::addLabel(DIE *Die, dwarf::Attribute Attribute,
+ dwarf::Form Form, const MCSymbol *Label) {
DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
Die->addValue(Attribute, Form, Value);
}
+void CompileUnit::addLabel(DIEBlock *Die, dwarf::Form Form,
+ const MCSymbol *Label) {
+ addLabel(Die, (dwarf::Attribute)0, Form, Label);
+}
+
/// addLabelAddress - Add a dwarf label attribute data and value using
/// DW_FORM_addr or DW_FORM_GNU_addr_index.
///
-void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
+void CompileUnit::addLabelAddress(DIE *Die, dwarf::Attribute Attribute,
MCSymbol *Label) {
+ if (Label)
+ DD->addArangeLabel(SymbolCU(this, Label));
+
if (!DD->useSplitDwarf()) {
if (Label != NULL) {
DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
@@ -206,34 +253,60 @@ void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
/// addOpAddress - Add a dwarf op address data and value using the
/// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
///
-void CompileUnit::addOpAddress(DIE *Die, const MCSymbol *Sym) {
+void CompileUnit::addOpAddress(DIEBlock *Die, const MCSymbol *Sym) {
+ DD->addArangeLabel(SymbolCU(this, Sym));
if (!DD->useSplitDwarf()) {
- addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
- addLabel(Die, 0, dwarf::DW_FORM_udata, Sym);
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+ addLabel(Die, dwarf::DW_FORM_udata, Sym);
} else {
- addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
- addUInt(Die, 0, dwarf::DW_FORM_GNU_addr_index, DU->getAddrPoolIndex(Sym));
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
+ addUInt(Die, dwarf::DW_FORM_GNU_addr_index, DU->getAddrPoolIndex(Sym));
}
}
/// addDelta - Add a label delta attribute data and value.
///
-void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
- const MCSymbol *Hi, const MCSymbol *Lo) {
+void CompileUnit::addDelta(DIE *Die, dwarf::Attribute Attribute,
+ dwarf::Form Form, const MCSymbol *Hi,
+ const MCSymbol *Lo) {
DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
Die->addValue(Attribute, Form, Value);
}
/// addDIEEntry - Add a DIE attribute data and value.
///
-void CompileUnit::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
+void CompileUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
DIE *Entry) {
- Die->addValue(Attribute, Form, createDIEEntry(Entry));
+ addDIEEntry(Die, Attribute, createDIEEntry(Entry));
+}
+
+void CompileUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
+ DIEEntry *Entry) {
+ const DIE *DieCU = Die->getCompileUnitOrNull();
+ const DIE *EntryCU = Entry->getEntry()->getCompileUnitOrNull();
+ if (!DieCU)
+ // We assume that Die belongs to this CU if it is not linked to any CU yet.
+ DieCU = getCUDie();
+ if (!EntryCU)
+ EntryCU = getCUDie();
+ Die->addValue(Attribute, EntryCU == DieCU ? dwarf::DW_FORM_ref4
+ : dwarf::DW_FORM_ref_addr,
+ Entry);
+}
+
+/// Create a DIE with the given Tag, add the DIE to its parent, and
+/// call insertDIE if MD is not null.
+DIE *CompileUnit::createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N) {
+ DIE *Die = new DIE(Tag);
+ Parent.addChild(Die);
+ if (N)
+ insertDIE(N, Die);
+ return Die;
}
/// addBlock - Add block data.
///
-void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
+void CompileUnit::addBlock(DIE *Die, dwarf::Attribute Attribute,
DIEBlock *Block) {
Block->ComputeSize(Asm);
DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
@@ -250,12 +323,12 @@ void CompileUnit::addSourceLine(DIE *Die, DIVariable V) {
unsigned Line = V.getLineNumber();
if (Line == 0)
return;
- unsigned FileID = DD->getOrCreateSourceID(V.getContext().getFilename(),
- V.getContext().getDirectory(),
- getUniqueID());
+ unsigned FileID =
+ DD->getOrCreateSourceID(V.getContext().getFilename(),
+ V.getContext().getDirectory(), getUniqueID());
assert(FileID && "Invalid file id");
- addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
- addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
}
/// addSourceLine - Add location information to specified debug information
@@ -268,11 +341,11 @@ void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
unsigned Line = G.getLineNumber();
if (Line == 0)
return;
- unsigned FileID = DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(),
- getUniqueID());
+ unsigned FileID =
+ DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(), getUniqueID());
assert(FileID && "Invalid file id");
- addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
- addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
}
/// addSourceLine - Add location information to specified debug information
@@ -287,11 +360,11 @@ void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
if (Line == 0)
return;
- unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(),
- SP.getDirectory(), getUniqueID());
+ unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(), SP.getDirectory(),
+ getUniqueID());
assert(FileID && "Invalid file id");
- addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
- addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
}
/// addSourceLine - Add location information to specified debug information
@@ -304,11 +377,11 @@ void CompileUnit::addSourceLine(DIE *Die, DIType Ty) {
unsigned Line = Ty.getLineNumber();
if (Line == 0)
return;
- unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(),
- Ty.getDirectory(), getUniqueID());
+ unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(), Ty.getDirectory(),
+ getUniqueID());
assert(FileID && "Invalid file id");
- addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
- addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
}
/// addSourceLine - Add location information to specified debug information
@@ -325,8 +398,8 @@ void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
unsigned FileID = DD->getOrCreateSourceID(File.getFilename(),
File.getDirectory(), getUniqueID());
assert(FileID && "Invalid file id");
- addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
- addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
}
/// addSourceLine - Add location information to specified debug information
@@ -341,11 +414,11 @@ void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) {
return;
StringRef FN = NS.getFilename();
- unsigned FileID = DD->getOrCreateSourceID(FN, NS.getDirectory(),
- getUniqueID());
+ unsigned FileID =
+ DD->getOrCreateSourceID(FN, NS.getDirectory(), getUniqueID());
assert(FileID && "Invalid file id");
- addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
- addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
}
/// addVariableAddress - Add DW_AT_location attribute for a
@@ -362,38 +435,38 @@ void CompileUnit::addVariableAddress(const DbgVariable &DV, DIE *Die,
}
/// addRegisterOp - Add register operand.
-void CompileUnit::addRegisterOp(DIE *TheDie, unsigned Reg) {
+void CompileUnit::addRegisterOp(DIEBlock *TheDie, unsigned Reg) {
const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
unsigned DWReg = RI->getDwarfRegNum(Reg, false);
if (DWReg < 32)
- addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
+ addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
else {
- addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
- addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+ addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+ addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
}
}
/// addRegisterOffset - Add register offset.
-void CompileUnit::addRegisterOffset(DIE *TheDie, unsigned Reg,
+void CompileUnit::addRegisterOffset(DIEBlock *TheDie, unsigned Reg,
int64_t Offset) {
const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
unsigned DWReg = RI->getDwarfRegNum(Reg, false);
const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
if (Reg == TRI->getFrameRegister(*Asm->MF))
// If variable offset is based in frame register then use fbreg.
- addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
+ addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
else if (DWReg < 32)
- addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
+ addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
else {
- addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
- addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+ addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+ addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
}
- addSInt(TheDie, 0, dwarf::DW_FORM_sdata, Offset);
+ addSInt(TheDie, dwarf::DW_FORM_sdata, Offset);
}
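
For illustration, the blocks built by addRegisterOp and addRegisterOffset are ordinary DWARF location expressions; once emitted, typical cases look like this (hypothetical operand values):

//   frame-base offset -8:  DW_OP_fbreg SLEB128(-8)   -> bytes 0x91 0x78
//   DWARF register 3:      DW_OP_reg3                -> byte  0x53
//   DWARF register 40:     DW_OP_regx ULEB128(40)    -> bytes 0x90 0x28
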
/// addAddress - Add an address attribute to a die based on the location
/// provided.
-void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
+void CompileUnit::addAddress(DIE *Die, dwarf::Attribute Attribute,
const MachineLocation &Location, bool Indirect) {
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
@@ -402,12 +475,12 @@ void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
else {
addRegisterOffset(Block, Location.getReg(), Location.getOffset());
if (Indirect && !Location.isReg()) {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
}
}
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, 0, Block);
+ addBlock(Die, Attribute, Block);
}
/// addComplexAddress - Start with the address based on the location provided,
@@ -416,7 +489,7 @@ void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
/// the starting location. Add the DWARF information to the die.
///
void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
- unsigned Attribute,
+ dwarf::Attribute Attribute,
const MachineLocation &Location) {
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
unsigned N = DV.getNumAddrElements();
@@ -429,23 +502,23 @@ void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
i = 2;
} else
addRegisterOp(Block, Location.getReg());
- }
- else
+ } else
addRegisterOffset(Block, Location.getReg(), Location.getOffset());
- for (;i < N; ++i) {
+ for (; i < N; ++i) {
uint64_t Element = DV.getAddrElement(i);
if (Element == DIBuilder::OpPlus) {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
- addUInt(Block, 0, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(Block, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
} else if (Element == DIBuilder::OpDeref) {
if (!Location.isReg())
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
- } else llvm_unreachable("unknown DIBuilder Opcode");
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ } else
+ llvm_unreachable("unknown DIBuilder Opcode");
}
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, 0, Block);
+ addBlock(Die, Attribute, Block);
}
/* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -509,44 +582,41 @@ void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
/// more information, read large comment just above here.
///
void CompileUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
- unsigned Attribute,
+ dwarf::Attribute Attribute,
const MachineLocation &Location) {
DIType Ty = DV.getType();
DIType TmpTy = Ty;
- unsigned Tag = Ty.getTag();
+ uint16_t Tag = Ty.getTag();
bool isPointer = false;
StringRef varName = DV.getName();
if (Tag == dwarf::DW_TAG_pointer_type) {
- DIDerivedType DTy = DIDerivedType(Ty);
- TmpTy = DTy.getTypeDerivedFrom();
+ DIDerivedType DTy(Ty);
+ TmpTy = resolve(DTy.getTypeDerivedFrom());
isPointer = true;
}
- DICompositeType blockStruct = DICompositeType(TmpTy);
+ DICompositeType blockStruct(TmpTy);
// Find the __forwarding field and the variable field in the __Block_byref
// struct.
DIArray Fields = blockStruct.getTypeArray();
- DIDescriptor varField = DIDescriptor();
- DIDescriptor forwardingField = DIDescriptor();
+ DIDerivedType varField;
+ DIDerivedType forwardingField;
for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) {
- DIDescriptor Element = Fields.getElement(i);
- DIDerivedType DT = DIDerivedType(Element);
+ DIDerivedType DT(Fields.getElement(i));
StringRef fieldName = DT.getName();
if (fieldName == "__forwarding")
- forwardingField = Element;
+ forwardingField = DT;
else if (fieldName == varName)
- varField = Element;
+ varField = DT;
}
// Get the offsets for the forwarding field and the variable field.
- unsigned forwardingFieldOffset =
- DIDerivedType(forwardingField).getOffsetInBits() >> 3;
- unsigned varFieldOffset =
- DIDerivedType(varField).getOffsetInBits() >> 3;
+ unsigned forwardingFieldOffset = forwardingField.getOffsetInBits() >> 3;
+ unsigned varFieldOffset = varField.getOffsetInBits() >> 3;
// Decode the original location, and use that as the start of the byref
// variable's location.
@@ -560,45 +630,91 @@ void CompileUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
// If we started with a pointer to the __Block_byref... struct, then
// the first thing we need to do is dereference the pointer (DW_OP_deref).
if (isPointer)
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
// Next add the offset for the '__forwarding' field:
// DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in
// adding the offset if it's 0.
if (forwardingFieldOffset > 0) {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
- addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset);
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(Block, dwarf::DW_FORM_udata, forwardingFieldOffset);
}
// Now dereference the __forwarding field to get to the real __Block_byref
// struct: DW_OP_deref.
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
// Now that we've got the real __Block_byref... struct, add the offset
// for the variable's field to get to the location of the actual variable:
// DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0.
if (varFieldOffset > 0) {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
- addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset);
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(Block, dwarf::DW_FORM_udata, varFieldOffset);
}
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, 0, Block);
+ addBlock(Die, Attribute, Block);
}
/// isTypeSigned - Return true if the type is signed.
-static bool isTypeSigned(DIType Ty, int *SizeInBits) {
+static bool isTypeSigned(DwarfDebug *DD, DIType Ty, int *SizeInBits) {
if (Ty.isDerivedType())
- return isTypeSigned(DIDerivedType(Ty).getTypeDerivedFrom(), SizeInBits);
+ return isTypeSigned(DD, DD->resolve(DIDerivedType(Ty).getTypeDerivedFrom()),
+ SizeInBits);
if (Ty.isBasicType())
- if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed
- || DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
+ if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed ||
+ DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
*SizeInBits = Ty.getSizeInBits();
return true;
}
return false;
}
+/// Return true if type encoding is unsigned.
+static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) {
+ DIDerivedType DTy(Ty);
+ if (DTy.isDerivedType())
+ return isUnsignedDIType(DD, DD->resolve(DTy.getTypeDerivedFrom()));
+
+ DIBasicType BTy(Ty);
+ if (BTy.isBasicType()) {
+ unsigned Encoding = BTy.getEncoding();
+ if (Encoding == dwarf::DW_ATE_unsigned ||
+ Encoding == dwarf::DW_ATE_unsigned_char ||
+ Encoding == dwarf::DW_ATE_boolean)
+ return true;
+ }
+ return false;
+}
+
+/// If this type is derived from a base type then return base type size.
+static uint64_t getBaseTypeSize(DwarfDebug *DD, DIDerivedType Ty) {
+ unsigned Tag = Ty.getTag();
+
+ if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+ Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type)
+ return Ty.getSizeInBits();
+
+ DIType BaseType = DD->resolve(Ty.getTypeDerivedFrom());
+
+ // If this type is not derived from any type, take a conservative approach.
+ if (!BaseType.isValid())
+ return Ty.getSizeInBits();
+
+ // If this is a derived type, go ahead and get the base type, unless it's a
+ // reference, in which case it's just the size of the field. Pointer types
+ // need no such handling since they're a different kind of type qualification.
+ if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
+ BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
+ return Ty.getSizeInBits();
+
+ if (BaseType.isDerivedType())
+ return getBaseTypeSize(DD, DIDerivedType(BaseType));
+
+ return BaseType.getSizeInBits();
+}
+
/// addConstantValue - Add constant value entry in variable DIE.
void CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
DIType Ty) {
@@ -606,32 +722,47 @@ void CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
// their maximum bit width which is a bit unfortunate (& doesn't prefer
// udata/sdata over dataN as suggested by the DWARF spec)
assert(MO.isImm() && "Invalid machine operand!");
- DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
int SizeInBits = -1;
- bool SignedConstant = isTypeSigned(Ty, &SizeInBits);
- unsigned Form = SignedConstant ? dwarf::DW_FORM_sdata : dwarf::DW_FORM_udata;
- switch (SizeInBits) {
- case 8: Form = dwarf::DW_FORM_data1; break;
- case 16: Form = dwarf::DW_FORM_data2; break;
- case 32: Form = dwarf::DW_FORM_data4; break;
- case 64: Form = dwarf::DW_FORM_data8; break;
- default: break;
+ bool SignedConstant = isTypeSigned(DD, Ty, &SizeInBits);
+ dwarf::Form Form;
+
+ // If we're a signed constant definitely use sdata.
+ if (SignedConstant) {
+ addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, MO.getImm());
+ return;
}
- SignedConstant ? addSInt(Block, 0, Form, MO.getImm())
- : addUInt(Block, 0, Form, MO.getImm());
- addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+ // Else use data for now unless it's larger than we can deal with.
+ switch (SizeInBits) {
+ case 8:
+ Form = dwarf::DW_FORM_data1;
+ break;
+ case 16:
+ Form = dwarf::DW_FORM_data2;
+ break;
+ case 32:
+ Form = dwarf::DW_FORM_data4;
+ break;
+ case 64:
+ Form = dwarf::DW_FORM_data8;
+ break;
+ default:
+ Form = dwarf::DW_FORM_udata;
+ addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
+ return;
+ }
+ addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
}
/// addConstantFPValue - Add constant value entry in variable DIE.
void CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
- assert (MO.isFPImm() && "Invalid machine operand!");
+ assert(MO.isFPImm() && "Invalid machine operand!");
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
APFloat FPImm = MO.getFPImm()->getValueAPF();
// Get the raw data form of the floating point.
const APInt FltVal = FPImm.bitcastToAPInt();
- const char *FltPtr = (const char*)FltVal.getRawData();
+ const char *FltPtr = (const char *)FltVal.getRawData();
int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
bool LittleEndian = Asm->getDataLayout().isLittleEndian();
@@ -641,15 +772,15 @@ void CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
// Output the constant to DWARF one byte at a time.
for (; Start != Stop; Start += Incr)
- addUInt(Block, 0, dwarf::DW_FORM_data1,
- (unsigned char)0xFF & FltPtr[Start]);
+ addUInt(Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);
- addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+ addBlock(Die, dwarf::DW_AT_const_value, Block);
}
/// addConstantFPValue - Add constant value entry in variable DIE.
void CompileUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) {
- addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), false);
+ // Pass this down to addConstantValue as an unsigned bag of bits.
+ addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), true);
}
/// addConstantValue - Add constant value entry in variable DIE.
@@ -662,19 +793,34 @@ void CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
void CompileUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
unsigned CIBitWidth = Val.getBitWidth();
if (CIBitWidth <= 64) {
- unsigned form = 0;
+ // If we're a signed constant, definitely use sdata.
+ if (!Unsigned) {
+ addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
+ Val.getSExtValue());
+ return;
+ }
+
+ // Else use data for now unless it's larger than we can deal with.
+ dwarf::Form Form;
switch (CIBitWidth) {
- case 8: form = dwarf::DW_FORM_data1; break;
- case 16: form = dwarf::DW_FORM_data2; break;
- case 32: form = dwarf::DW_FORM_data4; break;
- case 64: form = dwarf::DW_FORM_data8; break;
+ case 8:
+ Form = dwarf::DW_FORM_data1;
+ break;
+ case 16:
+ Form = dwarf::DW_FORM_data2;
+ break;
+ case 32:
+ Form = dwarf::DW_FORM_data4;
+ break;
+ case 64:
+ Form = dwarf::DW_FORM_data8;
+ break;
default:
- form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
+ addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
+ Val.getZExtValue());
+ return;
}
- if (Unsigned)
- addUInt(Die, dwarf::DW_AT_const_value, form, Val.getZExtValue());
- else
- addSInt(Die, dwarf::DW_AT_const_value, form, Val.getSExtValue());
+ addUInt(Die, dwarf::DW_AT_const_value, Form, Val.getZExtValue());
return;
}
@@ -693,10 +839,10 @@ void CompileUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
c = Ptr64[i / 8] >> (8 * (i & 7));
else
c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
- addUInt(Block, 0, dwarf::DW_FORM_data1, c);
+ addUInt(Block, dwarf::DW_FORM_data1, c);
}
- addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+ addBlock(Die, dwarf::DW_AT_const_value, Block);
}
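Constants wider than 64 bits take the block path above: each byte is appended as DW_FORM_data1 in target byte order. A self-contained sketch of that byte-splitting loop, assuming the value arrives as 64-bit words with the least significant word first (the layout APInt::getRawData returns); constantBlockBytes is illustrative only:

#include <cstdint>
#include <vector>

std::vector<uint8_t> constantBlockBytes(const uint64_t *Words, unsigned BitWidth,
                                        bool TargetLittleEndian) {
  unsigned NumBytes = BitWidth / 8;
  std::vector<uint8_t> Out;
  for (unsigned i = 0; i != NumBytes; ++i) {
    uint8_t c;
    if (TargetLittleEndian)
      c = uint8_t(Words[i / 8] >> (8 * (i & 7)));
    else
      c = uint8_t(Words[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7)));
    Out.push_back(c);   // one DW_FORM_data1 value per byte
  }
  return Out;
}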
/// addTemplateParams - Add template parameters into buffer.
@@ -705,47 +851,48 @@ void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) {
for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) {
DIDescriptor Element = TParams.getElement(i);
if (Element.isTemplateTypeParameter())
- Buffer.addChild(getOrCreateTemplateTypeParameterDIE(
- DITemplateTypeParameter(Element)));
+ constructTemplateTypeParameterDIE(Buffer,
+ DITemplateTypeParameter(Element));
else if (Element.isTemplateValueParameter())
- Buffer.addChild(getOrCreateTemplateValueParameterDIE(
- DITemplateValueParameter(Element)));
+ constructTemplateValueParameterDIE(Buffer,
+ DITemplateValueParameter(Element));
}
}
/// getOrCreateContextDIE - Get context owner's DIE.
-DIE *CompileUnit::getOrCreateContextDIE(DIDescriptor Context) {
+DIE *CompileUnit::getOrCreateContextDIE(DIScope Context) {
+ if (!Context || Context.isFile())
+ return getCUDie();
if (Context.isType())
return getOrCreateTypeDIE(DIType(Context));
- else if (Context.isNameSpace())
+ if (Context.isNameSpace())
return getOrCreateNameSpace(DINameSpace(Context));
- else if (Context.isSubprogram())
+ if (Context.isSubprogram())
return getOrCreateSubprogramDIE(DISubprogram(Context));
- else
- return getDIE(Context);
-}
-
-/// addToContextOwner - Add Die into the list of its context owner's children.
-void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
- if (DIE *ContextDIE = getOrCreateContextDIE(Context))
- ContextDIE->addChild(Die);
- else
- addDie(Die);
+ return getDIE(Context);
}
/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
/// given DIType.
DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
- DIType Ty(TyNode);
- if (!Ty.isType())
+ if (!TyNode)
return NULL;
+
+ DIType Ty(TyNode);
+ assert(Ty.isType());
+
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(resolve(Ty.getContext()));
+ assert(ContextDIE);
+
DIE *TyDIE = getDIE(Ty);
if (TyDIE)
return TyDIE;
// Create new type.
- TyDIE = new DIE(dwarf::DW_TAG_base_type);
- insertDIE(Ty, TyDIE);
+ TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
+
if (Ty.isBasicType())
constructTypeDIE(*TyDIE, DIBasicType(Ty));
else if (Ty.isCompositeType())
@@ -762,28 +909,24 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
DICompositeType CT(Ty);
// A runtime language of 0 actually means C/C++ and that any
// non-negative value is some version of Objective-C/C++.
- IsImplementation = (CT.getRunTimeLang() == 0) ||
- CT.isObjcClassComplete();
+ IsImplementation = (CT.getRunTimeLang() == 0) || CT.isObjcClassComplete();
}
- unsigned Flags = IsImplementation ?
- DwarfAccelTable::eTypeFlagClassIsImplementation : 0;
+ unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
addAccelType(Ty.getName(), std::make_pair(TyDIE, Flags));
}
- addToContextOwner(TyDIE, Ty.getContext());
return TyDIE;
}
/// addType - Add a new type attribute to the specified entity.
-void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
- if (!Ty.isType())
- return;
+void CompileUnit::addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute) {
+ assert(Ty && "Trying to add a type that doesn't exist?");
// Check for pre-existence.
DIEEntry *Entry = getDIEEntry(Ty);
// If it exists then use the existing value.
if (Entry) {
- Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+ addDIEEntry(Entity, Attribute, Entry);
return;
}
@@ -793,28 +936,105 @@ void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
// Set up proxy.
Entry = createDIEEntry(Buffer);
insertDIEEntry(Ty, Entry);
- Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+ addDIEEntry(Entity, Attribute, Entry);
// If this is a complete composite type then include it in the
// list of global types.
addGlobalType(Ty);
}
+// Accelerator table mutators - add each name along with its companion
+// DIE to the proper table while ensuring that the name that we're going
+// to reference is in the string table. We do this since the names we
+// add may not be identical to the names in the DIE.
+void CompileUnit::addAccelName(StringRef Name, DIE *Die) {
+ DU->getStringPoolEntry(Name);
+ std::vector<DIE *> &DIEs = AccelNames[Name];
+ DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelObjC(StringRef Name, DIE *Die) {
+ DU->getStringPoolEntry(Name);
+ std::vector<DIE *> &DIEs = AccelObjC[Name];
+ DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelNamespace(StringRef Name, DIE *Die) {
+ DU->getStringPoolEntry(Name);
+ std::vector<DIE *> &DIEs = AccelNamespace[Name];
+ DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
+ DU->getStringPoolEntry(Name);
+ std::vector<std::pair<DIE *, unsigned> > &DIEs = AccelTypes[Name];
+ DIEs.push_back(Die);
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void CompileUnit::addGlobalName(StringRef Name, DIE *Die, DIScope Context) {
+ std::string FullName = getParentContextString(Context) + Name.str();
+ GlobalNames[FullName] = Die;
+}
+
/// addGlobalType - Add a new global type to the compile unit.
///
void CompileUnit::addGlobalType(DIType Ty) {
- DIDescriptor Context = Ty.getContext();
- if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl()
- && (!Context || Context.isCompileUnit() || Context.isFile()
- || Context.isNameSpace()))
- if (DIEEntry *Entry = getDIEEntry(Ty))
- GlobalTypes[Ty.getName()] = Entry->getEntry();
+ DIScope Context = resolve(Ty.getContext());
+ if (!Ty.getName().empty() && !Ty.isForwardDecl() &&
+ (!Context || Context.isCompileUnit() || Context.isFile() ||
+ Context.isNameSpace()))
+ if (DIEEntry *Entry = getDIEEntry(Ty)) {
+ std::string FullName =
+ getParentContextString(Context) + Ty.getName().str();
+ GlobalTypes[FullName] = Entry->getEntry();
+ }
+}
+
+/// getParentContextString - Walks the metadata parent chain in a language
+/// specific manner (using the compile unit language) and returns
+/// it as a string. This is done at the metadata level because DIEs may
+/// not currently have been added to the parent context and walking the
+/// DIEs looking for names is more expensive than walking the metadata.
+std::string CompileUnit::getParentContextString(DIScope Context) const {
+ if (!Context)
+ return "";
+
+ // FIXME: Decide whether to implement this for non-C++ languages.
+ if (getLanguage() != dwarf::DW_LANG_C_plus_plus)
+ return "";
+
+ std::string CS;
+ SmallVector<DIScope, 1> Parents;
+ while (!Context.isCompileUnit()) {
+ Parents.push_back(Context);
+ if (Context.getContext())
+ Context = resolve(Context.getContext());
+ else
+ // Structure, etc. types will have a NULL context if they're at the top
+ // level.
+ break;
+ }
+
+ // Reverse iterate over our list to go from the outermost construct to the
+ // innermost.
+ for (SmallVectorImpl<DIScope>::reverse_iterator I = Parents.rbegin(),
+ E = Parents.rend();
+ I != E; ++I) {
+ DIScope Ctx = *I;
+ StringRef Name = Ctx.getName();
+ if (!Name.empty()) {
+ CS += Name;
+ CS += "::";
+ }
+ }
+ return CS;
}
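getParentContextString deliberately walks the metadata scope chain rather than DIEs, collecting scopes outward and then printing them outermost-first with "::" between named scopes. A rough stand-alone sketch over a hypothetical parent-linked Scope struct (not DIScope):

#include <string>
#include <vector>

struct Scope {
  std::string Name;     // empty for anonymous scopes
  const Scope *Parent;  // nullptr at the top level / compile unit
};

std::string parentContextString(const Scope *Ctx) {
  std::vector<const Scope *> Parents;
  for (; Ctx; Ctx = Ctx->Parent)   // walk from the innermost scope outward
    Parents.push_back(Ctx);
  std::string CS;
  // Reverse iterate so the outermost scope is printed first.
  for (auto I = Parents.rbegin(), E = Parents.rend(); I != E; ++I) {
    if (!(*I)->Name.empty()) {
      CS += (*I)->Name;
      CS += "::";
    }
  }
  return CS;
}

For `namespace A { struct B { ... }; }` the context of B yields "A::", so the pubnames/pubtypes keys built above come out as "A::B" for the type and "A::B::<name>" for entities declared inside it.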
-/// addPubTypes - Add type for pubtypes section.
+/// addPubTypes - Add subprogram argument types for pubtypes section.
void CompileUnit::addPubTypes(DISubprogram SP) {
DICompositeType SPTy = SP.getType();
- unsigned SPTag = SPTy.getTag();
+ uint16_t SPTag = SPTy.getTag();
if (SPTag != dwarf::DW_TAG_subroutine_type)
return;
@@ -835,18 +1055,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
if (!Name.empty())
addString(&Buffer, dwarf::DW_AT_name, Name);
- if (BTy.getTag() == dwarf::DW_TAG_unspecified_type) {
- Buffer.setTag(dwarf::DW_TAG_unspecified_type);
- // An unspecified type only has a name attribute.
+ // An unspecified type only has a name attribute.
+ if (BTy.getTag() == dwarf::DW_TAG_unspecified_type)
return;
- }
- Buffer.setTag(dwarf::DW_TAG_base_type);
addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
BTy.getEncoding());
uint64_t Size = BTy.getSizeInBits() >> 3;
- addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+ addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
}
/// constructTypeDIE - Construct derived type die from DIDerivedType.
@@ -854,16 +1071,12 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
// Get core information.
StringRef Name = DTy.getName();
uint64_t Size = DTy.getSizeInBits() >> 3;
- unsigned Tag = DTy.getTag();
-
- // FIXME - Workaround for templates.
- if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
-
- Buffer.setTag(Tag);
+ uint16_t Tag = Buffer.getTag();
// Map to main type, void will not have a type.
- DIType FromTy = DTy.getTypeDerivedFrom();
- addType(&Buffer, FromTy);
+ DIType FromTy = resolve(DTy.getTypeDerivedFrom());
+ if (FromTy)
+ addType(&Buffer, FromTy);
// Add name if not anonymous or intermediate type.
if (!Name.empty())
@@ -871,11 +1084,11 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
// Add size if non-zero (derived types might be zero-sized.)
if (Size && Tag != dwarf::DW_TAG_pointer_type)
- addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+ addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
if (Tag == dwarf::DW_TAG_ptr_to_member_type)
- addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
- getOrCreateTypeDIE(DTy.getClassType()));
+ addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
+ getOrCreateTypeDIE(resolve(DTy.getClassType())));
// Add source line info if available and TyDesc is not a forward declaration.
if (!DTy.isForwardDecl())
addSourceLine(&Buffer, DTy);
@@ -883,20 +1096,20 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
/// Return true if the type is appropriately scoped to be contained inside
/// its own type unit.
-static bool isTypeUnitScoped(DIType Ty) {
- DIScope Parent = Ty.getContext();
+static bool isTypeUnitScoped(DIType Ty, const DwarfDebug *DD) {
+ DIScope Parent = DD->resolve(Ty.getContext());
while (Parent) {
// Don't generate a hash for anything scoped inside a function.
if (Parent.isSubprogram())
return false;
- Parent = Parent.getContext();
+ Parent = DD->resolve(Parent.getContext());
}
return true;
}
/// Return true if the type should be split out into a type unit.
-static bool shouldCreateTypeUnit(DICompositeType CTy) {
- unsigned Tag = CTy.getTag();
+static bool shouldCreateTypeUnit(DICompositeType CTy, const DwarfDebug *DD) {
+ uint16_t Tag = CTy.getTag();
switch (Tag) {
case dwarf::DW_TAG_structure_type:
@@ -904,13 +1117,11 @@ static bool shouldCreateTypeUnit(DICompositeType CTy) {
case dwarf::DW_TAG_enumeration_type:
case dwarf::DW_TAG_class_type:
// If this is a class, structure, union, or enumeration type
- // that is not a declaration, is a type definition, and not scoped
+ // that is a definition (not a declaration), and not scoped
// inside a function then separate this out as a type unit.
- if (CTy.isForwardDecl() || !isTypeUnitScoped(CTy))
- return 0;
- return 1;
+ return !CTy.isForwardDecl() && isTypeUnitScoped(CTy, DD);
default:
- return 0;
+ return false;
}
}
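Taken together, isTypeUnitScoped and shouldCreateTypeUnit restrict type units to class-like definitions that are not lexically nested inside any function. A rough restatement over a toy scope chain (both types here are hypothetical, not LLVM's):

struct ScopeNode {
  bool IsSubprogram;       // true for function scopes
  const ScopeNode *Parent; // nullptr at the top level
};

enum class Tag { Structure, Union, Enumeration, Class, Other };

bool shouldCreateTypeUnit(Tag T, bool IsForwardDecl, const ScopeNode *Scope) {
  switch (T) {
  case Tag::Structure:
  case Tag::Union:
  case Tag::Enumeration:
  case Tag::Class:
    break;                          // only class-like tags qualify
  default:
    return false;
  }
  if (IsForwardDecl)                // declarations are never split out
    return false;
  for (; Scope; Scope = Scope->Parent)
    if (Scope->IsSubprogram)        // function-local types stay in the CU
      return false;
  return true;
}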
@@ -920,69 +1131,47 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
StringRef Name = CTy.getName();
uint64_t Size = CTy.getSizeInBits() >> 3;
- unsigned Tag = CTy.getTag();
- Buffer.setTag(Tag);
+ uint16_t Tag = Buffer.getTag();
switch (Tag) {
case dwarf::DW_TAG_array_type:
- constructArrayTypeDIE(Buffer, &CTy);
+ constructArrayTypeDIE(Buffer, CTy);
break;
- case dwarf::DW_TAG_enumeration_type: {
- DIArray Elements = CTy.getTypeArray();
-
- // Add enumerators to enumeration type.
- for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
- DIE *ElemDie = NULL;
- DIDescriptor Enum(Elements.getElement(i));
- if (Enum.isEnumerator()) {
- ElemDie = constructEnumTypeDIE(DIEnumerator(Enum));
- Buffer.addChild(ElemDie);
- }
- }
- DIType DTy = CTy.getTypeDerivedFrom();
- if (DTy.isType()) {
- addType(&Buffer, DTy);
- addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1);
- }
- }
+ case dwarf::DW_TAG_enumeration_type:
+ constructEnumTypeDIE(Buffer, CTy);
break;
case dwarf::DW_TAG_subroutine_type: {
- // Add return type.
+ // Add return type. A void return won't have a type.
DIArray Elements = CTy.getTypeArray();
- DIDescriptor RTy = Elements.getElement(0);
- addType(&Buffer, DIType(RTy));
+ DIType RTy(Elements.getElement(0));
+ if (RTy)
+ addType(&Buffer, RTy);
bool isPrototyped = true;
// Add arguments.
for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
DIDescriptor Ty = Elements.getElement(i);
if (Ty.isUnspecifiedParameter()) {
- DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters);
- Buffer.addChild(Arg);
+ createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
isPrototyped = false;
} else {
- DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+ DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
addType(Arg, DIType(Ty));
if (DIType(Ty).isArtificial())
addFlag(Arg, dwarf::DW_AT_artificial);
- Buffer.addChild(Arg);
}
}
// Add prototype flag if we're dealing with a C language and the
// function has been prototyped.
+ uint16_t Language = getLanguage();
if (isPrototyped &&
- (Language == dwarf::DW_LANG_C89 ||
- Language == dwarf::DW_LANG_C99 ||
+ (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
Language == dwarf::DW_LANG_ObjC))
addFlag(&Buffer, dwarf::DW_AT_prototyped);
- }
- break;
+ } break;
case dwarf::DW_TAG_structure_type:
case dwarf::DW_TAG_union_type:
case dwarf::DW_TAG_class_type: {
- if (CTy.isForwardDecl())
- break;
-
// Add elements to structure type.
DIArray Elements = CTy.getTypeArray();
for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
@@ -990,7 +1179,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
DIE *ElemDie = NULL;
if (Element.isSubprogram()) {
DISubprogram SP(Element);
- ElemDie = getOrCreateSubprogramDIE(DISubprogram(Element));
+ ElemDie = getOrCreateSubprogramDIE(SP);
if (SP.isProtected())
addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
dwarf::DW_ACCESS_protected);
@@ -999,21 +1188,23 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
dwarf::DW_ACCESS_private);
else
addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
- dwarf::DW_ACCESS_public);
+ dwarf::DW_ACCESS_public);
if (SP.isExplicit())
addFlag(ElemDie, dwarf::DW_AT_explicit);
} else if (Element.isDerivedType()) {
DIDerivedType DDTy(Element);
if (DDTy.getTag() == dwarf::DW_TAG_friend) {
- ElemDie = new DIE(dwarf::DW_TAG_friend);
- addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend);
- } else if (DDTy.isStaticMember())
- ElemDie = createStaticMemberDIE(DDTy);
- else
- ElemDie = createMemberDIE(DDTy);
+ ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
+ addType(ElemDie, resolve(DDTy.getTypeDerivedFrom()),
+ dwarf::DW_AT_friend);
+ } else if (DDTy.isStaticMember()) {
+ getOrCreateStaticMemberDIE(DDTy);
+ } else {
+ constructMemberDIE(Buffer, DDTy);
+ }
} else if (Element.isObjCProperty()) {
DIObjCProperty Property(Element);
- ElemDie = new DIE(Property.getTag());
+ ElemDie = createAndAddDIE(Property.getTag(), Buffer);
StringRef PropertyName = Property.getObjCPropertyName();
addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
addType(ElemDie, Property.getType());
@@ -1038,8 +1229,8 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
if (Property.isNonAtomicObjCProperty())
PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_nonatomic;
if (PropertyAttributes)
- addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, 0,
- PropertyAttributes);
+ addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, None,
+ PropertyAttributes);
DIEEntry *Entry = getDIEEntry(Element);
if (!Entry) {
@@ -1048,18 +1239,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
}
} else
continue;
- Buffer.addChild(ElemDie);
}
if (CTy.isAppleBlockExtension())
addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
- DICompositeType ContainingType = CTy.getContainingType();
- if (DIDescriptor(ContainingType).isCompositeType())
- addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
- getOrCreateTypeDIE(DIType(ContainingType)));
- else
- addToContextOwner(&Buffer, CTy.getContext());
+ DICompositeType ContainingType(resolve(CTy.getContainingType()));
+ if (ContainingType)
+ addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
+ getOrCreateTypeDIE(ContainingType));
if (CTy.isObjcClassComplete())
addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
@@ -1067,8 +1255,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
// Add template parameters to a class, structure or union types.
// FIXME: The support isn't in the metadata for this yet.
if (Tag == dwarf::DW_TAG_class_type ||
- Tag == dwarf::DW_TAG_structure_type ||
- Tag == dwarf::DW_TAG_union_type)
+ Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
addTemplateParams(Buffer, CTy.getTemplateParams());
break;
@@ -1082,16 +1269,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
addString(&Buffer, dwarf::DW_AT_name, Name);
if (Tag == dwarf::DW_TAG_enumeration_type ||
- Tag == dwarf::DW_TAG_class_type ||
- Tag == dwarf::DW_TAG_structure_type ||
+ Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
Tag == dwarf::DW_TAG_union_type) {
// Add size if non-zero (derived types might be zero-sized.)
// TODO: Do we care about size for enum forward declarations?
if (Size)
- addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+ addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
else if (!CTy.isForwardDecl())
// Add zero size if it is not a forward declaration.
- addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+ addUInt(&Buffer, dwarf::DW_AT_byte_size, None, 0);
// If we're a forward decl, say so.
if (CTy.isForwardDecl())
@@ -1104,131 +1290,126 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
// No harm in adding the runtime language to the declaration.
unsigned RLang = CTy.getRunTimeLang();
if (RLang)
- addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
- dwarf::DW_FORM_data1, RLang);
+ addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
+ RLang);
}
// If this is a type applicable to a type unit it then add it to the
// list of types we'll compute a hash for later.
- if (shouldCreateTypeUnit(CTy))
+ if (shouldCreateTypeUnit(CTy, DD))
DD->addTypeUnitType(&Buffer);
}
-/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
-/// for the given DITemplateTypeParameter.
-DIE *
-CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
- DIE *ParamDIE = getDIE(TP);
- if (ParamDIE)
- return ParamDIE;
-
- ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
- addType(ParamDIE, TP.getType());
+/// constructTemplateTypeParameterDIE - Construct new DIE for the given
+/// DITemplateTypeParameter.
+void
+CompileUnit::constructTemplateTypeParameterDIE(DIE &Buffer,
+ DITemplateTypeParameter TP) {
+ DIE *ParamDIE =
+ createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);
+ // Add the type if it exists; it could be void and therefore have no type.
+ if (TP.getType())
+ addType(ParamDIE, resolve(TP.getType()));
if (!TP.getName().empty())
addString(ParamDIE, dwarf::DW_AT_name, TP.getName());
- return ParamDIE;
-}
-
-/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
-/// for the given DITemplateValueParameter.
-DIE *
-CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){
- DIE *ParamDIE = getDIE(TPV);
- if (ParamDIE)
- return ParamDIE;
-
- ParamDIE = new DIE(TPV.getTag());
- addType(ParamDIE, TPV.getType());
- if (!TPV.getName().empty())
- addString(ParamDIE, dwarf::DW_AT_name, TPV.getName());
- if (Value *Val = TPV.getValue()) {
+}
+
+/// constructTemplateValueParameterDIE - Construct new DIE for the given
+/// DITemplateValueParameter.
+void
+CompileUnit::constructTemplateValueParameterDIE(DIE &Buffer,
+ DITemplateValueParameter VP) {
+ DIE *ParamDIE = createAndAddDIE(VP.getTag(), Buffer);
+
+ // Add the type if there is one; template template parameters and template
+ // parameter packs will not have a type.
+ if (VP.getTag() == dwarf::DW_TAG_template_value_parameter)
+ addType(ParamDIE, resolve(VP.getType()));
+ if (!VP.getName().empty())
+ addString(ParamDIE, dwarf::DW_AT_name, VP.getName());
+ if (Value *Val = VP.getValue()) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Val))
- addConstantValue(ParamDIE, CI, TPV.getType().isUnsignedDIType());
+ addConstantValue(ParamDIE, CI,
+ isUnsignedDIType(DD, resolve(VP.getType())));
else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) {
// For declaration non-type template parameters (such as global values and
// functions)
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
- addOpAddress(Block, Asm->Mang->getSymbol(GV));
+ addOpAddress(Block, Asm->getSymbol(GV));
// Emit DW_OP_stack_value to use the address as the immediate value of the
// parameter, rather than a pointer to it.
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
- addBlock(ParamDIE, dwarf::DW_AT_location, 0, Block);
- } else if (TPV.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+ addBlock(ParamDIE, dwarf::DW_AT_location, Block);
+ } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
assert(isa<MDString>(Val));
addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
cast<MDString>(Val)->getString());
- } else if (TPV.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
+ } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
assert(isa<MDNode>(Val));
DIArray A(cast<MDNode>(Val));
addTemplateParams(*ParamDIE, A);
}
}
-
- return ParamDIE;
}
/// getOrCreateNameSpace - Create a DIE for DINameSpace.
DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(NS.getContext());
+
DIE *NDie = getDIE(NS);
if (NDie)
return NDie;
- NDie = new DIE(dwarf::DW_TAG_namespace);
- insertDIE(NS, NDie);
+ NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
+
if (!NS.getName().empty()) {
addString(NDie, dwarf::DW_AT_name, NS.getName());
addAccelNamespace(NS.getName(), NDie);
+ addGlobalName(NS.getName(), NDie, NS.getContext());
} else
addAccelNamespace("(anonymous namespace)", NDie);
addSourceLine(NDie, NS);
- addToContextOwner(NDie, NS.getContext());
return NDie;
}
/// getOrCreateSubprogramDIE - Create new DIE using SP.
DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE (as is the case for member function
+ // declarations).
+ DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext()));
+
DIE *SPDie = getDIE(SP);
if (SPDie)
return SPDie;
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
+ DISubprogram SPDecl = SP.getFunctionDeclaration();
+ if (SPDecl.isSubprogram())
+ // Add subprogram definitions to the CU die directly.
+ ContextDIE = CUDie.get();
// DW_TAG_inlined_subroutine may refer to this DIE.
- insertDIE(SP, SPDie);
+ SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
- DISubprogram SPDecl = SP.getFunctionDeclaration();
DIE *DeclDie = NULL;
- if (SPDecl.isSubprogram()) {
+ if (SPDecl.isSubprogram())
DeclDie = getOrCreateSubprogramDIE(SPDecl);
- }
-
- // Add to context owner.
- addToContextOwner(SPDie, SP.getContext());
// Add function template parameters.
addTemplateParams(*SPDie, SP.getTemplateParams());
- // Unfortunately this code needs to stay here instead of below the
- // AT_specification code in order to work around a bug in older
- // gdbs that requires the linkage name to resolve multiple template
- // functions.
- // TODO: Remove this set of code when we get rid of the old gdb
- // compatibility.
- StringRef LinkageName = SP.getLinkageName();
- if (!LinkageName.empty() && DD->useDarwinGDBCompat())
- addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
-
// If this DIE is going to refer declaration info using AT_specification
// then there is no need to add other attributes.
if (DeclDie) {
// Refer function declaration directly.
- addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
- DeclDie);
+ addDIEEntry(SPDie, dwarf::DW_AT_specification, DeclDie);
return SPDie;
}
// Add the linkage name if we have one.
- if (!LinkageName.empty() && !DD->useDarwinGDBCompat())
+ StringRef LinkageName = SP.getLinkageName();
+ if (!LinkageName.empty())
addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
GlobalValue::getRealLinkageName(LinkageName));
@@ -1240,29 +1421,31 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
// Add the prototype if we have a prototype and we have a C like
// language.
+ uint16_t Language = getLanguage();
if (SP.isPrototyped() &&
- (Language == dwarf::DW_LANG_C89 ||
- Language == dwarf::DW_LANG_C99 ||
+ (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
Language == dwarf::DW_LANG_ObjC))
addFlag(SPDie, dwarf::DW_AT_prototyped);
- // Add Return Type.
DICompositeType SPTy = SP.getType();
assert(SPTy.getTag() == dwarf::DW_TAG_subroutine_type &&
"the type of a subprogram should be a subroutine");
DIArray Args = SPTy.getTypeArray();
- addType(SPDie, DIType(Args.getElement(0)));
+ // Add a return type. If this is a type like a C/C++ void type, we don't add a
+ // return type.
+ if (Args.getElement(0))
+ addType(SPDie, DIType(Args.getElement(0)));
unsigned VK = SP.getVirtuality();
if (VK) {
addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
DIEBlock *Block = getDIEBlock();
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
- addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
- ContainingTypeMap.insert(std::make_pair(SPDie,
- SP.getContainingType()));
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(Block, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+ addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+ ContainingTypeMap.insert(
+ std::make_pair(SPDie, resolve(SP.getContainingType())));
}
if (!SP.isDefinition()) {
@@ -1270,13 +1453,12 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
// Add arguments. Do not add arguments for subprogram definition. They will
// be handled while processing variables.
- for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
- DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
- DIType ATy = DIType(Args.getElement(i));
+ for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+ DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, *SPDie);
+ DIType ATy(Args.getElement(i));
addType(Arg, ATy);
if (ATy.isArtificial())
addFlag(Arg, dwarf::DW_AT_artificial);
- SPDie->addChild(Arg);
}
}
@@ -1324,16 +1506,16 @@ static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
}
/// createGlobalVariableDIE - create global variable DIE.
-void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
+void CompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
+
// Check for pre-existence.
- if (getDIE(N))
+ if (getDIE(GV))
return;
- DIGlobalVariable GV(N);
if (!GV.isGlobalVariable())
return;
- DIDescriptor GVContext = GV.getContext();
+ DIScope GVContext = GV.getContext();
DIType GTy = GV.getType();
// If this is a static data member definition, some attributes belong
@@ -1344,35 +1526,30 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
if (SDMDecl.Verify()) {
assert(SDMDecl.isStaticMember() && "Expected static member decl");
// We need the declaration DIE that is in the static member's class.
- // But that class might not exist in the DWARF yet.
- // Creating the class will create the static member decl DIE.
- getOrCreateContextDIE(SDMDecl.getContext());
- VariableDIE = getDIE(SDMDecl);
- assert(VariableDIE && "Static member decl has no context?");
+ VariableDIE = getOrCreateStaticMemberDIE(SDMDecl);
IsStaticMember = true;
}
// If this is not a static data member definition, create the variable
// DIE and add the initial set of attributes to it.
if (!VariableDIE) {
- VariableDIE = new DIE(GV.getTag());
+ // Construct the context before querying for the existence of the DIE in
+ // case such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(GVContext);
+
// Add to map.
- insertDIE(N, VariableDIE);
+ VariableDIE = createAndAddDIE(GV.getTag(), *ContextDIE, GV);
// Add name and type.
addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
addType(VariableDIE, GTy);
// Add scoping info.
- if (!GV.isLocalToUnit()) {
+ if (!GV.isLocalToUnit())
addFlag(VariableDIE, dwarf::DW_AT_external);
- addGlobalName(GV.getName(), VariableDIE);
- }
// Add line number info.
addSourceLine(VariableDIE, GV);
- // Add to context owner.
- addToContextOwner(VariableDIE, GVContext);
}
// Add location.
@@ -1382,7 +1559,7 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
if (isGlobalVariable) {
addToAccelTable = true;
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
- const MCSymbol *Sym = Asm->Mang->getSymbol(GV.getGlobal());
+ const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal());
if (GV.getGlobal()->isThreadLocal()) {
// FIXME: Make this work with -gsplit-dwarf.
unsigned PointerSize = Asm->getDataLayout().getPointerSize();
@@ -1393,68 +1570,62 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
// Based on GCC's support for TLS:
if (!DD->useSplitDwarf()) {
// 1) Start with a constNu of the appropriate pointer size
- addUInt(Block, 0, dwarf::DW_FORM_data1,
+ addUInt(Block, dwarf::DW_FORM_data1,
PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u);
- // 2) containing the (relocated) address of the TLS variable
- addExpr(Block, 0, dwarf::DW_FORM_udata, Expr);
+ // 2) containing the (relocated) offset of the TLS variable
+ // within the module's TLS block.
+ addExpr(Block, dwarf::DW_FORM_udata, Expr);
} else {
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
- addUInt(Block, 0, dwarf::DW_FORM_udata, DU->getAddrPoolIndex(Expr));
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+ addUInt(Block, dwarf::DW_FORM_udata, DU->getAddrPoolIndex(Expr));
}
- // 3) followed by a custom OP to tell the debugger about TLS (presumably)
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_lo_user);
+ // 3) followed by a custom OP to make the debugger do a TLS lookup.
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
} else
addOpAddress(Block, Sym);
// Do not create specification DIE if context is either compile unit
// or a subprogram.
if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
- !GVContext.isFile() && !isSubprogramContext(GVContext)) {
+ !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) {
// Create specification DIE.
- VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
- addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
- dwarf::DW_FORM_ref4, VariableDIE);
- addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+ VariableSpecDIE = createAndAddDIE(dwarf::DW_TAG_variable, *CUDie);
+ addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification, VariableDIE);
+ addBlock(VariableSpecDIE, dwarf::DW_AT_location, Block);
// A static member's declaration is already flagged as such.
if (!SDMDecl.Verify())
addFlag(VariableDIE, dwarf::DW_AT_declaration);
- addDie(VariableSpecDIE);
} else {
- addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+ addBlock(VariableDIE, dwarf::DW_AT_location, Block);
}
- // Add linkage name.
+ // Add the linkage name.
StringRef LinkageName = GV.getLinkageName();
- if (!LinkageName.empty()) {
+ if (!LinkageName.empty())
// From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
// TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
// TAG_variable.
- addString(IsStaticMember && VariableSpecDIE ?
- VariableSpecDIE : VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+ addString(IsStaticMember && VariableSpecDIE ? VariableSpecDIE
+ : VariableDIE,
+ dwarf::DW_AT_MIPS_linkage_name,
GlobalValue::getRealLinkageName(LinkageName));
- // In compatibility mode with older gdbs we put the linkage name on both
- // the TAG_variable DIE and on the TAG_member DIE.
- if (IsStaticMember && VariableSpecDIE && DD->useDarwinGDBCompat())
- addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
- }
} else if (const ConstantInt *CI =
- dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
+ dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
// AT_const_value was added when the static member was created. To avoid
// emitting AT_const_value multiple times, we only add AT_const_value when
// it is not a static member.
if (!IsStaticMember)
- addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
- } else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
+ addConstantValue(VariableDIE, CI, isUnsignedDIType(DD, GTy));
+ } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getOperand(11))) {
addToAccelTable = true;
// GV is a merged global.
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
Value *Ptr = CE->getOperand(0);
- addOpAddress(Block, Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
- addUInt(Block, 0, dwarf::DW_FORM_udata,
- Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
- addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
- addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+ addOpAddress(Block, Asm->getSymbol(cast<GlobalValue>(Ptr)));
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+ addUInt(Block, dwarf::DW_FORM_udata,
+ Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
+ addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ addBlock(VariableDIE, dwarf::DW_AT_location, Block);
}
if (addToAccelTable) {
@@ -1466,13 +1637,17 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
addAccelName(GV.getLinkageName(), AddrDIE);
}
+
+ if (!GV.isLocalToUnit())
+ addGlobalName(GV.getName(), VariableSpecDIE ? VariableSpecDIE : VariableDIE,
+ GV.getContext());
}
/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
DIE *IndexTy) {
- DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
- addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+ DIE *DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
+ addDIEEntry(DW_Subrange, dwarf::DW_AT_type, IndexTy);
// The LowerBound value defines the lower bounds which is typically zero for
// C/C++. The Count value is the number of elements. Values are 64 bit. If
@@ -1485,26 +1660,22 @@ void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
int64_t Count = SR.getCount();
if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
- addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, LowerBound);
+ addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
if (Count != -1 && Count != 0)
// FIXME: An unbounded array should reference the expression that defines
// the array.
- addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, LowerBound + Count - 1);
-
- Buffer.addChild(DW_Subrange);
+ addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, None,
+ LowerBound + Count - 1);
}
/// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
- DICompositeType *CTy) {
- Buffer.setTag(dwarf::DW_TAG_array_type);
- if (CTy->isVector())
+void CompileUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) {
+ if (CTy.isVector())
addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
- // Emit derived type.
- addType(&Buffer, CTy->getTypeDerivedFrom());
- DIArray Elements = CTy->getTypeArray();
+ // Emit the element type.
+ addType(&Buffer, resolve(CTy.getTypeDerivedFrom()));
// Get an anonymous type for index type.
// FIXME: This type should be passed down from the front end
@@ -1512,16 +1683,16 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
DIE *IdxTy = getIndexTyDie();
if (!IdxTy) {
// Construct an anonymous type for index type.
- IdxTy = new DIE(dwarf::DW_TAG_base_type);
+ IdxTy = createAndAddDIE(dwarf::DW_TAG_base_type, *CUDie.get());
addString(IdxTy, dwarf::DW_AT_name, "int");
- addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+ addUInt(IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int32_t));
addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
dwarf::DW_ATE_signed);
- addDie(IdxTy);
setIndexTyDie(IdxTy);
}
// Add subranges to array type.
+ DIArray Elements = CTy.getTypeArray();
for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
DIDescriptor Element = Elements.getElement(i);
if (Element.getTag() == dwarf::DW_TAG_subrange_type)
@@ -1529,168 +1700,180 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
}
}
-/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
- DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
- StringRef Name = ETy.getName();
- addString(Enumerator, dwarf::DW_AT_name, Name);
- int64_t Value = ETy.getEnumValue();
- addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
- return Enumerator;
+/// constructEnumTypeDIE - Construct an enum type DIE from DICompositeType.
+void CompileUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) {
+ DIArray Elements = CTy.getTypeArray();
+
+ // Add enumerators to enumeration type.
+ for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+ DIEnumerator Enum(Elements.getElement(i));
+ if (Enum.isEnumerator()) {
+ DIE *Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
+ StringRef Name = Enum.getName();
+ addString(Enumerator, dwarf::DW_AT_name, Name);
+ int64_t Value = Enum.getEnumValue();
+ addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+ }
+ }
+ DIType DTy = resolve(CTy.getTypeDerivedFrom());
+ if (DTy) {
+ addType(&Buffer, DTy);
+ addFlag(&Buffer, dwarf::DW_AT_enum_class);
+ }
}
/// constructContainingTypeDIEs - Construct DIEs for types that contain
/// vtables.
void CompileUnit::constructContainingTypeDIEs() {
for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
- CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+ CE = ContainingTypeMap.end();
+ CI != CE; ++CI) {
DIE *SPDie = CI->first;
- const MDNode *N = CI->second;
- if (!N) continue;
- DIE *NDie = getDIE(N);
- if (!NDie) continue;
- addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+ DIDescriptor D(CI->second);
+ if (!D)
+ continue;
+ DIE *NDie = getDIE(D);
+ if (!NDie)
+ continue;
+ addDIEEntry(SPDie, dwarf::DW_AT_containing_type, NDie);
}
}
/// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *CompileUnit::constructVariableDIE(DbgVariable *DV,
- bool isScopeAbstract) {
- StringRef Name = DV->getName();
-
- // Translate tag to proper Dwarf tag.
- unsigned Tag = DV->getTag();
+DIE *CompileUnit::constructVariableDIE(DbgVariable &DV, bool isScopeAbstract) {
+ StringRef Name = DV.getName();
// Define variable debug information entry.
- DIE *VariableDie = new DIE(Tag);
- DbgVariable *AbsVar = DV->getAbstractVariable();
+ DIE *VariableDie = new DIE(DV.getTag());
+ DbgVariable *AbsVar = DV.getAbstractVariable();
DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : NULL;
if (AbsDIE)
- addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
- dwarf::DW_FORM_ref4, AbsDIE);
+ addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, AbsDIE);
else {
- addString(VariableDie, dwarf::DW_AT_name, Name);
- addSourceLine(VariableDie, DV->getVariable());
- addType(VariableDie, DV->getType());
+ if (!Name.empty())
+ addString(VariableDie, dwarf::DW_AT_name, Name);
+ addSourceLine(VariableDie, DV.getVariable());
+ addType(VariableDie, DV.getType());
}
- if (DV->isArtificial())
+ if (DV.isArtificial())
addFlag(VariableDie, dwarf::DW_AT_artificial);
if (isScopeAbstract) {
- DV->setDIE(VariableDie);
+ DV.setDIE(VariableDie);
return VariableDie;
}
// Add variable address.
- unsigned Offset = DV->getDotDebugLocOffset();
+ unsigned Offset = DV.getDotDebugLocOffset();
if (Offset != ~0U) {
- addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4,
+ addLabel(VariableDie, dwarf::DW_AT_location,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4,
Asm->GetTempSymbol("debug_loc", Offset));
- DV->setDIE(VariableDie);
+ DV.setDIE(VariableDie);
return VariableDie;
}
// Check if variable is described by a DBG_VALUE instruction.
- if (const MachineInstr *DVInsn = DV->getMInsn()) {
+ if (const MachineInstr *DVInsn = DV.getMInsn()) {
assert(DVInsn->getNumOperands() == 3);
if (DVInsn->getOperand(0).isReg()) {
const MachineOperand RegOp = DVInsn->getOperand(0);
// If the second operand is an immediate, this is an indirect value.
if (DVInsn->getOperand(1).isImm()) {
- MachineLocation Location(RegOp.getReg(), DVInsn->getOperand(1).getImm());
- addVariableAddress(*DV, VariableDie, Location);
+ MachineLocation Location(RegOp.getReg(),
+ DVInsn->getOperand(1).getImm());
+ addVariableAddress(DV, VariableDie, Location);
} else if (RegOp.getReg())
- addVariableAddress(*DV, VariableDie, MachineLocation(RegOp.getReg()));
+ addVariableAddress(DV, VariableDie, MachineLocation(RegOp.getReg()));
} else if (DVInsn->getOperand(0).isImm())
- addConstantValue(VariableDie, DVInsn->getOperand(0), DV->getType());
+ addConstantValue(VariableDie, DVInsn->getOperand(0), DV.getType());
else if (DVInsn->getOperand(0).isFPImm())
addConstantFPValue(VariableDie, DVInsn->getOperand(0));
else if (DVInsn->getOperand(0).isCImm())
addConstantValue(VariableDie, DVInsn->getOperand(0).getCImm(),
- DV->getType().isUnsignedDIType());
+ isUnsignedDIType(DD, DV.getType()));
- DV->setDIE(VariableDie);
+ DV.setDIE(VariableDie);
return VariableDie;
} else {
// .. else use frame index.
- int FI = DV->getFrameIndex();
+ int FI = DV.getFrameIndex();
if (FI != ~0) {
unsigned FrameReg = 0;
const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
- int Offset =
- TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+ int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
MachineLocation Location(FrameReg, Offset);
- addVariableAddress(*DV, VariableDie, Location);
+ addVariableAddress(DV, VariableDie, Location);
}
}
- DV->setDIE(VariableDie);
+ DV.setDIE(VariableDie);
return VariableDie;
}
-/// createMemberDIE - Create new member DIE.
-DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
- DIE *MemberDie = new DIE(DT.getTag());
+/// constructMemberDIE - Construct member DIE from DIDerivedType.
+void CompileUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) {
+ DIE *MemberDie = createAndAddDIE(DT.getTag(), Buffer);
StringRef Name = DT.getName();
if (!Name.empty())
addString(MemberDie, dwarf::DW_AT_name, Name);
- addType(MemberDie, DT.getTypeDerivedFrom());
+ addType(MemberDie, resolve(DT.getTypeDerivedFrom()));
addSourceLine(MemberDie, DT);
DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
- addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-
- uint64_t Size = DT.getSizeInBits();
- uint64_t FieldSize = DT.getOriginalTypeSize();
-
- if (Size != FieldSize) {
- // Handle bitfield.
- addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
- addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
-
- uint64_t Offset = DT.getOffsetInBits();
- uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
- uint64_t HiMark = (Offset + FieldSize) & AlignMask;
- uint64_t FieldOffset = (HiMark - FieldSize);
- Offset -= FieldOffset;
-
- // Maybe we need to work from the other end.
- if (Asm->getDataLayout().isLittleEndian())
- Offset = FieldSize - (Offset + Size);
- addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
-
- // Here WD_AT_data_member_location points to the anonymous
- // field that includes this bit field.
- addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
-
- } else
- // This is not a bitfield.
- addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
+ addUInt(MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
- if (DT.getTag() == dwarf::DW_TAG_inheritance
- && DT.isVirtual()) {
+ if (DT.getTag() == dwarf::DW_TAG_inheritance && DT.isVirtual()) {
// For C++, virtual base classes are not at fixed offset. Use following
// expression to extract appropriate offset from vtable.
// BaseAddr = ObAddr + *((*ObAddr) - Offset)
DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
- addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-
- addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
- VBaseLocationDie);
- } else
- addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_udata, DT.getOffsetInBits());
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+ addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
+ } else {
+ uint64_t Size = DT.getSizeInBits();
+ uint64_t FieldSize = getBaseTypeSize(DD, DT);
+ uint64_t OffsetInBytes;
+
+ if (Size != FieldSize) {
+ // Handle bitfield.
+ addUInt(MemberDie, dwarf::DW_AT_byte_size, None,
+ getBaseTypeSize(DD, DT) >> 3);
+ addUInt(MemberDie, dwarf::DW_AT_bit_size, None, DT.getSizeInBits());
+
+ uint64_t Offset = DT.getOffsetInBits();
+ uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+ uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+ uint64_t FieldOffset = (HiMark - FieldSize);
+ Offset -= FieldOffset;
+
+ // Maybe we need to work from the other end.
+ if (Asm->getDataLayout().isLittleEndian())
+ Offset = FieldSize - (Offset + Size);
+ addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, Offset);
+
+ // Here DW_AT_data_member_location points to the anonymous
+ // field that includes this bit field.
+ OffsetInBytes = FieldOffset >> 3;
+ } else
+ // This is not a bitfield.
+ OffsetInBytes = DT.getOffsetInBits() >> 3;
+ addUInt(MemberDie, dwarf::DW_AT_data_member_location, None, OffsetInBytes);
+ }
if (DT.isProtected())
addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
@@ -1714,17 +1897,26 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
if (DT.isArtificial())
addFlag(MemberDie, dwarf::DW_AT_artificial);
-
- return MemberDie;
}
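The bitfield branch of constructMemberDIE above keeps the old createMemberDIE arithmetic: it locates the aligned storage unit the field lives in, emits that unit's byte offset as DW_AT_data_member_location, and converts the field's position into the pre-DWARF-5 DW_AT_bit_offset convention (counted from the unit's most significant bit, hence the flip on little-endian targets). A self-contained sketch of just that arithmetic, with plain-integer inputs and a hypothetical BitfieldLoc result type:

#include <cstdint>

struct BitfieldLoc {
  uint64_t ByteOffset; // DW_AT_data_member_location, in bytes
  uint64_t BitOffset;  // DW_AT_bit_offset, from the unit's MSB
};

BitfieldLoc locateBitfield(uint64_t OffsetInBits, uint64_t SizeInBits,
                           uint64_t FieldSizeInBits, uint64_t AlignInBits,
                           bool LittleEndian) {
  uint64_t AlignMask = ~(AlignInBits - 1);
  uint64_t HiMark = (OffsetInBits + FieldSizeInBits) & AlignMask;
  uint64_t FieldOffset = HiMark - FieldSizeInBits; // start of the storage unit
  uint64_t Offset = OffsetInBits - FieldOffset;    // bit offset within that unit
  if (LittleEndian)                                // count from the other end
    Offset = FieldSizeInBits - (Offset + SizeInBits);
  return {FieldOffset >> 3, Offset};
}

For example, a 3-bit field starting at bit 5 of a 32-bit, 32-bit-aligned storage unit on a little-endian target gives ByteOffset 0 and BitOffset 24.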
-/// createStaticMemberDIE - Create new DIE for C++ static member.
-DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
+/// getOrCreateStaticMemberDIE - Find or create a DIE for a C++ static member.
+DIE *CompileUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) {
if (!DT.Verify())
return NULL;
- DIE *StaticMemberDIE = new DIE(DT.getTag());
- DIType Ty = DT.getTypeDerivedFrom();
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(resolve(DT.getContext()));
+ assert(dwarf::isType(ContextDIE->getTag()) &&
+ "Static member should belong to a type.");
+
+ DIE *StaticMemberDIE = getDIE(DT);
+ if (StaticMemberDIE)
+ return StaticMemberDIE;
+
+ StaticMemberDIE = createAndAddDIE(DT.getTag(), *ContextDIE, DT);
+
+ DIType Ty = resolve(DT.getTypeDerivedFrom());
addString(StaticMemberDIE, dwarf::DW_AT_name, DT.getName());
addType(StaticMemberDIE, Ty);
@@ -1745,10 +1937,20 @@ DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
dwarf::DW_ACCESS_public);
if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT.getConstant()))
- addConstantValue(StaticMemberDIE, CI, Ty.isUnsignedDIType());
+ addConstantValue(StaticMemberDIE, CI, isUnsignedDIType(DD, Ty));
if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT.getConstant()))
addConstantFPValue(StaticMemberDIE, CFP);
- insertDIE(DT, StaticMemberDIE);
return StaticMemberDIE;
}
+
+void CompileUnit::emitHeader(const MCSection *ASection,
+ const MCSymbol *ASectionSym) {
+ Asm->OutStreamer.AddComment("DWARF version number");
+ Asm->EmitInt16(DD->getDwarfVersion());
+ Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
+ Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
+ ASectionSym);
+ Asm->OutStreamer.AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+}
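emitHeader writes the tail of the compile unit header: a 2-byte DWARF version, a 4-byte offset into .debug_abbrev, and a 1-byte address size; the initial unit-length word is emitted separately. A rough sketch of the same layout into a raw buffer, assuming the 32-bit DWARF format and little-endian output (appendCompileUnitHeader is illustrative, not an LLVM API):

#include <cstdint>
#include <vector>

void appendCompileUnitHeader(std::vector<uint8_t> &Out, uint16_t DwarfVersion,
                             uint32_t AbbrevSectionOffset, uint8_t AddrSize) {
  Out.push_back(uint8_t(DwarfVersion));        // DWARF version number (2 bytes)
  Out.push_back(uint8_t(DwarfVersion >> 8));
  for (int i = 0; i != 4; ++i)                 // offset into the abbrev. section
    Out.push_back(uint8_t(AbbrevSectionOffset >> (8 * i)));
  Out.push_back(AddrSize);                     // address size, in bytes
}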
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 3908b37..d782c88 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,7 +15,9 @@
#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
#include "DIE.h"
+#include "DwarfDebug.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/DebugInfo.h"
@@ -23,8 +25,6 @@
namespace llvm {
-class DwarfDebug;
-class DwarfUnits;
class MachineLocation;
class MachineOperand;
class ConstantInt;
@@ -39,11 +39,10 @@ class CompileUnit {
///
unsigned UniqueID;
- /// Language - The DW_AT_language of the compile unit
- ///
- unsigned Language;
+ /// Node - MDNode for the compile unit.
+ DICompileUnit Node;
- /// Die - Compile unit debug information entry.
+ /// CUDie - Compile unit debug information entry.
///
const OwningPtr<DIE> CUDie;
@@ -67,18 +66,18 @@ class CompileUnit {
/// GlobalNames - A map of globally visible named entities for this unit.
///
- StringMap<DIE*> GlobalNames;
+ StringMap<DIE *> GlobalNames;
/// GlobalTypes - A map of globally visible types for this unit.
///
- StringMap<DIE*> GlobalTypes;
+ StringMap<DIE *> GlobalTypes;
/// AccelNames - A map of names for the name accelerator table.
///
- StringMap<std::vector<DIE*> > AccelNames;
- StringMap<std::vector<DIE*> > AccelObjC;
- StringMap<std::vector<DIE*> > AccelNamespace;
- StringMap<std::vector<std::pair<DIE*, unsigned> > > AccelTypes;
+ StringMap<std::vector<DIE *> > AccelNames;
+ StringMap<std::vector<DIE *> > AccelObjC;
+ StringMap<std::vector<DIE *> > AccelNamespace;
+ StringMap<std::vector<std::pair<DIE *, unsigned> > > AccelTypes;
/// DIEBlocks - A list of all the DIEBlocks in use.
std::vector<DIEBlock *> DIEBlocks;
@@ -88,165 +87,154 @@ class CompileUnit {
/// corresponds to the MDNode mapped with the subprogram DIE.
DenseMap<DIE *, const MDNode *> ContainingTypeMap;
- /// Offset of the CUDie from beginning of debug info section.
- unsigned DebugInfoOffset;
+ // DIEValueAllocator - All DIEValues are allocated through this allocator.
+ BumpPtrAllocator DIEValueAllocator;
- /// getLowerBoundDefault - Return the default lower bound for an array. If the
- /// DWARF version doesn't handle the language, return -1.
- int64_t getDefaultLowerBound() const;
+ // DIEIntegerOne - A preallocated DIEValue because 1 is used frequently.
+ DIEInteger *DIEIntegerOne;
public:
- CompileUnit(unsigned UID, unsigned L, DIE *D, const MDNode *N, AsmPrinter *A,
+ CompileUnit(unsigned UID, DIE *D, DICompileUnit CU, AsmPrinter *A,
DwarfDebug *DW, DwarfUnits *DWU);
~CompileUnit();
// Accessors.
- unsigned getUniqueID() const { return UniqueID; }
- unsigned getLanguage() const { return Language; }
- DIE* getCUDie() const { return CUDie.get(); }
- unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
- const StringMap<DIE*> &getGlobalNames() const { return GlobalNames; }
- const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
-
- const StringMap<std::vector<DIE*> > &getAccelNames() const {
+ unsigned getUniqueID() const { return UniqueID; }
+ uint16_t getLanguage() const { return Node.getLanguage(); }
+ DICompileUnit getNode() const { return Node; }
+ DIE *getCUDie() const { return CUDie.get(); }
+ const StringMap<DIE *> &getGlobalNames() const { return GlobalNames; }
+ const StringMap<DIE *> &getGlobalTypes() const { return GlobalTypes; }
+
+ const StringMap<std::vector<DIE *> > &getAccelNames() const {
return AccelNames;
}
- const StringMap<std::vector<DIE*> > &getAccelObjC() const {
+ const StringMap<std::vector<DIE *> > &getAccelObjC() const {
return AccelObjC;
}
- const StringMap<std::vector<DIE*> > &getAccelNamespace() const {
+ const StringMap<std::vector<DIE *> > &getAccelNamespace() const {
return AccelNamespace;
}
- const StringMap<std::vector<std::pair<DIE*, unsigned > > >
- &getAccelTypes() const {
+ const StringMap<std::vector<std::pair<DIE *, unsigned> > > &
+ getAccelTypes() const {
return AccelTypes;
}
+ unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
+
/// hasContent - Return true if this compile unit has something to write out.
///
bool hasContent() const { return !CUDie->getChildren().empty(); }
+ /// getParentContextString - Get a string containing the language specific
+ /// context for a global name.
+ std::string getParentContextString(DIScope Context) const;
+
/// addGlobalName - Add a new global entity to the compile unit.
///
- void addGlobalName(StringRef Name, DIE *Die) { GlobalNames[Name] = Die; }
+ void addGlobalName(StringRef Name, DIE *Die, DIScope Context);
/// addGlobalType - Add a new global type to the compile unit.
///
void addGlobalType(DIType Ty);
+ /// addPubTypes - Add a set of types from the subprogram to the global types.
+ void addPubTypes(DISubprogram SP);
/// addAccelName - Add a new name to the name accelerator table.
- void addAccelName(StringRef Name, DIE *Die) {
- std::vector<DIE*> &DIEs = AccelNames[Name];
- DIEs.push_back(Die);
- }
- void addAccelObjC(StringRef Name, DIE *Die) {
- std::vector<DIE*> &DIEs = AccelObjC[Name];
- DIEs.push_back(Die);
- }
- void addAccelNamespace(StringRef Name, DIE *Die) {
- std::vector<DIE*> &DIEs = AccelNamespace[Name];
- DIEs.push_back(Die);
- }
- void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
- std::vector<std::pair<DIE *, unsigned> > &DIEs = AccelTypes[Name];
- DIEs.push_back(Die);
- }
+ void addAccelName(StringRef Name, DIE *Die);
- /// getDIE - Returns the debug information entry map slot for the
- /// specified debug variable.
- DIE *getDIE(const MDNode *N) const { return MDNodeToDieMap.lookup(N); }
+ /// addAccelObjC - Add a new name to the ObjC accelerator table.
+ void addAccelObjC(StringRef Name, DIE *Die);
- DIEBlock *getDIEBlock() {
- return new (DIEValueAllocator) DIEBlock();
- }
+ /// addAccelNamespace - Add a new name to the namespace accelerator table.
+ void addAccelNamespace(StringRef Name, DIE *Die);
- /// insertDIE - Insert DIE into the map.
- void insertDIE(const MDNode *N, DIE *D) {
- MDNodeToDieMap.insert(std::make_pair(N, D));
- }
+ /// addAccelType - Add a new type to the type accelerator table.
+ void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die);
- /// getDIEEntry - Returns the debug information entry for the specified
- /// debug variable.
- DIEEntry *getDIEEntry(const MDNode *N) const {
- return MDNodeToDIEEntryMap.lookup(N);
- }
+ /// getDIE - Returns the debug information entry map slot for the
+ /// specified debug variable. We delegate the request to DwarfDebug
+ /// when the MDNode can be part of the type system, since DIEs for
+ /// the type system can be shared across CUs and the mappings are
+ /// kept in DwarfDebug.
+ DIE *getDIE(DIDescriptor D) const;
- /// insertDIEEntry - Insert debug information entry into the map.
- void insertDIEEntry(const MDNode *N, DIEEntry *E) {
- MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
- }
+ DIEBlock *getDIEBlock() { return new (DIEValueAllocator) DIEBlock(); }
+
+ /// insertDIE - Insert DIE into the map. We delegate the request to DwarfDebug
+ /// when the MDNode can be part of the type system, since DIEs for
+ /// the type system can be shared across CUs and the mappings are
+ /// kept in DwarfDebug.
+ void insertDIE(DIDescriptor Desc, DIE *D);
/// addDie - Adds or interns the DIE to the compile unit.
///
- void addDie(DIE *Buffer) {
- this->CUDie->addChild(Buffer);
- }
-
- // getIndexTyDie - Get an anonymous type for index type.
- DIE *getIndexTyDie() {
- return IndexTyDie;
- }
-
- // setIndexTyDie - Set D as anonymous type for index which can be reused
- // later.
- void setIndexTyDie(DIE *D) {
- IndexTyDie = D;
- }
+ void addDie(DIE *Buffer) { CUDie->addChild(Buffer); }
/// addFlag - Add a flag that is true to the DIE.
- void addFlag(DIE *Die, unsigned Attribute);
+ void addFlag(DIE *Die, dwarf::Attribute Attribute);
/// addUInt - Add an unsigned integer attribute data and value.
///
- void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+ void addUInt(DIE *Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
+ uint64_t Integer);
+
+ void addUInt(DIEBlock *Block, dwarf::Form Form, uint64_t Integer);
  /// addSInt - Add a signed integer attribute data and value.
///
- void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+ void addSInt(DIE *Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
+ int64_t Integer);
+
+ void addSInt(DIEBlock *Die, Optional<dwarf::Form> Form, int64_t Integer);
/// addString - Add a string attribute data and value.
///
- void addString(DIE *Die, unsigned Attribute, const StringRef Str);
+ void addString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str);
/// addLocalString - Add a string attribute data and value.
///
- void addLocalString(DIE *Die, unsigned Attribute, const StringRef Str);
+ void addLocalString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str);
/// addExpr - Add a Dwarf expression attribute data and value.
///
- void addExpr(DIE *Die, unsigned Attribute, unsigned Form,
- const MCExpr *Expr);
+ void addExpr(DIEBlock *Die, dwarf::Form Form, const MCExpr *Expr);
/// addLabel - Add a Dwarf label attribute data and value.
///
- void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+ void addLabel(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form,
const MCSymbol *Label);
+ void addLabel(DIEBlock *Die, dwarf::Form Form, const MCSymbol *Label);
+
/// addLabelAddress - Add a dwarf label attribute data and value using
/// either DW_FORM_addr or DW_FORM_GNU_addr_index.
///
- void addLabelAddress(DIE *Die, unsigned Attribute, MCSymbol *Label);
+ void addLabelAddress(DIE *Die, dwarf::Attribute Attribute, MCSymbol *Label);
/// addOpAddress - Add a dwarf op address data and value using the
/// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
///
- void addOpAddress(DIE *Die, const MCSymbol *Label);
- void addOpAddress(DIE *Die, const MCSymbolRefExpr *Label);
+ void addOpAddress(DIEBlock *Die, const MCSymbol *Label);
/// addDelta - Add a label delta attribute data and value.
///
- void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
- const MCSymbol *Hi, const MCSymbol *Lo);
+ void addDelta(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form, const MCSymbol *Hi,
+ const MCSymbol *Lo);
/// addDIEEntry - Add a DIE attribute data and value.
///
- void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+ void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIE *Entry);
+
+ /// addDIEEntry - Add a DIE attribute data and value.
+ ///
+ void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIEEntry *Entry);
/// addBlock - Add block data.
///
- void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+ void addBlock(DIE *Die, dwarf::Attribute Attribute, DIEBlock *Block);
/// addSourceLine - Add location information to specified debug information
/// entry.
@@ -259,8 +247,8 @@ public:
/// addAddress - Add an address attribute to a die based on the location
/// provided.
- void addAddress(DIE *Die, unsigned Attribute,
- const MachineLocation &Location, bool Indirect = false);
+ void addAddress(DIE *Die, dwarf::Attribute Attribute, const MachineLocation &Location,
+ bool Indirect = false);
/// addConstantValue - Add constant value entry in variable DIE.
void addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
@@ -275,17 +263,17 @@ public:
void addTemplateParams(DIE &Buffer, DIArray TParams);
/// addRegisterOp - Add register operand.
- void addRegisterOp(DIE *TheDie, unsigned Reg);
+ void addRegisterOp(DIEBlock *TheDie, unsigned Reg);
/// addRegisterOffset - Add register offset.
- void addRegisterOffset(DIE *TheDie, unsigned Reg, int64_t Offset);
+ void addRegisterOffset(DIEBlock *TheDie, unsigned Reg, int64_t Offset);
/// addComplexAddress - Start with the address based on the location provided,
/// and generate the DWARF information necessary to find the actual variable
/// (navigating the extra location information encoded in the type) based on
/// the starting location. Add the DWARF information to the die.
///
- void addComplexAddress(const DbgVariable &DV, DIE *Die, unsigned Attribute,
+ void addComplexAddress(const DbgVariable &DV, DIE *Die, dwarf::Attribute Attribute,
const MachineLocation &Location);
// FIXME: Should be reformulated in terms of addComplexAddress.
@@ -295,7 +283,7 @@ public:
/// starting location. Add the DWARF information to the die. Obsolete,
/// please use addComplexAddress instead.
///
- void addBlockByrefAddress(const DbgVariable &DV, DIE *Die, unsigned Attribute,
+ void addBlockByrefAddress(const DbgVariable &DV, DIE *Die, dwarf::Attribute Attribute,
const MachineLocation &Location);
/// addVariableAddress - Add DW_AT_location attribute for a
@@ -303,13 +291,10 @@ public:
void addVariableAddress(const DbgVariable &DV, DIE *Die,
MachineLocation Location);
- /// addToContextOwner - Add Die into the list of its context owner's children.
- void addToContextOwner(DIE *Die, DIDescriptor Context);
-
/// addType - Add a new type attribute to the specified entity. This takes
  /// an attribute parameter because DW_AT_friend attributes are also
/// type references.
- void addType(DIE *Entity, DIType Ty, unsigned Attribute = dwarf::DW_AT_type);
+ void addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute = dwarf::DW_AT_type);
/// getOrCreateNameSpace - Create a DIE for DINameSpace.
DIE *getOrCreateNameSpace(DINameSpace NS);
@@ -321,66 +306,103 @@ public:
/// given DIType.
DIE *getOrCreateTypeDIE(const MDNode *N);
- /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
- /// for the given DITemplateTypeParameter.
- DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
+ /// getOrCreateContextDIE - Get context owner's DIE.
+ DIE *getOrCreateContextDIE(DIScope Context);
- /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create
- /// new DIE for the given DITemplateValueParameter.
- DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
+ /// createGlobalVariableDIE - create global variable DIE.
+ void createGlobalVariableDIE(DIGlobalVariable GV);
- /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
- /// information entry.
- DIEEntry *createDIEEntry(DIE *Entry);
+ /// constructContainingTypeDIEs - Construct DIEs for types that contain
+ /// vtables.
+ void constructContainingTypeDIEs();
- /// createGlobalVariableDIE - create global variable DIE.
- void createGlobalVariableDIE(const MDNode *N);
+ /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+ DIE *constructVariableDIE(DbgVariable &DV, bool isScopeAbstract);
+
+ /// Create a DIE with the given Tag, add the DIE to its parent, and
+ /// call insertDIE if MD is not null.
+ DIE *createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N = DIDescriptor());
+
+ /// Compute the size of a header for this unit, not including the initial
+ /// length field.
+ unsigned getHeaderSize() const {
+ return sizeof(int16_t) + // DWARF version number
+ sizeof(int32_t) + // Offset Into Abbrev. Section
+ sizeof(int8_t); // Pointer Size (in bytes)
+ }
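
For reference, a standalone sketch (not part of this patch; names are illustrative): the unit header that follows the initial length field is a 2-byte DWARF version, a 4-byte offset into the abbreviations section, and a 1-byte address size, i.e. 7 bytes for 32-bit DWARF.

    #include <cassert>
    #include <cstdint>

    // Size of a DWARF compile-unit header after the initial length field:
    // 2-byte version + 4-byte .debug_abbrev offset + 1-byte address size.
    constexpr unsigned headerSizeAfterLength() {
      return sizeof(int16_t) + sizeof(int32_t) + sizeof(int8_t); // 2 + 4 + 1
    }

    int main() {
      assert(headerSizeAfterLength() == 7);
      return 0;
    }
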
- void addPubTypes(DISubprogram SP);
+ /// Emit the header for this unit, not including the initial length field.
+ void emitHeader(const MCSection *ASection, const MCSymbol *ASectionSym);
+private:
/// constructTypeDIE - Construct basic type die from DIBasicType.
- void constructTypeDIE(DIE &Buffer,
- DIBasicType BTy);
+ void constructTypeDIE(DIE &Buffer, DIBasicType BTy);
/// constructTypeDIE - Construct derived type die from DIDerivedType.
- void constructTypeDIE(DIE &Buffer,
- DIDerivedType DTy);
+ void constructTypeDIE(DIE &Buffer, DIDerivedType DTy);
/// constructTypeDIE - Construct type DIE from DICompositeType.
- void constructTypeDIE(DIE &Buffer,
- DICompositeType CTy);
+ void constructTypeDIE(DIE &Buffer, DICompositeType CTy);
/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
/// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
- void constructArrayTypeDIE(DIE &Buffer,
- DICompositeType *CTy);
+ void constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy);
/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
- DIE *constructEnumTypeDIE(DIEnumerator ETy);
+ void constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy);
- /// constructContainingTypeDIEs - Construct DIEs for types that contain
- /// vtables.
- void constructContainingTypeDIEs();
+ /// constructMemberDIE - Construct member DIE from DIDerivedType.
+ void constructMemberDIE(DIE &Buffer, DIDerivedType DT);
- /// constructVariableDIE - Construct a DIE for the given DbgVariable.
- DIE *constructVariableDIE(DbgVariable *DV, bool isScopeAbstract);
+ /// constructTemplateTypeParameterDIE - Construct new DIE for the given
+ /// DITemplateTypeParameter.
+ void constructTemplateTypeParameterDIE(DIE &Buffer,
+ DITemplateTypeParameter TP);
- /// createMemberDIE - Create new member DIE.
- DIE *createMemberDIE(DIDerivedType DT);
+ /// constructTemplateValueParameterDIE - Construct new DIE for the given
+ /// DITemplateValueParameter.
+ void constructTemplateValueParameterDIE(DIE &Buffer,
+ DITemplateValueParameter TVP);
- /// createStaticMemberDIE - Create new static data member DIE.
- DIE *createStaticMemberDIE(DIDerivedType DT);
+ /// getOrCreateStaticMemberDIE - Create new static data member DIE.
+ DIE *getOrCreateStaticMemberDIE(DIDerivedType DT);
- /// getOrCreateContextDIE - Get context owner's DIE.
- DIE *getOrCreateContextDIE(DIDescriptor Context);
+ /// Offset of the CUDie from beginning of debug info section.
+ unsigned DebugInfoOffset;
-private:
+  /// getDefaultLowerBound - Return the default lower bound for an array. If the
+ /// DWARF version doesn't handle the language, return -1.
+ int64_t getDefaultLowerBound() const;
- // DIEValueAllocator - All DIEValues are allocated through this allocator.
- BumpPtrAllocator DIEValueAllocator;
- DIEInteger *DIEIntegerOne;
+ /// getDIEEntry - Returns the debug information entry for the specified
+ /// debug variable.
+ DIEEntry *getDIEEntry(const MDNode *N) const {
+ return MDNodeToDIEEntryMap.lookup(N);
+ }
+
+ /// insertDIEEntry - Insert debug information entry into the map.
+ void insertDIEEntry(const MDNode *N, DIEEntry *E) {
+ MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
+ }
+
+ // getIndexTyDie - Get an anonymous type for index type.
+ DIE *getIndexTyDie() { return IndexTyDie; }
+
+ // setIndexTyDie - Set D as anonymous type for index which can be reused
+ // later.
+ void setIndexTyDie(DIE *D) { IndexTyDie = D; }
+
+ /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+ /// information entry.
+ DIEEntry *createDIEEntry(DIE *Entry);
+
+ /// resolve - Look in the DwarfDebug map for the MDNode that
+ /// corresponds to the reference.
+ template <typename T> T resolve(DIRef<T> Ref) const {
+ return DD->resolve(Ref);
+ }
};
} // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 979c0c3..24e2c05 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -14,6 +14,7 @@
#define DEBUG_TYPE "dwarfdebug"
#include "DwarfDebug.h"
#include "DIE.h"
+#include "DIEHash.h"
#include "DwarfAccelTable.h"
#include "DwarfCompileUnit.h"
#include "llvm/ADT/STLExtras.h"
@@ -34,6 +35,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/MD5.h"
@@ -57,15 +59,20 @@ static cl::opt<bool> UnknownLocations(
cl::init(false));
static cl::opt<bool>
-GenerateDwarfPubNamesSection("generate-dwarf-pubnames", cl::Hidden,
- cl::init(false),
- cl::desc("Generate DWARF pubnames section"));
-
-static cl::opt<bool>
GenerateODRHash("generate-odr-hash", cl::Hidden,
cl::desc("Add an ODR hash to external type DIEs."),
cl::init(false));
+static cl::opt<bool>
+GenerateCUHash("generate-cu-hash", cl::Hidden,
+ cl::desc("Add the CU hash as the dwo_id."),
+ cl::init(false));
+
+static cl::opt<bool>
+GenerateGnuPubSections("generate-gnu-dwarf-pub-sections", cl::Hidden,
+ cl::desc("Generate GNU-style pubnames and pubtypes"),
+ cl::init(false));
+
namespace {
enum DefaultOnOff {
Default,
@@ -83,14 +90,6 @@ DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
cl::init(Default));
static cl::opt<DefaultOnOff>
-DarwinGDBCompat("darwin-gdb-compat", cl::Hidden,
- cl::desc("Compatibility with Darwin gdb."),
- cl::values(clEnumVal(Default, "Default for platform"),
- clEnumVal(Enable, "Enabled"),
- clEnumVal(Disable, "Disabled"), clEnumValEnd),
- cl::init(Default));
-
-static cl::opt<DefaultOnOff>
SplitDwarf("split-dwarf", cl::Hidden,
cl::desc("Output prototype dwarf split debug info."),
cl::values(clEnumVal(Default, "Default for platform"),
@@ -98,16 +97,16 @@ SplitDwarf("split-dwarf", cl::Hidden,
clEnumVal(Disable, "Disabled"), clEnumValEnd),
cl::init(Default));
-namespace {
- const char *const DWARFGroupName = "DWARF Emission";
- const char *const DbgTimerName = "DWARF Debug Writer";
+static cl::opt<DefaultOnOff>
+DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
+ cl::desc("Generate DWARF pubnames and pubtypes sections"),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"), clEnumValEnd),
+ cl::init(Default));
- struct CompareFirst {
- template <typename T> bool operator()(const T &lhs, const T &rhs) const {
- return lhs.first < rhs.first;
- }
- };
-} // end anonymous namespace
+static const char *const DWARFGroupName = "DWARF Emission";
+static const char *const DbgTimerName = "DWARF Debug Writer";
//===----------------------------------------------------------------------===//
@@ -117,6 +116,13 @@ static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
namespace llvm {
+/// resolve - Look in the DwarfDebug map for the MDNode that
+/// corresponds to the reference.
+template <typename T>
+T DbgVariable::resolve(DIRef<T> Ref) const {
+ return DD->resolve(Ref);
+}
+
DIType DbgVariable::getType() const {
DIType Ty = Var.getType();
// FIXME: isBlockByrefVariable should be reformulated in terms of complex
@@ -147,21 +153,16 @@ DIType DbgVariable::getType() const {
the pointers and __Block_byref_x_VarName struct to find the actual
value of the variable. The function addBlockByrefType does this. */
DIType subType = Ty;
- unsigned tag = Ty.getTag();
+ uint16_t tag = Ty.getTag();
- if (tag == dwarf::DW_TAG_pointer_type) {
- DIDerivedType DTy = DIDerivedType(Ty);
- subType = DTy.getTypeDerivedFrom();
- }
-
- DICompositeType blockStruct = DICompositeType(subType);
- DIArray Elements = blockStruct.getTypeArray();
+ if (tag == dwarf::DW_TAG_pointer_type)
+ subType = resolve(DIDerivedType(Ty).getTypeDerivedFrom());
+ DIArray Elements = DICompositeType(subType).getTypeArray();
for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
- DIDescriptor Element = Elements.getElement(i);
- DIDerivedType DT = DIDerivedType(Element);
+ DIDerivedType DT(Elements.getElement(i));
if (getName() == DT.getName())
- return (DT.getTypeDerivedFrom());
+ return (resolve(DT.getTypeDerivedFrom()));
}
}
return Ty;
@@ -182,10 +183,10 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
AbbreviationsSet(InitAbbreviationsSetSize),
SourceIdMap(DIEValueAllocator),
PrevLabel(NULL), GlobalCUIndexCount(0),
- InfoHolder(A, &AbbreviationsSet, &Abbreviations, "info_string",
+ InfoHolder(A, &AbbreviationsSet, Abbreviations, "info_string",
DIEValueAllocator),
SkeletonAbbrevSet(InitAbbreviationsSetSize),
- SkeletonHolder(A, &SkeletonAbbrevSet, &SkeletonAbbrevs, "skel_string",
+ SkeletonHolder(A, &SkeletonAbbrevSet, SkeletonAbbrevs, "skel_string",
DIEValueAllocator) {
DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
@@ -195,29 +196,24 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = 0;
FunctionBeginSym = FunctionEndSym = 0;
- // Turn on accelerator tables and older gdb compatibility
- // for Darwin.
+ // Turn on accelerator tables for Darwin by default, pubnames by
+ // default for non-Darwin, and handle split dwarf.
bool IsDarwin = Triple(A->getTargetTriple()).isOSDarwin();
- if (DarwinGDBCompat == Default) {
- if (IsDarwin)
- IsDarwinGDBCompat = true;
- else
- IsDarwinGDBCompat = false;
- } else
- IsDarwinGDBCompat = DarwinGDBCompat == Enable ? true : false;
- if (DwarfAccelTables == Default) {
- if (IsDarwin)
- HasDwarfAccelTables = true;
- else
- HasDwarfAccelTables = false;
- } else
- HasDwarfAccelTables = DwarfAccelTables == Enable ? true : false;
+ if (DwarfAccelTables == Default)
+ HasDwarfAccelTables = IsDarwin;
+ else
+ HasDwarfAccelTables = DwarfAccelTables == Enable;
if (SplitDwarf == Default)
HasSplitDwarf = false;
else
- HasSplitDwarf = SplitDwarf == Enable ? true : false;
+ HasSplitDwarf = SplitDwarf == Enable;
+
+ if (DwarfPubSections == Default)
+ HasDwarfPubSections = !IsDarwin;
+ else
+ HasDwarfPubSections = DwarfPubSections == Enable;
DwarfVersion = getDwarfVersionFromModule(MMI->getModule());
@@ -226,8 +222,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
beginModule();
}
}
-DwarfDebug::~DwarfDebug() {
-}
// Switch to the specified MCSection and emit an assembler
// temporary label to it if SymbolStem is specified.
@@ -285,10 +279,10 @@ void DwarfUnits::assignAbbrevNumber(DIEAbbrev &Abbrev) {
// If it's newly added.
if (InSet == &Abbrev) {
// Add to abbreviation list.
- Abbreviations->push_back(&Abbrev);
+ Abbreviations.push_back(&Abbrev);
// Assign the vector position + 1 as its number.
- Abbrev.setNumber(Abbreviations->size());
+ Abbrev.setNumber(Abbreviations.size());
} else {
// Assign existing abbreviation number.
Abbrev.setNumber(InSet->getNumber());
@@ -302,12 +296,7 @@ static bool isObjCClass(StringRef Name) {
static bool hasObjCCategory(StringRef Name) {
if (!isObjCClass(Name)) return false;
- size_t pos = Name.find(')');
- if (pos != std::string::npos) {
- if (Name[pos+1] != ' ') return false;
- return true;
- }
- return false;
+ return Name.find(") ") != StringRef::npos;
}
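
The simplified check relies on a category reference such as "-[Class(Category) method]" always containing the two-character sequence ") ". A standalone sketch comparing the old and new tests, assuming the isObjCClass precondition already holds; the names below are illustrative, not the LLVM API.

    #include <cassert>
    #include <string>

    // Old form: find ')' and require the next character to be a space.
    static bool hasCategoryOld(const std::string &Name) {
      size_t Pos = Name.find(')');
      return Pos != std::string::npos && Pos + 1 < Name.size() &&
             Name[Pos + 1] == ' ';
    }

    // New form: a category selector always contains ") " somewhere.
    static bool hasCategoryNew(const std::string &Name) {
      return Name.find(") ") != std::string::npos;
    }

    int main() {
      const std::string WithCategory = "-[Class(Category) method]";
      const std::string WithoutCategory = "-[Class method]";
      assert(hasCategoryOld(WithCategory) && hasCategoryNew(WithCategory));
      assert(!hasCategoryOld(WithoutCategory) && !hasCategoryNew(WithoutCategory));
      return 0;
    }
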
static void getObjCClassCategory(StringRef In, StringRef &Class,
@@ -327,11 +316,20 @@ static StringRef getObjCMethodName(StringRef In) {
return In.slice(In.find(' ') + 1, In.find(']'));
}
+// Helper for sorting sections into a stable output order.
+static bool SectionSort(const MCSection *A, const MCSection *B) {
+ std::string LA = (A ? A->getLabelBeginName() : "");
+ std::string LB = (B ? B->getLabelBeginName() : "");
+ return LA < LB;
+}
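
SectionSort compares only the begin-label names, treating a null section as the empty string so it sorts first; the point is a stable section order across runs. A self-contained sketch of the same idea with a stand-in Section type (MCSection is not used here):

    #include <algorithm>
    #include <string>
    #include <vector>

    // Stand-in for a section: only the begin-label name matters for ordering.
    struct Section { std::string LabelBegin; };

    // Null sections sort as "" and therefore come first.
    static bool SectionLess(const Section *A, const Section *B) {
      std::string LA = A ? A->LabelBegin : "";
      std::string LB = B ? B->LabelBegin : "";
      return LA < LB;
    }

    int main() {
      Section Text{"Lsection_text"}, Data{"Lsection_data"};
      std::vector<const Section *> Sections = {&Text, nullptr, &Data};
      std::sort(Sections.begin(), Sections.end(), SectionLess);
      // Order is now: nullptr, &Data, &Text -- deterministic across runs.
      return 0;
    }
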
+
// Add the various names to the Dwarf accelerator table names.
+// TODO: Determine whether or not we should add names for programs
+// that do not have a DW_AT_name or DW_AT_linkage_name field - this
+// is only slightly different than the lookup of non-standard ObjC names.
static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
DIE* Die) {
if (!SP.isDefinition()) return;
-
TheCU->addAccelName(SP.getName(), Die);
// If the linkage name is different than the name, go ahead and output
@@ -352,30 +350,34 @@ static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
}
}
+/// isSubprogramContext - Return true if Context is either a subprogram
+/// or another context nested inside a subprogram.
+bool DwarfDebug::isSubprogramContext(const MDNode *Context) {
+ if (!Context)
+ return false;
+ DIDescriptor D(Context);
+ if (D.isSubprogram())
+ return true;
+ if (D.isType())
+ return isSubprogramContext(resolve(DIType(Context).getContext()));
+ return false;
+}
+
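
The recursion walks outward through enclosing type contexts until it reaches a subprogram or runs out of parents. A minimal stand-alone model of that walk; the Context struct below is illustrative, not a real debug-info node:

    #include <cassert>

    // Each node knows whether it is a subprogram and who its parent context is.
    struct Context {
      bool IsSubprogram;
      const Context *Parent;
    };

    // True if Ctx is a subprogram or nested (transitively) inside one.
    static bool isInsideSubprogram(const Context *Ctx) {
      if (!Ctx)
        return false;
      if (Ctx->IsSubprogram)
        return true;
      return isInsideSubprogram(Ctx->Parent);
    }

    int main() {
      Context SP{true, nullptr};         // a subprogram
      Context LocalType{false, &SP};     // a type defined inside it
      Context FileScope{false, nullptr}; // a file-level type
      assert(isInsideSubprogram(&LocalType));
      assert(!isInsideSubprogram(&FileScope));
      return 0;
    }
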
// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
// and DW_AT_high_pc attributes. If there are global variables in this
// scope then create and insert DIEs for these variables.
-DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
- const MDNode *SPNode) {
- DIE *SPDie = SPCU->getDIE(SPNode);
+DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU, DISubprogram SP) {
+ DIE *SPDie = SPCU->getDIE(SP);
assert(SPDie && "Unable to find subprogram DIE!");
- DISubprogram SP(SPNode);
// If we're updating an abstract DIE, then we will be adding the children and
// object pointer later on. But what we don't want to do is process the
// concrete DIE twice.
- DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode);
- if (AbsSPDIE) {
- bool InSameCU = (AbsSPDIE->getCompileUnit() == SPCU->getCUDie());
+ if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) {
// Pick up abstract subprogram DIE.
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
- // If AbsSPDIE belongs to a different CU, use DW_FORM_ref_addr instead of
- // DW_FORM_ref4.
- SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
- InSameCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
- AbsSPDIE);
- SPCU->addDie(SPDie);
+ SPDie = SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getCUDie());
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, AbsSPDIE);
} else {
DISubprogram SPDecl = SP.getFunctionDeclaration();
if (!SPDecl.isSubprogram()) {
@@ -384,32 +386,31 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
// function then gdb prefers the definition at top level and but does not
// expect specification DIE in parent function. So avoid creating
// specification DIE for a function defined inside a function.
- if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
- !SP.getContext().isFile() &&
- !isSubprogramContext(SP.getContext())) {
+ DIScope SPContext = resolve(SP.getContext());
+ if (SP.isDefinition() && !SPContext.isCompileUnit() &&
+ !SPContext.isFile() &&
+ !isSubprogramContext(SPContext)) {
SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
// Add arguments.
DICompositeType SPTy = SP.getType();
DIArray Args = SPTy.getTypeArray();
- unsigned SPTag = SPTy.getTag();
+ uint16_t SPTag = SPTy.getTag();
if (SPTag == dwarf::DW_TAG_subroutine_type)
for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
- DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
- DIType ATy = DIType(Args.getElement(i));
+ DIE *Arg =
+ SPCU->createAndAddDIE(dwarf::DW_TAG_formal_parameter, *SPDie);
+ DIType ATy(Args.getElement(i));
SPCU->addType(Arg, ATy);
if (ATy.isArtificial())
SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
if (ATy.isObjectPointer())
- SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
- dwarf::DW_FORM_ref4, Arg);
- SPDie->addChild(Arg);
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer, Arg);
}
DIE *SPDeclDie = SPDie;
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
- SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification,
- dwarf::DW_FORM_ref4, SPDeclDie);
- SPCU->addDie(SPDie);
+ SPDie =
+ SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getCUDie());
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, SPDeclDie);
}
}
}
@@ -431,18 +432,39 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
return SPDie;
}
+/// Check whether we should create a DIE for the given Scope, return true
+/// if we don't create a DIE (the corresponding DIE is null).
+bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) {
+ if (Scope->isAbstractScope())
+ return false;
+
+ // We don't create a DIE if there is no Range.
+ const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
+ if (Ranges.empty())
+ return true;
+
+ if (Ranges.size() > 1)
+ return false;
+
+ // We don't create a DIE if we have a single Range and the end label
+ // is null.
+ SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
+ MCSymbol *End = getLabelAfterInsn(RI->second);
+ return !End;
+}
+
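
In words: abstract scopes always get a DIE; a concrete scope is skipped when it covers no instruction ranges, or covers exactly one range whose end label was never emitted. A self-contained model of that predicate, with labels reduced to nullable pointers (illustrative types only):

    #include <cassert>
    #include <utility>
    #include <vector>

    struct Label {};

    static bool scopeDIEIsNull(
        bool IsAbstract,
        const std::vector<std::pair<Label *, Label *>> &Ranges) {
      if (IsAbstract)
        return false;
      if (Ranges.empty())
        return true;
      if (Ranges.size() > 1)
        return false;
      return Ranges.front().second == nullptr; // single range, missing end label
    }

    int main() {
      Label Begin, End;
      assert(scopeDIEIsNull(false, {}));                  // no ranges
      assert(scopeDIEIsNull(false, {{&Begin, nullptr}})); // end label is null
      assert(!scopeDIEIsNull(false, {{&Begin, &End}}));   // a real range
      assert(!scopeDIEIsNull(true, {}));                  // abstract scope
      return 0;
    }
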
// Construct new DW_TAG_lexical_block for this scope and attach
// DW_AT_low_pc/DW_AT_high_pc labels.
DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
LexicalScope *Scope) {
+ if (isLexicalScopeDIENull(Scope))
+ return 0;
+
DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block);
if (Scope->isAbstractScope())
return ScopeDIE;
const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
- if (Ranges.empty())
- return 0;
-
// If we have multiple ranges, emit them into the range section.
if (Ranges.size() > 1) {
// .debug_range section has not been laid out yet. Emit offset in
@@ -467,8 +489,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
MCSymbol *Start = getLabelBeforeInsn(RI->first);
MCSymbol *End = getLabelAfterInsn(RI->second);
-
- if (End == 0) return 0;
+ assert(End && "End label should not be null!");
assert(Start->isDefined() && "Invalid starting label for an inlined scope!");
assert(End->isDefined() && "Invalid end label for an inlined scope!");
@@ -498,8 +519,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
}
DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
- TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
- dwarf::DW_FORM_ref4, OriginDIE);
+ TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, OriginDIE);
if (Ranges.size() > 1) {
// .debug_range section has not been laid out yet. Emit offset in
@@ -535,26 +555,10 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
// Add the call site information to the DIE.
DILocation DL(Scope->getInlinedAt());
- TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0,
+ TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, None,
getOrCreateSourceID(DL.getFilename(), DL.getDirectory(),
TheCU->getUniqueID()));
- TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber());
-
- // Track the start label for this inlined function.
- //.debug_inlined section specification does not clearly state how
- // to emit inlined scopes that are split into multiple instruction ranges.
- // For now, use the first instruction range and emit low_pc/high_pc pair and
- // corresponding the .debug_inlined section entry for this pair.
- if (Asm->MAI->doesDwarfUseInlineInfoSection()) {
- MCSymbol *StartLabel = getLabelBeforeInsn(Ranges.begin()->first);
- InlineInfoMap::iterator I = InlineInfo.find(InlinedSP);
-
- if (I == InlineInfo.end()) {
- InlineInfo[InlinedSP].push_back(std::make_pair(StartLabel, ScopeDIE));
- InlinedSPNodes.push_back(InlinedSP);
- } else
- I->second.push_back(std::make_pair(StartLabel, ScopeDIE));
- }
+ TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber());
// Add name to the name table, we do this here because we're guaranteed
// to have concrete versions of our DW_TAG_inlined_subprogram nodes.
@@ -563,26 +567,16 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
return ScopeDIE;
}
-// Construct a DIE for this scope.
-DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
- if (!Scope || !Scope->getScopeNode())
- return NULL;
-
- DIScope DS(Scope->getScopeNode());
- // Early return to avoid creating dangling variable|scope DIEs.
- if (!Scope->getInlinedAt() && DS.isSubprogram() && Scope->isAbstractScope() &&
- !TheCU->getDIE(DS))
- return NULL;
-
- SmallVector<DIE *, 8> Children;
- DIE *ObjectPointer = NULL;
+DIE *DwarfDebug::createScopeChildrenDIE(CompileUnit *TheCU, LexicalScope *Scope,
+ SmallVectorImpl<DIE*> &Children) {
+ DIE *ObjectPointer = NULL;
// Collect arguments for current function.
if (LScopes.isCurrentFunctionScope(Scope))
for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
if (DbgVariable *ArgDV = CurrentFnArguments[i])
if (DIE *Arg =
- TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope())) {
+ TheCU->constructVariableDIE(*ArgDV, Scope->isAbstractScope())) {
Children.push_back(Arg);
if (ArgDV->isObjectPointer()) ObjectPointer = Arg;
}
@@ -591,7 +585,7 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
const SmallVectorImpl<DbgVariable *> &Variables =ScopeVariables.lookup(Scope);
for (unsigned i = 0, N = Variables.size(); i < N; ++i)
if (DIE *Variable =
- TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope())) {
+ TheCU->constructVariableDIE(*Variables[i], Scope->isAbstractScope())) {
Children.push_back(Variable);
if (Variables[i]->isObjectPointer()) ObjectPointer = Variable;
}
@@ -599,6 +593,23 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
Children.push_back(Nested);
+ return ObjectPointer;
+}
+
+// Construct a DIE for this scope.
+DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
+ if (!Scope || !Scope->getScopeNode())
+ return NULL;
+
+ DIScope DS(Scope->getScopeNode());
+
+ SmallVector<DIE *, 8> Children;
+ DIE *ObjectPointer = NULL;
+ bool ChildrenCreated = false;
+
+ // We try to create the scope DIE first, then the children DIEs. This will
+  // avoid creating unused children and then removing them later when we find
+  // out the scope DIE is null.
DIE *ScopeDIE = NULL;
if (Scope->getInlinedAt())
ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
@@ -609,26 +620,41 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
// Note down abstract DIE.
if (ScopeDIE)
AbstractSPDies.insert(std::make_pair(DS, ScopeDIE));
- }
- else
- ScopeDIE = updateSubprogramScopeDIE(TheCU, DS);
- }
- else {
+ } else
+ ScopeDIE = updateSubprogramScopeDIE(TheCU, DISubprogram(DS));
+ } else {
+ // Early exit when we know the scope DIE is going to be null.
+ if (isLexicalScopeDIENull(Scope))
+ return NULL;
+
+ // We create children here when we know the scope DIE is not going to be
+ // null and the children will be added to the scope DIE.
+ ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children);
+ ChildrenCreated = true;
+
// There is no need to emit empty lexical block DIE.
std::pair<ImportedEntityMap::const_iterator,
ImportedEntityMap::const_iterator> Range = std::equal_range(
ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(),
std::pair<const MDNode *, const MDNode *>(DS, (const MDNode*)0),
- CompareFirst());
+ less_first());
if (Children.empty() && Range.first == Range.second)
return NULL;
ScopeDIE = constructLexicalScopeDIE(TheCU, Scope);
+ assert(ScopeDIE && "Scope DIE should not be null.");
for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second;
++i)
constructImportedEntityDIE(TheCU, i->second, ScopeDIE);
}
- if (!ScopeDIE) return NULL;
+ if (!ScopeDIE) {
+ assert(Children.empty() &&
+ "We create children only when the scope DIE is not null.");
+ return NULL;
+ }
+ if (!ChildrenCreated)
+ // We create children when the scope DIE is not null.
+ ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children);
// Add children
for (SmallVectorImpl<DIE *>::iterator I = Children.begin(),
@@ -636,8 +662,7 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
ScopeDIE->addChild(*I);
if (DS.isSubprogram() && ObjectPointer != NULL)
- TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
- dwarf::DW_FORM_ref4, ObjectPointer);
+ TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, ObjectPointer);
if (DS.isSubprogram())
TheCU->addPubTypes(DISubprogram(DS));
@@ -653,8 +678,10 @@ unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
StringRef DirName, unsigned CUID) {
// If we use .loc in assembly, we can't separate .file entries according to
// compile units. Thus all files will belong to the default compile unit.
- if (Asm->TM.hasMCUseLoc() &&
- Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+
+ // FIXME: add a better feature test than hasRawTextSupport. Even better,
+ // extend .file to support this.
+ if (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport())
CUID = 0;
// If FE did not provide a file name, then assume stdin.
@@ -689,14 +716,12 @@ unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
// Create new CompileUnit for the given metadata node with tag
// DW_TAG_compile_unit.
-CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
- DICompileUnit DIUnit(N);
+CompileUnit *DwarfDebug::constructCompileUnit(DICompileUnit DIUnit) {
StringRef FN = DIUnit.getFilename();
CompilationDir = DIUnit.getDirectory();
DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
- CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
- DIUnit.getLanguage(), Die, N, Asm,
+ CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++, Die, DIUnit, Asm,
this, &InfoHolder);
FileIDCUMap[NewCU->getUniqueID()] = 0;
@@ -723,31 +748,57 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
// Use a single line table if we are using .loc and generating assembly.
bool UseTheFirstCU =
- (Asm->TM.hasMCUseLoc() &&
- Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) ||
- (NewCU->getUniqueID() == 0);
+ (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport()) ||
+ (NewCU->getUniqueID() == 0);
- // DW_AT_stmt_list is a offset of line number information for this
- // compile unit in debug_line section. For split dwarf this is
- // left in the skeleton CU and so not included.
- // The line table entries are not always emitted in assembly, so it
- // is not okay to use line_table_start here.
if (!useSplitDwarf()) {
+    // DW_AT_stmt_list is an offset of line number information for this
+ // compile unit in debug_line section. For split dwarf this is
+ // left in the skeleton CU and so not included.
+ // The line table entries are not always emitted in assembly, so it
+ // is not okay to use line_table_start here.
if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
- NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
- UseTheFirstCU ?
- Asm->GetTempSymbol("section_line") : LineTableStartSym);
+ NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset,
+ UseTheFirstCU ? Asm->GetTempSymbol("section_line")
+ : LineTableStartSym);
else if (UseTheFirstCU)
NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
else
NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
LineTableStartSym, DwarfLineSectionSym);
+
+ // If we're using split dwarf the compilation dir is going to be in the
+ // skeleton CU and so we don't need to duplicate it here.
+ if (!CompilationDir.empty())
+ NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+
+ // Flags to let the linker know we have emitted new style pubnames. Only
+ // emit it here if we don't have a skeleton CU for split dwarf.
+ if (GenerateGnuPubSections) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubnames,
+ dwarf::DW_FORM_sec_offset,
+ Asm->GetTempSymbol("gnu_pubnames",
+ NewCU->getUniqueID()));
+ else
+ NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_data4,
+ Asm->GetTempSymbol("gnu_pubnames",
+ NewCU->getUniqueID()),
+ DwarfGnuPubNamesSectionSym);
+
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubtypes,
+ dwarf::DW_FORM_sec_offset,
+ Asm->GetTempSymbol("gnu_pubtypes",
+ NewCU->getUniqueID()));
+ else
+ NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_data4,
+ Asm->GetTempSymbol("gnu_pubtypes",
+ NewCU->getUniqueID()),
+ DwarfGnuPubTypesSectionSym);
+ }
}
- // If we're using split dwarf the compilation dir is going to be in the
- // skeleton CU and so we don't need to duplicate it here.
- if (!useSplitDwarf() && !CompilationDir.empty())
- NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
if (DIUnit.isOptimized())
NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized);
@@ -764,13 +815,17 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
InfoHolder.addUnit(NewCU);
- CUMap.insert(std::make_pair(N, NewCU));
+ CUMap.insert(std::make_pair(DIUnit, NewCU));
+ CUDieMap.insert(std::make_pair(Die, NewCU));
return NewCU;
}
// Construct subprogram DIE.
-void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
- const MDNode *N) {
+void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N) {
+  // FIXME: We should only call this routine once; however, during LTO, if a
+ // program is defined in multiple CUs we could end up calling it out of
+ // beginModule as we walk the CUs.
+
CompileUnit *&CURef = SPMap[N];
if (CURef)
return;
@@ -784,15 +839,8 @@ void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
DIE *SubprogramDie = TheCU->getOrCreateSubprogramDIE(SP);
- // Add to map.
- TheCU->insertDIE(N, SubprogramDie);
-
- // Add to context owner.
- TheCU->addToContextOwner(SubprogramDie, SP.getContext());
-
- // Expose as global, if requested.
- if (GenerateDwarfPubNamesSection)
- TheCU->addGlobalName(SP.getName(), SubprogramDie);
+ // Expose as a global name.
+ TheCU->addGlobalName(SP.getName(), SubprogramDie, resolve(SP.getContext()));
}
void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU,
@@ -833,10 +881,9 @@ void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU,
unsigned FileID = getOrCreateSourceID(Module.getContext().getFilename(),
Module.getContext().getDirectory(),
TheCU->getUniqueID());
- TheCU->addUInt(IMDie, dwarf::DW_AT_decl_file, 0, FileID);
- TheCU->addUInt(IMDie, dwarf::DW_AT_decl_line, 0, Module.getLineNumber());
- TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, dwarf::DW_FORM_ref4,
- EntityDie);
+ TheCU->addUInt(IMDie, dwarf::DW_AT_decl_file, None, FileID);
+ TheCU->addUInt(IMDie, dwarf::DW_AT_decl_line, None, Module.getLineNumber());
+ TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, EntityDie);
StringRef Name = Module.getName();
if (!Name.empty())
TheCU->addString(IMDie, dwarf::DW_AT_name, Name);
@@ -857,6 +904,7 @@ void DwarfDebug::beginModule() {
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
if (!CU_Nodes)
return;
+ TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
// Emit initial sections so we can reference labels later.
emitSectionLabels();
@@ -870,10 +918,10 @@ void DwarfDebug::beginModule() {
DIImportedEntity(ImportedEntities.getElement(i)).getContext(),
ImportedEntities.getElement(i)));
std::sort(ScopesWithImportedEntities.begin(),
- ScopesWithImportedEntities.end(), CompareFirst());
+ ScopesWithImportedEntities.end(), less_first());
DIArray GVs = CUNode.getGlobalVariables();
for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i)
- CU->createGlobalVariableDIE(GVs.getElement(i));
+ CU->createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i)));
DIArray SPs = CUNode.getSubprograms();
for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
constructSubprogramDIE(CU, SPs.getElement(i));
@@ -887,22 +935,13 @@ void DwarfDebug::beginModule() {
// available.
for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i)
constructImportedEntityDIE(CU, ImportedEntities.getElement(i));
- // If we're splitting the dwarf out now that we've got the entire
- // CU then construct a skeleton CU based upon it.
- if (useSplitDwarf()) {
- // This should be a unique identifier when we want to build .dwp files.
- CU->addUInt(CU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
- dwarf::DW_FORM_data8, 0);
- // Now construct the skeleton CU associated.
- constructSkeletonCU(CUNode);
- }
}
// Tell MMI that we have debug info.
MMI->setDebugInfoAvailability(true);
// Prime section data.
- SectionMap.insert(Asm->getObjFileLowering().getTextSection());
+ SectionMap[Asm->getObjFileLowering().getTextSection()];
}
// Attach DW_AT_inline attribute with inlined subprogram DIEs.
@@ -911,21 +950,20 @@ void DwarfDebug::computeInlinedDIEs() {
for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(),
AE = InlinedSubprogramDIEs.end(); AI != AE; ++AI) {
DIE *ISP = *AI;
- FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+ FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
}
for (DenseMap<const MDNode *, DIE *>::iterator AI = AbstractSPDies.begin(),
AE = AbstractSPDies.end(); AI != AE; ++AI) {
DIE *ISP = AI->second;
if (InlinedSubprogramDIEs.count(ISP))
continue;
- FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+ FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
}
}
// Collect info for variables that were optimized out.
void DwarfDebug::collectDeadVariables() {
const Module *M = MMI->getModule();
- DenseMap<const MDNode *, LexicalScope *> DeadFnScopeMap;
if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) {
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
@@ -933,37 +971,38 @@ void DwarfDebug::collectDeadVariables() {
DIArray Subprograms = TheCU.getSubprograms();
for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) {
DISubprogram SP(Subprograms.getElement(i));
- if (ProcessedSPNodes.count(SP) != 0) continue;
- if (!SP.isSubprogram()) continue;
- if (!SP.isDefinition()) continue;
+ if (ProcessedSPNodes.count(SP) != 0)
+ continue;
+ if (!SP.isSubprogram())
+ continue;
+ if (!SP.isDefinition())
+ continue;
DIArray Variables = SP.getVariables();
- if (Variables.getNumElements() == 0) continue;
-
- LexicalScope *Scope =
- new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
- DeadFnScopeMap[SP] = Scope;
+ if (Variables.getNumElements() == 0)
+ continue;
// Construct subprogram DIE and add variables DIEs.
CompileUnit *SPCU = CUMap.lookup(TheCU);
assert(SPCU && "Unable to find Compile Unit!");
+ // FIXME: See the comment in constructSubprogramDIE about duplicate
+ // subprogram DIEs.
constructSubprogramDIE(SPCU, SP);
- DIE *ScopeDIE = SPCU->getDIE(SP);
+ DIE *SPDIE = SPCU->getDIE(SP);
for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
DIVariable DV(Variables.getElement(vi));
- if (!DV.isVariable()) continue;
- DbgVariable NewVar(DV, NULL);
+ if (!DV.isVariable())
+ continue;
+ DbgVariable NewVar(DV, NULL, this);
if (DIE *VariableDIE =
- SPCU->constructVariableDIE(&NewVar, Scope->isAbstractScope()))
- ScopeDIE->addChild(VariableDIE);
+ SPCU->constructVariableDIE(NewVar, false))
+ SPDIE->addChild(VariableDIE);
}
}
}
}
- DeleteContainerSeconds(DeadFnScopeMap);
}
-// Type Signature [7.27] computation code.
-typedef ArrayRef<uint8_t> HashValue;
+// Type Signature [7.27] and ODR Hash code.
/// \brief Grabs the string in whichever attribute is passed in and returns
/// a reference to it. Returns "" if the attribute doesn't exist.
@@ -976,100 +1015,6 @@ static StringRef getDIEStringAttr(DIE *Die, unsigned Attr) {
return StringRef("");
}
-/// \brief Adds the string in \p Str to the hash in \p Hash. This also hashes
-/// a trailing NULL with the string.
-static void addStringToHash(MD5 &Hash, StringRef Str) {
- DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
- Hash.update(Str);
- Hash.update(makeArrayRef((uint8_t)'\0'));
-}
-
-// FIXME: These are copied and only slightly modified out of LEB128.h.
-
-/// \brief Adds the unsigned in \p N to the hash in \p Hash. This also encodes
-/// the unsigned as a ULEB128.
-static void addULEB128ToHash(MD5 &Hash, uint64_t Value) {
- DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
- do {
- uint8_t Byte = Value & 0x7f;
- Value >>= 7;
- if (Value != 0)
- Byte |= 0x80; // Mark this byte to show that more bytes will follow.
- Hash.update(Byte);
- } while (Value != 0);
-}
-
-/// \brief Including \p Parent adds the context of Parent to \p Hash.
-static void addParentContextToHash(MD5 &Hash, DIE *Parent) {
-
- DEBUG(dbgs() << "Adding parent context to hash...\n");
-
- // [7.27.2] For each surrounding type or namespace beginning with the
- // outermost such construct...
- SmallVector<DIE *, 1> Parents;
- while (Parent->getTag() != dwarf::DW_TAG_compile_unit) {
- Parents.push_back(Parent);
- Parent = Parent->getParent();
- }
-
- // Reverse iterate over our list to go from the outermost construct to the
- // innermost.
- for (SmallVectorImpl<DIE *>::reverse_iterator I = Parents.rbegin(),
- E = Parents.rend();
- I != E; ++I) {
- DIE *Die = *I;
-
- // ... Append the letter "C" to the sequence...
- addULEB128ToHash(Hash, 'C');
-
- // ... Followed by the DWARF tag of the construct...
- addULEB128ToHash(Hash, Die->getTag());
-
- // ... Then the name, taken from the DW_AT_name attribute.
- StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
- DEBUG(dbgs() << "... adding context: " << Name << "\n");
- if (!Name.empty())
- addStringToHash(Hash, Name);
- }
-}
-
-/// This is based on the type signature computation given in section 7.27 of the
-/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE with
-/// the exception that we are hashing only the context and the name of the type.
-static void addDIEODRSignature(MD5 &Hash, CompileUnit *CU, DIE *Die) {
-
- // Add the contexts to the hash. We won't be computing the ODR hash for
- // function local types so it's safe to use the generic context hashing
- // algorithm here.
- // FIXME: If we figure out how to account for linkage in some way we could
- // actually do this with a slight modification to the parent hash algorithm.
- DIE *Parent = Die->getParent();
- if (Parent)
- addParentContextToHash(Hash, Parent);
-
- // Add the current DIE information.
-
- // Add the DWARF tag of the DIE.
- addULEB128ToHash(Hash, Die->getTag());
-
- // Add the name of the type to the hash.
- addStringToHash(Hash, getDIEStringAttr(Die, dwarf::DW_AT_name));
-
- // Now get the result.
- MD5::MD5Result Result;
- Hash.final(Result);
-
- // ... take the least significant 8 bytes and store those as the attribute.
- // Our MD5 implementation always returns its results in little endian, swap
- // bytes appropriately.
- uint64_t Signature = *reinterpret_cast<support::ulittle64_t *>(Result + 8);
-
- // FIXME: This should be added onto the type unit, not the type, but this
- // works as an intermediate stage.
- CU->addUInt(Die, dwarf::DW_AT_GNU_odr_signature, dwarf::DW_FORM_data8,
- Signature);
-}
-
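
The removed helper fed values into the hash as ULEB128 bytes: 7 payload bits per byte, with the high bit set on every byte except the last. That encoding now lives behind DIEHash; a standalone encoder sketch using the standard LEB128 test vectors (illustrative code, not the LLVM implementation):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Encode Value as ULEB128, least-significant group first.
    static std::vector<uint8_t> encodeULEB128(uint64_t Value) {
      std::vector<uint8_t> Bytes;
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0)
          Byte |= 0x80; // More bytes follow.
        Bytes.push_back(Byte);
      } while (Value != 0);
      return Bytes;
    }

    int main() {
      assert(encodeULEB128(0x7f) == std::vector<uint8_t>({0x7f}));
      assert(encodeULEB128(0x80) == std::vector<uint8_t>({0x80, 0x01}));
      assert(encodeULEB128(624485) == std::vector<uint8_t>({0xe5, 0x8e, 0x26}));
      return 0;
    }
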
/// Return true if the current DIE is contained within an anonymous namespace.
static bool isContainedInAnonNamespace(DIE *Die) {
DIE *Parent = Die->getParent();
@@ -1090,7 +1035,7 @@ static bool shouldAddODRHash(CompileUnit *CU, DIE *Die) {
return CU->getLanguage() == dwarf::DW_LANG_C_plus_plus &&
getDIEStringAttr(Die, dwarf::DW_AT_name) != "" &&
!isContainedInAnonNamespace(Die);
- }
+}
void DwarfDebug::finalizeModuleInfo() {
// Collect info for variables that were optimized out.
@@ -1099,43 +1044,102 @@ void DwarfDebug::finalizeModuleInfo() {
// Attach DW_AT_inline attribute with inlined subprogram DIEs.
computeInlinedDIEs();
- // Emit DW_AT_containing_type attribute to connect types with their
- // vtable holding type.
- for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
- CUE = CUMap.end(); CUI != CUE; ++CUI) {
- CompileUnit *TheCU = CUI->second;
- TheCU->constructContainingTypeDIEs();
- }
-
// Split out type units and conditionally add an ODR tag to the split
// out type.
// FIXME: Do type splitting.
for (unsigned i = 0, e = TypeUnits.size(); i != e; ++i) {
- MD5 Hash;
DIE *Die = TypeUnits[i];
+ DIEHash Hash;
// If we've requested ODR hashes and it's applicable for an ODR hash then
// add the ODR signature now.
+ // FIXME: This should be added onto the type unit, not the type, but this
+ // works as an intermediate stage.
if (GenerateODRHash && shouldAddODRHash(CUMap.begin()->second, Die))
- addDIEODRSignature(Hash, CUMap.begin()->second, Die);
+ CUMap.begin()->second->addUInt(Die, dwarf::DW_AT_GNU_odr_signature,
+ dwarf::DW_FORM_data8,
+ Hash.computeDIEODRSignature(*Die));
}
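
computeDIEODRSignature keeps the convention the removed helper spelled out: hash the flattened DIE description, then take the low 8 bytes of the 16-byte digest, read little-endian, as the 64-bit DW_AT_GNU_odr_signature value. A standalone sketch of that final extraction step, with an illustrative digest rather than a real MD5:

    #include <cassert>
    #include <cstdint>

    // Form the 64-bit signature from the last 8 bytes of a 16-byte digest,
    // interpreting them as a little-endian integer.
    static uint64_t signatureFromDigest(const uint8_t Digest[16]) {
      uint64_t Sig = 0;
      for (int i = 0; i < 8; ++i)
        Sig |= uint64_t(Digest[8 + i]) << (8 * i);
      return Sig;
    }

    int main() {
      const uint8_t Digest[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                  0xef, 0xcd, 0xab, 0x89, 0x67, 0x45, 0x23, 0x01};
      assert(signatureFromDigest(Digest) == UINT64_C(0x0123456789abcdef));
      return 0;
    }
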
- // Compute DIE offsets and sizes.
+ // Handle anything that needs to be done on a per-cu basis.
+ for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
+ CUE = CUMap.end();
+ CUI != CUE; ++CUI) {
+ CompileUnit *TheCU = CUI->second;
+ // Emit DW_AT_containing_type attribute to connect types with their
+ // vtable holding type.
+ TheCU->constructContainingTypeDIEs();
+
+ // If we're splitting the dwarf out now that we've got the entire
+ // CU then construct a skeleton CU based upon it.
+ if (useSplitDwarf()) {
+ uint64_t ID = 0;
+ if (GenerateCUHash) {
+ DIEHash CUHash;
+ ID = CUHash.computeCUSignature(*TheCU->getCUDie());
+ }
+ // This should be a unique identifier when we want to build .dwp files.
+ TheCU->addUInt(TheCU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ // Now construct the skeleton CU associated.
+ CompileUnit *SkCU = constructSkeletonCU(TheCU);
+ // This should be a unique identifier when we want to build .dwp files.
+ SkCU->addUInt(SkCU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ }
+ }
+
+ // Compute DIE offsets and sizes.
InfoHolder.computeSizeAndOffsets();
if (useSplitDwarf())
SkeletonHolder.computeSizeAndOffsets();
}
void DwarfDebug::endSections() {
- // Standard sections final addresses.
- Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getTextSection());
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("text_end"));
- Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getDataSection());
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("data_end"));
+ // Filter labels by section.
+ for (size_t n = 0; n < ArangeLabels.size(); n++) {
+ const SymbolCU &SCU = ArangeLabels[n];
+ if (SCU.Sym->isInSection()) {
+      // Make a note of this symbol and its section.
+ const MCSection *Section = &SCU.Sym->getSection();
+ if (!Section->getKind().isMetadata())
+ SectionMap[Section].push_back(SCU);
+ } else {
+ // Some symbols (e.g. common/bss on mach-o) can have no section but still
+ // appear in the output. This sucks as we rely on sections to build
+ // arange spans. We can do it without, but it's icky.
+ SectionMap[NULL].push_back(SCU);
+ }
+ }
- // End text sections.
- for (unsigned I = 0, E = SectionMap.size(); I != E; ++I) {
- Asm->OutStreamer.SwitchSection(SectionMap[I]);
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", I+1));
+ // Build a list of sections used.
+ std::vector<const MCSection *> Sections;
+ for (SectionMapType::iterator it = SectionMap.begin(); it != SectionMap.end();
+ it++) {
+ const MCSection *Section = it->first;
+ Sections.push_back(Section);
+ }
+
+ // Sort the sections into order.
+ // This is only done to ensure consistent output order across different runs.
+ std::sort(Sections.begin(), Sections.end(), SectionSort);
+
+ // Add terminating symbols for each section.
+  for (unsigned ID = 0; ID < Sections.size(); ID++) {
+ const MCSection *Section = Sections[ID];
+ MCSymbol *Sym = NULL;
+
+ if (Section) {
+ // We can't call MCSection::getLabelEndName, as it's only safe to do so
+ // if we know the section name up-front. For user-created sections, the resulting
+ // label may not be valid to use as a label. (section names can use a greater
+ // set of characters on some systems)
+ Sym = Asm->GetTempSymbol("debug_end", ID);
+ Asm->OutStreamer.SwitchSection(Section);
+ Asm->OutStreamer.EmitLabel(Sym);
+ }
+
+ // Insert a final terminator.
+ SectionMap[Section].push_back(SymbolCU(NULL, Sym));
}
}
@@ -1152,6 +1156,8 @@ void DwarfDebug::endModule() {
finalizeModuleInfo();
if (!useSplitDwarf()) {
+ emitDebugStr();
+
// Emit all the DIEs into a debug info section.
emitDebugInfo();
@@ -1170,15 +1176,12 @@ void DwarfDebug::endModule() {
// Emit info into a debug macinfo section.
emitDebugMacInfo();
- // Emit inline info.
- // TODO: When we don't need the option anymore we
- // can remove all of the code that this section
- // depends upon.
- if (useDarwinGDBCompat())
- emitDebugInlineInfo();
} else {
// TODO: Fill this in for separated debug sections and separate
// out information into new sections.
+ emitDebugStr();
+ if (useSplitDwarf())
+ emitDebugStrDWO();
// Emit the debug info section and compile units.
emitDebugInfo();
@@ -1203,12 +1206,6 @@ void DwarfDebug::endModule() {
// Emit DWO addresses.
InfoHolder.emitAddresses(Asm->getObjFileLowering().getDwarfAddrSection());
- // Emit inline info.
- // TODO: When we don't need the option anymore we
- // can remove all of the code that this section
- // depends upon.
- if (useDarwinGDBCompat())
- emitDebugInlineInfo();
}
// Emit info into the dwarf accelerator table sections.
@@ -1219,20 +1216,11 @@ void DwarfDebug::endModule() {
emitAccelTypes();
}
- // Emit info into a debug pubnames section, if requested.
- if (GenerateDwarfPubNamesSection)
- emitDebugPubnames();
-
- // Emit info into a debug pubtypes section.
- // TODO: When we don't need the option anymore we can
- // remove all of the code that adds to the table.
- if (useDarwinGDBCompat())
- emitDebugPubTypes();
-
- // Finally emit string information into a string table.
- emitDebugStr();
- if (useSplitDwarf())
- emitDebugStrDWO();
+ // Emit the pubnames and pubtypes sections if requested.
+ if (HasDwarfPubSections) {
+ emitDebugPubNames(GenerateGnuPubSections);
+ emitDebugPubTypes(GenerateGnuPubSections);
+ }
// clean up.
SPMap.clear();
@@ -1262,7 +1250,7 @@ DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
if (!Scope)
return NULL;
- AbsDbgVariable = new DbgVariable(Var, NULL);
+ AbsDbgVariable = new DbgVariable(Var, NULL, this);
addScopeVariable(Scope, AbsDbgVariable);
AbstractVariables[Var] = AbsDbgVariable;
return AbsDbgVariable;
@@ -1311,7 +1299,7 @@ DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction *MF,
continue;
DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VP.second);
- DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable);
+ DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable, this);
RegVar->setFrameIndex(VP.first);
if (!addCurrentFnArgument(MF, RegVar, Scope))
addScopeVariable(Scope, RegVar);
@@ -1396,7 +1384,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
Processed.insert(DV);
assert(MInsn->isDebugValue() && "History must begin with debug value");
DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc());
- DbgVariable *RegVar = new DbgVariable(DV, AbsVar);
+ DbgVariable *RegVar = new DbgVariable(DV, AbsVar, this);
if (!addCurrentFnArgument(MF, RegVar, Scope))
addScopeVariable(Scope, RegVar);
if (AbsVar)
@@ -1459,7 +1447,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
if (!DV || !DV.isVariable() || !Processed.insert(DV))
continue;
if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext()))
- addScopeVariable(Scope, new DbgVariable(DV, NULL));
+ addScopeVariable(Scope, new DbgVariable(DV, NULL, this));
}
}
@@ -1602,36 +1590,45 @@ static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
// Gather pre-function debug information. Assumes being called immediately
// after the function entry point has been emitted.
void DwarfDebug::beginFunction(const MachineFunction *MF) {
- if (!MMI->hasDebugInfo()) return;
+
+ // If there's no debug info for the function, we're not going to do anything.
+ if (!MMI->hasDebugInfo())
+ return;
+
+ // Grab the lexical scopes for the function; if we don't have any,
+ // there's nothing we can do.
LScopes.initialize(*MF);
- if (LScopes.empty()) return;
+ if (LScopes.empty())
+ return;
+
+ assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
+
+ // Make sure that each lexical scope will have a begin/end label.
identifyScopeMarkers();
// Set DwarfCompileUnitID in MCContext to the Compile Unit this function
- // belongs to.
+ // belongs to, so that we add to the correct per-CU line table in the
+ // non-asm case.
LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
assert(TheCU && "Unable to find compile unit!");
- if (Asm->TM.hasMCUseLoc() &&
- Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+ if (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport())
// Use a single line table if we are using .loc and generating assembly.
Asm->OutStreamer.getContext().setDwarfCompileUnitID(0);
else
Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
- FunctionBeginSym = Asm->GetTempSymbol("func_begin",
- Asm->getFunctionNumber());
+ // Emit a label for the function so that we have a beginning address.
+ FunctionBeginSym = Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber());
// Assumes in correct section after the entry point.
Asm->OutStreamer.EmitLabel(FunctionBeginSym);
- assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
-
const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
// LiveUserVar - Map physreg numbers to the MDNode they contain.
- std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
+ std::vector<const MDNode *> LiveUserVar(TRI->getNumRegs());
- for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
- I != E; ++I) {
+ for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E;
+ ++I) {
bool AtBlockEntry = true;
for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
II != IE; ++II) {
@@ -1642,22 +1639,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// Keep track of user variables.
const MDNode *Var =
- MI->getOperand(MI->getNumOperands() - 1).getMetadata();
+ MI->getOperand(MI->getNumOperands() - 1).getMetadata();
// Variable is in a register, we need to check for clobbers.
if (isDbgValueInDefinedReg(MI))
LiveUserVar[MI->getOperand(0).getReg()] = Var;
// Check the history of this variable.
- SmallVectorImpl<const MachineInstr*> &History = DbgValues[Var];
+ SmallVectorImpl<const MachineInstr *> &History = DbgValues[Var];
if (History.empty()) {
UserVariables.push_back(Var);
// The first mention of a function argument gets the FunctionBeginSym
// label, so arguments are visible when breaking at function entry.
DIVariable DV(Var);
if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable &&
- DISubprogram(getDISubprogram(DV.getContext()))
- .describes(MF->getFunction()))
+ getDISubprogram(DV.getContext()).describes(MF->getFunction()))
LabelsBeforeInsn[MI] = FunctionBeginSym;
} else {
// We have seen this variable before. Try to coalesce DBG_VALUEs.
@@ -1667,8 +1663,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
if (History.size() >= 2 &&
Prev->isIdenticalTo(History[History.size() - 2])) {
DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
- << "\t" << *Prev
- << "\t" << *History[History.size() - 2] << "\n");
+ << "\t" << *Prev << "\t"
+ << *History[History.size() - 2] << "\n");
History.pop_back();
}
@@ -1679,11 +1675,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// Previous register assignment needs to terminate at the end of
// its basic block.
MachineBasicBlock::const_iterator LastMI =
- PrevMBB->getLastNonDebugInstr();
+ PrevMBB->getLastNonDebugInstr();
if (LastMI == PrevMBB->end()) {
// Drop DBG_VALUE for empty range.
DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n"
- << "\t" << *Prev << "\n");
+ << "\t" << *Prev << "\n");
History.pop_back();
} else if (llvm::next(PrevMBB) != PrevMBB->getParent()->end())
// Terminate after LastMI.
@@ -1705,11 +1701,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// Check if the instruction clobbers any registers with debug vars.
for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+ MOE = MI->operands_end();
+ MOI != MOE; ++MOI) {
if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg())
continue;
- for (MCRegAliasIterator AI(MOI->getReg(), TRI, true);
- AI.isValid(); ++AI) {
+ for (MCRegAliasIterator AI(MOI->getReg(), TRI, true); AI.isValid();
+ ++AI) {
unsigned Reg = *AI;
const MDNode *Var = LiveUserVar[Reg];
if (!Var)
@@ -1721,7 +1718,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
DbgValueHistoryMap::iterator HistI = DbgValues.find(Var);
if (HistI == DbgValues.end())
continue;
- SmallVectorImpl<const MachineInstr*> &History = HistI->second;
+ SmallVectorImpl<const MachineInstr *> &History = HistI->second;
if (History.empty())
continue;
const MachineInstr *Prev = History.back();
@@ -1743,7 +1740,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
for (DbgValueHistoryMap::iterator I = DbgValues.begin(), E = DbgValues.end();
I != E; ++I) {
- SmallVectorImpl<const MachineInstr*> &History = I->second;
+ SmallVectorImpl<const MachineInstr *> &History = I->second;
if (History.empty())
continue;
@@ -1752,7 +1749,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) {
const MachineBasicBlock *PrevMBB = Prev->getParent();
MachineBasicBlock::const_iterator LastMI =
- PrevMBB->getLastNonDebugInstr();
+ PrevMBB->getLastNonDebugInstr();
if (LastMI == PrevMBB->end())
// Drop DBG_VALUE for empty range.
History.pop_back();
@@ -1776,13 +1773,14 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// Record beginning of function.
if (!PrologEndLoc.isUnknown()) {
- DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
- MF->getFunction()->getContext());
- recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
- FnStartDL.getScope(MF->getFunction()->getContext()),
- // We'd like to list the prologue as "not statements" but GDB behaves
- // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
- DWARF2_FLAG_IS_STMT);
+ DebugLoc FnStartDL =
+ getFnDebugLoc(PrologEndLoc, MF->getFunction()->getContext());
+ recordSourceLine(
+ FnStartDL.getLine(), FnStartDL.getCol(),
+ FnStartDL.getScope(MF->getFunction()->getContext()),
+ // We'd like to list the prologue as "not statements" but GDB behaves
+ // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+ DWARF2_FLAG_IS_STMT);
}
}
@@ -1855,7 +1853,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
if (AbstractVariables.lookup(CleanDV))
continue;
if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext()))
- addScopeVariable(Scope, new DbgVariable(DV, NULL));
+ addScopeVariable(Scope, new DbgVariable(DV, NULL, this));
}
}
if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0)
@@ -1924,7 +1922,8 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
// Emit Methods
//===----------------------------------------------------------------------===//
-// Compute the size and offset of a DIE.
+// Compute the size and offset of a DIE. The offset is relative to the start of
+// the CU. It returns the offset after laying out the DIE.
unsigned
DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
// Get the children.
@@ -1935,7 +1934,7 @@ DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
// Get the abbreviation for this DIE.
unsigned AbbrevNumber = Die->getAbbrevNumber();
- const DIEAbbrev *Abbrev = Abbreviations->at(AbbrevNumber - 1);
+ const DIEAbbrev *Abbrev = Abbreviations[AbbrevNumber - 1];
// Set DIE offset
Die->setOffset(Offset);
@@ -1967,19 +1966,23 @@ DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
return Offset;
}
-// Compute the size and offset of all the DIEs.
+// Compute the size and offset for each DIE.
void DwarfUnits::computeSizeAndOffsets() {
- // Offset from the beginning of debug info section.
+ // Offset from the first CU in the debug info section is 0 initially.
unsigned SecOffset = 0;
+
+ // Iterate over each compile unit and set the size and offsets for each
+ // DIE within each compile unit. All offsets are CU relative.
for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
E = CUs.end(); I != E; ++I) {
(*I)->setDebugInfoOffset(SecOffset);
- unsigned Offset =
- sizeof(int32_t) + // Length of Compilation Unit Info
- sizeof(int16_t) + // DWARF version number
- sizeof(int32_t) + // Offset Into Abbrev. Section
- sizeof(int8_t); // Pointer Size (in bytes)
+ // The CU-relative offset restarts here; the first DIE comes after the
+ // length field and the unit-specific header.
+ unsigned Offset = sizeof(int32_t) + // Length of Unit Info
+ (*I)->getHeaderSize(); // Unit-specific headers
+
+ // EndOffset here is CU-relative, after laying out
+ // all of the CU's DIEs.
unsigned EndOffset = computeSizeAndOffset((*I)->getCUDie(), Offset);
SecOffset += EndOffset;
}
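As a worked illustration of how the two offsets interact (hypothetical numbers, not taken from this patch): for a DWARF32 compile unit the unit-specific header is 2 (version) + 4 (abbrev offset) + 1 (address size) = 7 bytes, so the first DIE of every CU sits at CU-relative offset 4 + 7 = 11, and the section placement accumulates like this:

  SecOffset = 0            // CU0's DIEs start at the beginning of .debug_info
  CU0: Offset    = 4 + getHeaderSize() = 11
       EndOffset = 0x80    // say CU0's DIEs lay out to 0x80 bytes total
       SecOffset += 0x80
  CU1: setDebugInfoOffset(0x80); its first DIE is again at CU-relative 11,
       i.e. at section offset 0x80 + 11 = 0x8B.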
@@ -2006,9 +2009,16 @@ void DwarfDebug::emitSectionLabels() {
DwarfLineSectionSym =
emitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line");
emitSectionSym(Asm, TLOF.getDwarfLocSection());
- if (GenerateDwarfPubNamesSection)
+ if (GenerateGnuPubSections) {
+ DwarfGnuPubNamesSectionSym =
+ emitSectionSym(Asm, TLOF.getDwarfGnuPubNamesSection());
+ DwarfGnuPubTypesSectionSym =
+ emitSectionSym(Asm, TLOF.getDwarfGnuPubTypesSection());
+ } else if (HasDwarfPubSections) {
emitSectionSym(Asm, TLOF.getDwarfPubNamesSection());
- emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+ emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+ }
+
DwarfStrSectionSym =
emitSectionSym(Asm, TLOF.getDwarfStrSection(), "info_string");
if (useSplitDwarf()) {
@@ -2028,10 +2038,10 @@ void DwarfDebug::emitSectionLabels() {
}
// Recursively emits a debug information entry.
-void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
+void DwarfDebug::emitDIE(DIE *Die, ArrayRef<DIEAbbrev *> Abbrevs) {
// Get the abbreviation for this DIE.
unsigned AbbrevNumber = Die->getAbbrevNumber();
- const DIEAbbrev *Abbrev = Abbrevs->at(AbbrevNumber - 1);
+ const DIEAbbrev *Abbrev = Abbrevs[AbbrevNumber - 1];
// Emit the code (index) for the abbreviation.
if (Asm->isVerbose())
@@ -2046,27 +2056,44 @@ void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
// Emit the DIE attribute values.
for (unsigned i = 0, N = Values.size(); i < N; ++i) {
- unsigned Attr = AbbrevData[i].getAttribute();
- unsigned Form = AbbrevData[i].getForm();
+ dwarf::Attribute Attr = AbbrevData[i].getAttribute();
+ dwarf::Form Form = AbbrevData[i].getForm();
assert(Form && "Too many attributes for DIE (check abbreviation)");
if (Asm->isVerbose())
Asm->OutStreamer.AddComment(dwarf::AttributeString(Attr));
switch (Attr) {
- case dwarf::DW_AT_abstract_origin: {
+ case dwarf::DW_AT_abstract_origin:
+ case dwarf::DW_AT_type:
+ case dwarf::DW_AT_friend:
+ case dwarf::DW_AT_specification:
+ case dwarf::DW_AT_import:
+ case dwarf::DW_AT_containing_type: {
DIEEntry *E = cast<DIEEntry>(Values[i]);
DIE *Origin = E->getEntry();
unsigned Addr = Origin->getOffset();
if (Form == dwarf::DW_FORM_ref_addr) {
+ assert(!useSplitDwarf() && "TODO: dwo files can't have relocations.");
// For DW_FORM_ref_addr, output the offset from beginning of debug info
// section. Origin->getOffset() returns the offset from start of the
// compile unit.
- DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
- Addr += Holder.getCUOffset(Origin->getCompileUnit());
+ CompileUnit *CU = CUDieMap.lookup(Origin->getCompileUnit());
+ assert(CU && "CUDie should belong to a CU.");
+ Addr += CU->getDebugInfoOffset();
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ Asm->EmitLabelPlusOffset(DwarfInfoSectionSym, Addr,
+ DIEEntry::getRefAddrSize(Asm));
+ else
+ Asm->EmitLabelOffsetDifference(DwarfInfoSectionSym, Addr,
+ DwarfInfoSectionSym,
+ DIEEntry::getRefAddrSize(Asm));
+ } else {
+ // Make sure Origin belongs to the same CU.
+ assert(Die->getCompileUnit() == Origin->getCompileUnit() &&
+ "The referenced DIE should belong to the same CU in ref4");
+ Asm->EmitInt32(Addr);
}
- Asm->OutStreamer.EmitIntValue(Addr,
- Form == dwarf::DW_FORM_ref_addr ? DIEEntry::getRefAddrSize(Asm) : 4);
break;
}
case dwarf::DW_AT_ranges: {
@@ -2088,7 +2115,7 @@ void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
case dwarf::DW_AT_location: {
if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) {
if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
- Asm->EmitLabelReference(L->getValue(), 4);
+ Asm->EmitSectionOffset(L->getValue(), DwarfDebugLocSectionSym);
else
Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
} else {
@@ -2142,20 +2169,10 @@ void DwarfUnits::emitUnits(DwarfDebug *DD,
TheCU->getUniqueID()));
// Emit size of content not including length itself
- unsigned ContentSize = Die->getSize() +
- sizeof(int16_t) + // DWARF version number
- sizeof(int32_t) + // Offset Into Abbrev. Section
- sizeof(int8_t); // Pointer Size (in bytes)
+ Asm->OutStreamer.AddComment("Length of Unit");
+ Asm->EmitInt32(TheCU->getHeaderSize() + Die->getSize());
- Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
- Asm->EmitInt32(ContentSize);
- Asm->OutStreamer.AddComment("DWARF version number");
- Asm->EmitInt16(DD->getDwarfVersion());
- Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
- Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
- ASectionSym);
- Asm->OutStreamer.AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+ TheCU->emitHeader(ASection, ASectionSym);
DD->emitDIE(Die, Abbreviations);
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(USection->getLabelEndName(),
@@ -2163,19 +2180,6 @@ void DwarfUnits::emitUnits(DwarfDebug *DD,
}
}
-/// For a given compile unit DIE, returns offset from beginning of debug info.
-unsigned DwarfUnits::getCUOffset(DIE *Die) {
- assert(Die->getTag() == dwarf::DW_TAG_compile_unit &&
- "Input DIE should be compile unit in getCUOffset.");
- for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
- E = CUs.end(); I != E; ++I) {
- CompileUnit *TheCU = *I;
- if (TheCU->getCUDie() == Die)
- return TheCU->getDebugInfoOffset();
- }
- llvm_unreachable("The compile unit DIE should belong to CUs in DwarfUnits.");
-}
-
// Emit the debug info section.
void DwarfDebug::emitDebugInfo() {
DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
@@ -2249,7 +2253,7 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
// Emit visible names into a hashed accelerator table section.
void DwarfDebug::emitAccelNames() {
- DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+ DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
dwarf::DW_FORM_data4));
for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
E = CUMap.end(); I != E; ++I) {
@@ -2278,7 +2282,7 @@ void DwarfDebug::emitAccelNames() {
// Emit objective C classes and categories into a hashed accelerator table
// section.
void DwarfDebug::emitAccelObjC() {
- DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+ DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
dwarf::DW_FORM_data4));
for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
E = CUMap.end(); I != E; ++I) {
@@ -2306,7 +2310,7 @@ void DwarfDebug::emitAccelObjC() {
// Emit namespace dies into a hashed accelerator table.
void DwarfDebug::emitAccelNamespaces() {
- DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+ DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
dwarf::DW_FORM_data4));
for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
E = CUMap.end(); I != E; ++I) {
@@ -2335,11 +2339,11 @@ void DwarfDebug::emitAccelNamespaces() {
// Emit type dies into a hashed accelerator table.
void DwarfDebug::emitAccelTypes() {
std::vector<DwarfAccelTable::Atom> Atoms;
- Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+ Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
dwarf::DW_FORM_data4));
- Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTag,
+ Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag,
dwarf::DW_FORM_data2));
- Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTypeFlags,
+ Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags,
dwarf::DW_FORM_data1));
DwarfAccelTable AT(Atoms);
for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
@@ -2367,23 +2371,85 @@ void DwarfDebug::emitAccelTypes() {
AT.Emit(Asm, SectionBegin, &InfoHolder);
}
-/// emitDebugPubnames - Emit visible names into a debug pubnames section.
+// Public name handling.
+// The format for the various pubnames:
+//
+// dwarf pubnames - offset/name pairs where the offset is the offset into the CU
+// for the DIE that is named.
+//
+// gnu pubnames - offset/index value/name tuples where the offset is the offset
+// into the CU and the index value is computed according to the type of value
+// for the DIE that is named.
+//
+// For type units the offset is the offset of the skeleton DIE. For split dwarf
+// it's the offset within the debug_info/debug_types dwo section; however, the
+// reference in the pubnames header doesn't change.
+
+/// computeIndexValue - Compute the gdb index value for the DIE and CU.
+static dwarf::PubIndexEntryDescriptor computeIndexValue(CompileUnit *CU,
+ DIE *Die) {
+ dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
+
+ // We could have a specification DIE that has most of our knowledge;
+ // look for that now.
+ DIEValue *SpecVal = Die->findAttribute(dwarf::DW_AT_specification);
+ if (SpecVal) {
+ DIE *SpecDIE = cast<DIEEntry>(SpecVal)->getEntry();
+ if (SpecDIE->findAttribute(dwarf::DW_AT_external))
+ Linkage = dwarf::GIEL_EXTERNAL;
+ } else if (Die->findAttribute(dwarf::DW_AT_external))
+ Linkage = dwarf::GIEL_EXTERNAL;
+
+ switch (Die->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_enumeration_type:
+ return dwarf::PubIndexEntryDescriptor(
+ dwarf::GIEK_TYPE, CU->getLanguage() != dwarf::DW_LANG_C_plus_plus
+ ? dwarf::GIEL_STATIC
+ : dwarf::GIEL_EXTERNAL);
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_base_type:
+ case dwarf::DW_TAG_subrange_type:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, dwarf::GIEL_STATIC);
+ case dwarf::DW_TAG_namespace:
+ return dwarf::GIEK_TYPE;
+ case dwarf::DW_TAG_subprogram:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
+ case dwarf::DW_TAG_constant:
+ case dwarf::DW_TAG_variable:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
+ case dwarf::DW_TAG_enumerator:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE,
+ dwarf::GIEL_STATIC);
+ default:
+ return dwarf::GIEK_NONE;
+ }
+}
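A short usage sketch of the classification above (the two DIEs are hypothetical and shown only to illustrate the mapping; the one-byte descriptor itself is produced by toBits() in emitDebugPubNames/emitDebugPubTypes below):

  // DIE *FnDie;  // a DW_TAG_subprogram carrying DW_AT_external
  //              // (directly or via its DW_AT_specification)
  // DIE *VarDie; // a file-static DW_TAG_variable with no DW_AT_external
  // computeIndexValue(CU, FnDie);  // -> {GIEK_FUNCTION, GIEL_EXTERNAL}
  // computeIndexValue(CU, VarDie); // -> {GIEK_VARIABLE, GIEL_STATIC}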
+
+/// emitDebugPubNames - Emit visible names into a debug pubnames section.
///
-void DwarfDebug::emitDebugPubnames() {
+void DwarfDebug::emitDebugPubNames(bool GnuStyle) {
const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+ const MCSection *PSec =
+ GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection()
+ : Asm->getObjFileLowering().getDwarfPubNamesSection();
typedef DenseMap<const MDNode*, CompileUnit*> CUMapType;
for (CUMapType::iterator I = CUMap.begin(), E = CUMap.end(); I != E; ++I) {
CompileUnit *TheCU = I->second;
unsigned ID = TheCU->getUniqueID();
- if (TheCU->getGlobalNames().empty())
- continue;
-
// Start the dwarf pubnames section.
- Asm->OutStreamer.SwitchSection(
- Asm->getObjFileLowering().getDwarfPubNamesSection());
+ Asm->OutStreamer.SwitchSection(PSec);
+
+ // Emit a label so we can reference the beginning of this pubnames section.
+ if (GnuStyle)
+ Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("gnu_pubnames",
+ TheCU->getUniqueID()));
+ // Emit the header.
Asm->OutStreamer.AddComment("Length of Public Names Info");
Asm->EmitLabelDifference(Asm->GetTempSymbol("pubnames_end", ID),
Asm->GetTempSymbol("pubnames_begin", ID), 4);
@@ -2391,7 +2457,7 @@ void DwarfDebug::emitDebugPubnames() {
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_begin", ID));
Asm->OutStreamer.AddComment("DWARF Version");
- Asm->EmitInt16(DwarfVersion);
+ Asm->EmitInt16(dwarf::DW_PUBNAMES_VERSION);
Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
@@ -2402,15 +2468,24 @@ void DwarfDebug::emitDebugPubnames() {
Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
4);
+ // Emit the pubnames for this compilation unit.
const StringMap<DIE*> &Globals = TheCU->getGlobalNames();
for (StringMap<DIE*>::const_iterator
GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
const char *Name = GI->getKeyData();
- const DIE *Entity = GI->second;
+ DIE *Entity = GI->second;
Asm->OutStreamer.AddComment("DIE offset");
Asm->EmitInt32(Entity->getOffset());
+ if (GnuStyle) {
+ dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheCU, Entity);
+ Asm->OutStreamer.AddComment(
+ Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+ dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+ Asm->EmitInt8(Desc.toBits());
+ }
+
if (Asm->isVerbose())
Asm->OutStreamer.AddComment("External Name");
Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
@@ -2422,55 +2497,78 @@ void DwarfDebug::emitDebugPubnames() {
}
}
-void DwarfDebug::emitDebugPubTypes() {
+void DwarfDebug::emitDebugPubTypes(bool GnuStyle) {
+ const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+ const MCSection *PSec =
+ GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection()
+ : Asm->getObjFileLowering().getDwarfPubTypesSection();
+
for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
- E = CUMap.end(); I != E; ++I) {
+ E = CUMap.end();
+ I != E; ++I) {
CompileUnit *TheCU = I->second;
// Start the dwarf pubtypes section.
- Asm->OutStreamer.SwitchSection(
- Asm->getObjFileLowering().getDwarfPubTypesSection());
+ Asm->OutStreamer.SwitchSection(PSec);
+
+ // Emit a label so we can reference the beginning of this pubtypes section.
+ if (GnuStyle)
+ Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("gnu_pubtypes",
+ TheCU->getUniqueID()));
+
+ // Emit the header.
Asm->OutStreamer.AddComment("Length of Public Types Info");
Asm->EmitLabelDifference(
- Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
- Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
+ Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
+ Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_begin",
- TheCU->getUniqueID()));
+ Asm->OutStreamer.EmitLabel(
+ Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()));
- if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DWARF Version");
- Asm->EmitInt16(DwarfVersion);
+ if (Asm->isVerbose())
+ Asm->OutStreamer.AddComment("DWARF Version");
+ Asm->EmitInt16(dwarf::DW_PUBTYPES_VERSION);
Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
- const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
- Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(),
- TheCU->getUniqueID()),
- DwarfInfoSectionSym);
+ Asm->EmitSectionOffset(
+ Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()),
+ DwarfInfoSectionSym);
Asm->OutStreamer.AddComment("Compilation Unit Length");
- Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(),
- TheCU->getUniqueID()),
- Asm->GetTempSymbol(ISec->getLabelBeginName(),
- TheCU->getUniqueID()),
- 4);
-
- const StringMap<DIE*> &Globals = TheCU->getGlobalTypes();
- for (StringMap<DIE*>::const_iterator
- GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+ Asm->EmitLabelDifference(
+ Asm->GetTempSymbol(ISec->getLabelEndName(), TheCU->getUniqueID()),
+ Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()), 4);
+
+ // Emit the pubtypes.
+ const StringMap<DIE *> &Globals = TheCU->getGlobalTypes();
+ for (StringMap<DIE *>::const_iterator GI = Globals.begin(),
+ GE = Globals.end();
+ GI != GE; ++GI) {
const char *Name = GI->getKeyData();
DIE *Entity = GI->second;
- if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
+ if (Asm->isVerbose())
+ Asm->OutStreamer.AddComment("DIE offset");
Asm->EmitInt32(Entity->getOffset());
- if (Asm->isVerbose()) Asm->OutStreamer.AddComment("External Name");
+ if (GnuStyle) {
+ dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheCU, Entity);
+ Asm->OutStreamer.AddComment(
+ Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+ dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+ Asm->EmitInt8(Desc.toBits());
+ }
+
+ if (Asm->isVerbose())
+ Asm->OutStreamer.AddComment("External Name");
+
// Emit the name with a terminating null byte.
- Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
+ Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength() + 1));
}
Asm->OutStreamer.AddComment("End Mark");
Asm->EmitInt32(0);
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_end",
- TheCU->getUniqueID()));
+ Asm->OutStreamer.EmitLabel(
+ Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()));
}
}
@@ -2649,18 +2747,178 @@ void DwarfDebug::emitDebugLoc() {
}
}
-// Emit visible names into a debug aranges section.
+struct SymbolCUSorter {
+ SymbolCUSorter(const MCStreamer &s) : Streamer(s) {}
+ const MCStreamer &Streamer;
+
+ bool operator() (const SymbolCU &A, const SymbolCU &B) {
+ unsigned IA = A.Sym ? Streamer.GetSymbolOrder(A.Sym) : 0;
+ unsigned IB = B.Sym ? Streamer.GetSymbolOrder(B.Sym) : 0;
+
+ // Symbols with no order assigned should be placed at the end.
+ // (e.g. section end labels)
+ if (IA == 0)
+ IA = (unsigned)(-1);
+ if (IB == 0)
+ IB = (unsigned)(-1);
+ return IA < IB;
+ }
+};
+
+static bool CUSort(const CompileUnit *A, const CompileUnit *B) {
+ return (A->getUniqueID() < B->getUniqueID());
+}
+
+struct ArangeSpan {
+ const MCSymbol *Start, *End;
+};
+
+// Emit a debug aranges section, containing a CU lookup for any
+// address we can tie back to a CU.
void DwarfDebug::emitDebugARanges() {
// Start the dwarf aranges section.
- Asm->OutStreamer.SwitchSection(
- Asm->getObjFileLowering().getDwarfARangesSection());
+ Asm->OutStreamer
+ .SwitchSection(Asm->getObjFileLowering().getDwarfARangesSection());
+
+ typedef DenseMap<CompileUnit *, std::vector<ArangeSpan> > SpansType;
+
+ SpansType Spans;
+
+ // Build a list of sections used.
+ std::vector<const MCSection *> Sections;
+ for (SectionMapType::iterator it = SectionMap.begin(); it != SectionMap.end();
+ it++) {
+ const MCSection *Section = it->first;
+ Sections.push_back(Section);
+ }
+
+ // Sort the sections into order.
+ // This is only done to ensure consistent output order across different runs.
+ std::sort(Sections.begin(), Sections.end(), SectionSort);
+
+ // Build a set of address spans, sorted by CU.
+ for (size_t SecIdx = 0; SecIdx < Sections.size(); SecIdx++) {
+ const MCSection *Section = Sections[SecIdx];
+ SmallVector<SymbolCU, 8> &List = SectionMap[Section];
+ if (List.size() < 2)
+ continue;
+
+ // Sort the symbols by offset within the section.
+ SymbolCUSorter sorter(Asm->OutStreamer);
+ std::sort(List.begin(), List.end(), sorter);
+
+ // If we have no section (e.g. common), just write out
+ // individual spans for each symbol.
+ if (Section == NULL) {
+ for (size_t n = 0; n < List.size(); n++) {
+ const SymbolCU &Cur = List[n];
+
+ ArangeSpan Span;
+ Span.Start = Cur.Sym;
+ Span.End = NULL;
+ if (Cur.CU)
+ Spans[Cur.CU].push_back(Span);
+ }
+ } else {
+ // Build spans between each label.
+ const MCSymbol *StartSym = List[0].Sym;
+ for (size_t n = 1; n < List.size(); n++) {
+ const SymbolCU &Prev = List[n - 1];
+ const SymbolCU &Cur = List[n];
+
+ // Try to build the longest span we can within the same CU.
+ if (Cur.CU != Prev.CU) {
+ ArangeSpan Span;
+ Span.Start = StartSym;
+ Span.End = Cur.Sym;
+ Spans[Prev.CU].push_back(Span);
+ StartSym = Cur.Sym;
+ }
+ }
+ }
+ }
+
+ const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+ unsigned PtrSize = Asm->getDataLayout().getPointerSize();
+
+ // Build a list of CUs used.
+ std::vector<CompileUnit *> CUs;
+ for (SpansType::iterator it = Spans.begin(); it != Spans.end(); it++) {
+ CompileUnit *CU = it->first;
+ CUs.push_back(CU);
+ }
+
+ // Sort the CU list (again, to ensure consistent output order).
+ std::sort(CUs.begin(), CUs.end(), CUSort);
+
+ // Emit an arange table for each CU we used.
+ for (size_t CUIdx = 0; CUIdx < CUs.size(); CUIdx++) {
+ CompileUnit *CU = CUs[CUIdx];
+ std::vector<ArangeSpan> &List = Spans[CU];
+
+ // Emit size of content not including length itself.
+ unsigned ContentSize
+ = sizeof(int16_t) // DWARF ARange version number
+ + sizeof(int32_t) // Offset of CU in the .debug_info section
+ + sizeof(int8_t) // Pointer Size (in bytes)
+ + sizeof(int8_t); // Segment Size (in bytes)
+
+ unsigned TupleSize = PtrSize * 2;
+
+ // Section 7.20 of the DWARF spec requires the table to be aligned on a
+ // tuple boundary.
+ unsigned Padding = 0;
+ while (((sizeof(int32_t) + ContentSize + Padding) % TupleSize) != 0)
+ Padding++;
+
+ ContentSize += Padding;
+ ContentSize += (List.size() + 1) * TupleSize;
+
+ // For each compile unit, write the list of spans it covers.
+ Asm->OutStreamer.AddComment("Length of ARange Set");
+ Asm->EmitInt32(ContentSize);
+ Asm->OutStreamer.AddComment("DWARF Arange version number");
+ Asm->EmitInt16(dwarf::DW_ARANGES_VERSION);
+ Asm->OutStreamer.AddComment("Offset Into Debug Info Section");
+ Asm->EmitSectionOffset(
+ Asm->GetTempSymbol(ISec->getLabelBeginName(), CU->getUniqueID()),
+ DwarfInfoSectionSym);
+ Asm->OutStreamer.AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(PtrSize);
+ Asm->OutStreamer.AddComment("Segment Size (in bytes)");
+ Asm->EmitInt8(0);
+
+ for (unsigned n = 0; n < Padding; n++)
+ Asm->EmitInt8(0xff);
+
+ for (unsigned n = 0; n < List.size(); n++) {
+ const ArangeSpan &Span = List[n];
+ Asm->EmitLabelReference(Span.Start, PtrSize);
+
+ // Calculate the size as the distance from the span start to its end.
+ if (Span.End) {
+ Asm->EmitLabelDifference(Span.End, Span.Start, PtrSize);
+ } else {
+ // For symbols without an end marker (e.g. common), we
+ // write a single arange entry containing just that one symbol.
+ uint64_t Size = SymSize[Span.Start];
+ if (Size == 0)
+ Size = 1;
+
+ Asm->OutStreamer.EmitIntValue(Size, PtrSize);
+ }
+ }
+
+ Asm->OutStreamer.AddComment("ARange terminator");
+ Asm->OutStreamer.EmitIntValue(0, PtrSize);
+ Asm->OutStreamer.EmitIntValue(0, PtrSize);
+ }
}
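To make the alignment arithmetic above concrete (hypothetical values, not from this patch): with an 8-byte pointer, TupleSize is 16, and the bytes in front of the tuples are 4 (length) + 2 (version) + 4 (debug_info offset) + 1 (address size) + 1 (segment size) = 12, so the loop settles on Padding = 4. For, say, two spans the emitted length field works out as:

  ContentSize = 8            // version + offset + addr size + seg size
              + 4            // padding so the tuples start 16-byte aligned
              + (2 + 1) * 16 // two spans plus the terminating zero tuple
              = 60           // total table size on disk: 4 + 60 = 64 bytes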
// Emit visible names into a debug ranges section.
void DwarfDebug::emitDebugRanges() {
// Start the dwarf ranges section.
- Asm->OutStreamer.SwitchSection(
- Asm->getObjFileLowering().getDwarfRangesSection());
+ Asm->OutStreamer
+ .SwitchSection(Asm->getObjFileLowering().getDwarfRangesSection());
unsigned char Size = Asm->getDataLayout().getPointerSize();
for (SmallVectorImpl<const MCSymbol *>::iterator
I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
@@ -2681,103 +2939,19 @@ void DwarfDebug::emitDebugMacInfo() {
}
}
-// Emit inline info using following format.
-// Section Header:
-// 1. length of section
-// 2. Dwarf version number
-// 3. address size.
-//
-// Entries (one "entry" for each function that was inlined):
-//
-// 1. offset into __debug_str section for MIPS linkage name, if exists;
-// otherwise offset into __debug_str for regular function name.
-// 2. offset into __debug_str section for regular function name.
-// 3. an unsigned LEB128 number indicating the number of distinct inlining
-// instances for the function.
-//
-// The rest of the entry consists of a {die_offset, low_pc} pair for each
-// inlined instance; the die_offset points to the inlined_subroutine die in the
-// __debug_info section, and the low_pc is the starting address for the
-// inlining instance.
-void DwarfDebug::emitDebugInlineInfo() {
- if (!Asm->MAI->doesDwarfUseInlineInfoSection())
- return;
-
- if (!FirstCU)
- return;
-
- Asm->OutStreamer.SwitchSection(
- Asm->getObjFileLowering().getDwarfDebugInlineSection());
-
- Asm->OutStreamer.AddComment("Length of Debug Inlined Information Entry");
- Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_inlined_end", 1),
- Asm->GetTempSymbol("debug_inlined_begin", 1), 4);
-
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_begin", 1));
-
- Asm->OutStreamer.AddComment("Dwarf Version");
- Asm->EmitInt16(DwarfVersion);
- Asm->OutStreamer.AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
-
- for (SmallVectorImpl<const MDNode *>::iterator I = InlinedSPNodes.begin(),
- E = InlinedSPNodes.end(); I != E; ++I) {
-
- const MDNode *Node = *I;
- InlineInfoMap::iterator II = InlineInfo.find(Node);
- SmallVectorImpl<InlineInfoLabels> &Labels = II->second;
- DISubprogram SP(Node);
- StringRef LName = SP.getLinkageName();
- StringRef Name = SP.getName();
-
- Asm->OutStreamer.AddComment("MIPS linkage name");
- if (LName.empty())
- Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
- DwarfStrSectionSym);
- else
- Asm->EmitSectionOffset(
- InfoHolder.getStringPoolEntry(Function::getRealLinkageName(LName)),
- DwarfStrSectionSym);
-
- Asm->OutStreamer.AddComment("Function name");
- Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
- DwarfStrSectionSym);
- Asm->EmitULEB128(Labels.size(), "Inline count");
-
- for (SmallVectorImpl<InlineInfoLabels>::iterator LI = Labels.begin(),
- LE = Labels.end(); LI != LE; ++LI) {
- if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
- Asm->EmitInt32(LI->second->getOffset());
-
- if (Asm->isVerbose()) Asm->OutStreamer.AddComment("low_pc");
- Asm->OutStreamer.EmitSymbolValue(LI->first,
- Asm->getDataLayout().getPointerSize());
- }
- }
-
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_end", 1));
-}
-
// DWARF5 Experimental Separate Dwarf emitters.
// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
// DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
-// DW_AT_ranges_base, DW_AT_addr_base. If DW_AT_ranges is present,
-// DW_AT_low_pc and DW_AT_high_pc are not used, and vice versa.
-CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
- DICompileUnit DIUnit(N);
- CompilationDir = DIUnit.getDirectory();
+// DW_AT_ranges_base, DW_AT_addr_base.
+CompileUnit *DwarfDebug::constructSkeletonCU(const CompileUnit *CU) {
DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
- CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
- DIUnit.getLanguage(), Die, N, Asm,
- this, &SkeletonHolder);
+ CompileUnit *NewCU = new CompileUnit(CU->getUniqueID(), Die, CU->getNode(),
+ Asm, this, &SkeletonHolder);
NewCU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name,
- DIUnit.getSplitDebugFilename());
-
- // This should be a unique identifier when we want to build .dwp files.
- NewCU->addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+ CU->getNode().getSplitDebugFilename());
// Relocate to the beginning of the addr_base section, else 0 for the
// beginning of the one for this compile unit.
@@ -2804,6 +2978,35 @@ CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
if (!CompilationDir.empty())
NewCU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+ // Flags to let the linker know we have emitted new-style pubnames.
+ if (GenerateGnuPubSections) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_sec_offset,
+ Asm->GetTempSymbol("gnu_pubnames", NewCU->getUniqueID()));
+ else
+ NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_data4,
+ Asm->GetTempSymbol("gnu_pubnames", NewCU->getUniqueID()),
+ DwarfGnuPubNamesSectionSym);
+
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_sec_offset,
+ Asm->GetTempSymbol("gnu_pubtypes", NewCU->getUniqueID()));
+ else
+ NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_data4,
+ Asm->GetTempSymbol("gnu_pubtypes", NewCU->getUniqueID()),
+ DwarfGnuPubTypesSectionSym);
+ }
+
+ // If any ranges have been emitted, record the base of the ranges section
+ // for this compile unit.
+ if (DebugRangeSymbols.size()) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ NewCU->addLabel(Die, dwarf::DW_AT_GNU_ranges_base,
+ dwarf::DW_FORM_sec_offset, DwarfDebugRangeSectionSym);
+ else
+ NewCU->addUInt(Die, dwarf::DW_AT_GNU_ranges_base, dwarf::DW_FORM_data4,
+ 0);
+ }
+
SkeletonHolder.addUnit(NewCU);
SkeletonCUs.push_back(NewCU);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index e14f9b1..cebac39 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -150,11 +150,12 @@ class DbgVariable {
DbgVariable *AbsVar; // Corresponding Abstract variable, if any.
const MachineInstr *MInsn; // DBG_VALUE instruction of the variable.
int FrameIndex;
+ DwarfDebug *DD;
public:
// AbsVar may be NULL.
- DbgVariable(DIVariable V, DbgVariable *AV)
+ DbgVariable(DIVariable V, DbgVariable *AV, DwarfDebug *DD)
: Var(V), TheDIE(0), DotDebugLocOffset(~0U), AbsVar(AV), MInsn(0),
- FrameIndex(~0) {}
+ FrameIndex(~0), DD(DD) {}
// Accessors.
DIVariable getVariable() const { return Var; }
@@ -169,7 +170,7 @@ public:
int getFrameIndex() const { return FrameIndex; }
void setFrameIndex(int FI) { FrameIndex = FI; }
// Translate tag to proper Dwarf tag.
- unsigned getTag() const {
+ uint16_t getTag() const {
if (Var.getTag() == dwarf::DW_TAG_arg_variable)
return dwarf::DW_TAG_formal_parameter;
@@ -208,6 +209,11 @@ public:
return Var.getAddrElement(i);
}
DIType getType() const;
+
+private:
+ /// resolve - Look in the DwarfDebug map for the MDNode that
+ /// corresponds to the reference.
+ template <typename T> T resolve(DIRef<T> Ref) const;
};
/// \brief Collects and handles information specific to a particular
@@ -220,7 +226,7 @@ class DwarfUnits {
FoldingSet<DIEAbbrev> *AbbreviationsSet;
// A list of all the unique abbreviations in use.
- std::vector<DIEAbbrev *> *Abbreviations;
+ std::vector<DIEAbbrev *> &Abbreviations;
// A pointer to all units in the section.
SmallVector<CompileUnit *, 1> CUs;
@@ -243,7 +249,7 @@ class DwarfUnits {
public:
DwarfUnits(AsmPrinter *AP, FoldingSet<DIEAbbrev> *AS,
- std::vector<DIEAbbrev *> *A, const char *Pref,
+ std::vector<DIEAbbrev *> &A, const char *Pref,
BumpPtrAllocator &DA)
: Asm(AP), AbbreviationsSet(AS), Abbreviations(A), StringPool(DA),
NextStringPoolNumber(0), StringPref(Pref), AddressPool(),
@@ -294,10 +300,13 @@ public:
/// \brief Returns the address pool.
AddrPool *getAddrPool() { return &AddressPool; }
+};
- /// \brief for a given compile unit DIE, returns offset from beginning of
- /// debug info.
- unsigned getCUOffset(DIE *Die);
+/// \brief Helper used to pair up a symbol and its DWARF compile unit.
+struct SymbolCU {
+ SymbolCU(CompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
+ const MCSymbol *Sym;
+ CompileUnit *CU;
};
/// \brief Collects and handles dwarf debug information.
@@ -320,6 +329,14 @@ class DwarfDebug {
// Maps subprogram MDNode with its corresponding CompileUnit.
DenseMap <const MDNode *, CompileUnit *> SPMap;
+ // Maps a CU DIE with its corresponding CompileUnit.
+ DenseMap <const DIE *, CompileUnit *> CUDieMap;
+
+ /// Maps type-system MDNodes to their corresponding DIEs. These DIEs can
+ /// be shared across CUs, which is why we keep the map here instead
+ /// of in CompileUnit.
+ DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap;
+
// Used to uniquely define abbreviations.
FoldingSet<DIEAbbrev> AbbreviationsSet;
@@ -332,8 +349,15 @@ class DwarfDebug {
// separated by a zero byte, mapped to a unique id.
StringMap<unsigned, BumpPtrAllocator&> SourceIdMap;
+ // List of all labels used in aranges generation.
+ std::vector<SymbolCU> ArangeLabels;
+
+ // Size of each symbol emitted (for those symbols that have a specific size).
+ DenseMap <const MCSymbol *, uint64_t> SymSize;
+
// Provides a unique id per text section.
- SetVector<const MCSection*> SectionMap;
+ typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType;
+ SectionMapType SectionMap;
// List of arguments for current function.
SmallVector<DbgVariable *, 8> CurrentFnArguments;
@@ -358,14 +382,6 @@ class DwarfDebug {
// as DW_AT_inline.
SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
- // Keep track of inlined functions and their location. This
- // information is used to populate the debug_inlined section.
- typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
- typedef DenseMap<const MDNode *,
- SmallVector<InlineInfoLabels, 4> > InlineInfoMap;
- InlineInfoMap InlineInfo;
- SmallVector<const MDNode *, 4> InlinedSPNodes;
-
// This is a collection of subprogram MDNodes that are processed to
// create DIEs.
SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
@@ -406,6 +422,7 @@ class DwarfDebug {
MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym, *DwarfAddrSectionSym;
MCSymbol *FunctionBeginSym, *FunctionEndSym;
MCSymbol *DwarfAbbrevDWOSectionSym, *DwarfStrDWOSectionSym;
+ MCSymbol *DwarfGnuPubNamesSectionSym, *DwarfGnuPubTypesSectionSym;
// As an optimization, there is no need to emit an entry in the directory
// table for the same directory as DW_AT_comp_dir.
@@ -420,9 +437,6 @@ class DwarfDebug {
// Holders for the various debug information flags that we might need to
// have exposed. See accessor functions below for description.
- // Whether or not we're emitting info for older versions of gdb on darwin.
- bool IsDarwinGDBCompat;
-
// Holder for imported entities.
typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32>
ImportedEntityMap;
@@ -431,12 +445,16 @@ class DwarfDebug {
// Holder for types that are going to be extracted out into a type unit.
std::vector<DIE *> TypeUnits;
+ // Whether to emit the pubnames/pubtypes sections.
+ bool HasDwarfPubSections;
+
+ // Version of dwarf we're emitting.
+ unsigned DwarfVersion;
+
// DWARF5 Experimental Options
bool HasDwarfAccelTables;
bool HasSplitDwarf;
- unsigned DwarfVersion;
-
// Separated Dwarf Variables
// In general these will all be for bits that are left in the
// original object file, rather than things that are meant
@@ -454,6 +472,9 @@ class DwarfDebug {
// Holder for the skeleton information.
DwarfUnits SkeletonHolder;
+ // Maps from a type identifier to the actual MDNode.
+ DITypeIdentifierMap TypeIdentifierMap;
+
private:
void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
@@ -465,11 +486,14 @@ private:
/// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
/// variables in this scope then create and insert DIEs for these
/// variables.
- DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, const MDNode *SPNode);
+ DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, DISubprogram SP);
/// \brief Construct new DW_TAG_lexical_block for this scope and
/// attach DW_AT_low_pc/DW_AT_high_pc labels.
DIE *constructLexicalScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+ /// A helper function to check whether the DIE for a given Scope is going
+ /// to be null.
+ bool isLexicalScopeDIENull(LexicalScope *Scope);
/// \brief This scope represents inlined body of a function. Construct
/// DIE to represent this concrete inlined copy of the function.
@@ -477,6 +501,9 @@ private:
/// \brief Construct a DIE for this scope.
DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+ /// A helper function to create children of a Scope DIE.
+ DIE *createScopeChildrenDIE(CompileUnit *TheCU, LexicalScope *Scope,
+ SmallVectorImpl<DIE*> &Children);
/// \brief Emit initial Dwarf sections with a label at the start of each one.
void emitSectionLabels();
@@ -528,10 +555,16 @@ private:
void emitAccelTypes();
/// \brief Emit visible names into a debug pubnames section.
- void emitDebugPubnames();
+ /// \param GnuStyle determines whether or not we want to emit
+ /// additional information into the table, as newer GCC does for the
+ /// gdb index.
+ void emitDebugPubNames(bool GnuStyle = false);
/// \brief Emit visible types into a debug pubtypes section.
- void emitDebugPubTypes();
+ /// \param GnuStyle determines whether or not we want to emit
+ /// additional information into the table, as newer GCC does for the
+ /// gdb index.
+ void emitDebugPubTypes(bool GnuStyle = false);
/// \brief Emit visible names into a debug str section.
void emitDebugStr();
@@ -555,7 +588,7 @@ private:
/// \brief Construct the split debug info compile unit for the debug info
/// section.
- CompileUnit *constructSkeletonCU(const MDNode *);
+ CompileUnit *constructSkeletonCU(const CompileUnit *CU);
/// \brief Emit the local split abbreviations.
void emitSkeletonAbbrevs(const MCSection *);
@@ -571,7 +604,7 @@ private:
/// \brief Create new CompileUnit for the given metadata node with tag
/// DW_TAG_compile_unit.
- CompileUnit *constructCompileUnit(const MDNode *N);
+ CompileUnit *constructCompileUnit(DICompileUnit DIUnit);
/// \brief Construct subprogram DIE.
void constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N);
@@ -633,7 +666,13 @@ public:
// Main entry points.
//
DwarfDebug(AsmPrinter *A, Module *M);
- ~DwarfDebug();
+
+ void insertDIE(const MDNode *TypeMD, DIE *Die) {
+ MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
+ }
+ DIE *getDIE(const MDNode *TypeMD) {
+ return MDTypeNodeToDieMap.lookup(TypeMD);
+ }
/// \brief Emit all Dwarf sections that should come prior to the
/// content.
@@ -658,6 +697,13 @@ public:
/// type units.
void addTypeUnitType(DIE *Die) { TypeUnits.push_back(Die); }
+ /// \brief Add a label so that arange data can be generated for it.
+ void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
+
+ /// \brief For symbols that have a size designated (e.g. common symbols),
+ /// this tracks that size.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) { SymSize[Sym] = Size;}
+
/// \brief Look up the source id with the given directory and source file
/// names. If none currently exists, create a new id and insert it in the
/// SourceIds map.
@@ -665,11 +711,7 @@ public:
unsigned CUID);
/// \brief Recursively Emits a debug information entry.
- void emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs);
-
- /// \brief Returns whether or not to limit some of our debug
- /// output to the limitations of darwin gdb.
- bool useDarwinGDBCompat() { return IsDarwinGDBCompat; }
+ void emitDIE(DIE *Die, ArrayRef<DIEAbbrev *> Abbrevs);
// Experimental DWARF5 features.
@@ -683,6 +725,16 @@ public:
/// Returns the Dwarf Version.
unsigned getDwarfVersion() const { return DwarfVersion; }
+
+ /// Find the MDNode for the given reference.
+ template <typename T> T resolve(DIRef<T> Ref) const {
+ return Ref.resolve(TypeIdentifierMap);
+ }
+
+ /// isSubprogramContext - Return true if Context is either a subprogram
+ /// or another context nested inside a subprogram.
+ bool isSubprogramContext(const MDNode *Context);
+
};
} // End of namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 49a85d8..1575161 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -29,6 +29,7 @@ class MCAsmInfo;
class MCExpr;
class MCSymbol;
class Function;
+class ARMTargetStreamer;
class AsmPrinter;
//===----------------------------------------------------------------------===//
@@ -177,6 +178,8 @@ public:
class ARMException : public DwarfException {
void EmitTypeInfos(unsigned TTypeEncoding);
+ ARMTargetStreamer &getTargetStreamer();
+
public:
//===--------------------------------------------------------------------===//
// Main entry points.
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index b48b817..24aa1ab 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -83,6 +83,8 @@ public:
virtual unsigned getJumpBufAlignment() const;
virtual unsigned getJumpBufSize() const;
virtual bool shouldBuildLookupTables() const;
+ virtual bool haveFastSqrt(Type *Ty) const;
+ virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
/// @}
@@ -111,6 +113,7 @@ public:
ArrayRef<Type*> Tys) const;
virtual unsigned getNumberOfParts(Type *Tp) const;
virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
+ virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) const;
/// @}
};
@@ -182,6 +185,14 @@ bool BasicTTI::shouldBuildLookupTables() const {
TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
}
+bool BasicTTI::haveFastSqrt(Type *Ty) const {
+ const TargetLoweringBase *TLI = getTLI();
+ EVT VT = TLI->getValueType(Ty);
+ return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
+}
+
+void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+
//===----------------------------------------------------------------------===//
//
// Calls used by the vectorizers.
@@ -443,12 +454,14 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::log10: ISD = ISD::FLOG10; break;
case Intrinsic::log2: ISD = ISD::FLOG2; break;
case Intrinsic::fabs: ISD = ISD::FABS; break;
+ case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
case Intrinsic::floor: ISD = ISD::FFLOOR; break;
case Intrinsic::ceil: ISD = ISD::FCEIL; break;
case Intrinsic::trunc: ISD = ISD::FTRUNC; break;
case Intrinsic::nearbyint:
ISD = ISD::FNEARBYINT; break;
case Intrinsic::rint: ISD = ISD::FRINT; break;
+ case Intrinsic::round: ISD = ISD::FROUND; break;
case Intrinsic::pow: ISD = ISD::FPOW; break;
case Intrinsic::fma: ISD = ISD::FMA; break;
case Intrinsic::fmuladd: ISD = ISD::FMA; break; // FIXME: mul + add?
@@ -498,3 +511,17 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
return 0;
}
+
+unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const {
+ assert(Ty->isVectorTy() && "Expect a vector type");
+ unsigned NumVecElts = Ty->getVectorNumElements();
+ unsigned NumReduxLevels = Log2_32(NumVecElts);
+ unsigned ArithCost = NumReduxLevels *
+ TopTTI->getArithmeticInstrCost(Opcode, Ty);
+ // Assume the pairwise shuffles add a cost.
+ unsigned ShuffleCost =
+ NumReduxLevels * (IsPairwise + 1) *
+ TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+ return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+}
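As a worked example of the formula above (per-operation costs are hypothetical): for an 8-element vector the reduction tree has log2(8) = 3 levels, and a pairwise reduction pays for two shuffles per level while a split reduction pays for one:

  NumVecElts  = 8              -> NumReduxLevels = log2(8) = 3
  ArithCost   = 3 * 1 = 3      // assuming unit cost per vector arithmetic op
  ShuffleCost = 3 * 1 * 1 = 3  // split reduction (IsPairwise = false)
              = 3 * 2 * 1 = 6  // pairwise reduction (IsPairwise = true)
  Total       = ArithCost + ShuffleCost + scalarization overhead
                (the cost of extracting the final scalar result)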
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index 26bdca9..0d15ed7 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -1,4 +1,4 @@
-//===-- BranchFolding.h - Fold machine code branch instructions --*- C++ -*===//
+//===-- BranchFolding.h - Fold machine code branch instructions -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 56aa330..10cc9ff 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(LLVMCodeGen
LiveRangeCalc.cpp
LiveRangeEdit.cpp
LiveRegMatrix.cpp
+ LiveRegUnits.cpp
LiveStackAnalysis.cpp
LiveVariables.cpp
LocalStackSlotAllocation.cpp
@@ -88,7 +89,6 @@ add_llvm_library(LLVMCodeGen
ScheduleDAGPrinter.cpp
ScoreboardHazardRecognizer.cpp
ShadowStackGC.cpp
- ShrinkWrapping.cpp
SjLjEHPrepare.cpp
SlotIndexes.cpp
SpillPlacement.cpp
@@ -97,7 +97,7 @@ add_llvm_library(LLVMCodeGen
StackColoring.cpp
StackProtector.cpp
StackSlotColoring.cpp
- StrongPHIElimination.cpp
+ StackMaps.cpp
TailDuplication.cpp
TargetFrameLoweringImpl.cpp
TargetInstrInfo.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index b03c325..4925c4d 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -9,14 +9,12 @@
#define DEBUG_TYPE "calcspillweights"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -24,38 +22,22 @@
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
-char CalculateSpillWeights::ID = 0;
-INITIALIZE_PASS_BEGIN(CalculateSpillWeights, "calcspillweights",
- "Calculate spill weights", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(CalculateSpillWeights, "calcspillweights",
- "Calculate spill weights", false, false)
-
-void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
- au.addRequired<LiveIntervals>();
- au.addRequired<MachineBlockFrequencyInfo>();
- au.addRequired<MachineLoopInfo>();
- au.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(au);
-}
-
-bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
-
+void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
+ MachineFunction &MF,
+ const MachineLoopInfo &MLI,
+ const MachineBlockFrequencyInfo &MBFI,
+ VirtRegAuxInfo::NormalizingFn norm) {
DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
<< "********** Function: " << MF.getName() << '\n');
- LiveIntervals &LIS = getAnalysis<LiveIntervals>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>(),
- getAnalysis<MachineBlockFrequencyInfo>());
+ VirtRegAuxInfo VRAI(MF, LIS, MLI, MBFI, norm);
for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (MRI.reg_nodbg_empty(Reg))
continue;
- VRAI.CalculateWeightAndHint(LIS.getInterval(Reg));
+ VRAI.calculateSpillWeightAndHint(LIS.getInterval(Reg));
}
- return false;
}
// Return the preferred allocation register for reg, given a COPY instruction.
@@ -111,7 +93,7 @@ static bool isRematerializable(const LiveInterval &LI,
}
void
-VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
+VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
MachineRegisterInfo &mri = MF.getRegInfo();
const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
MachineBasicBlock *mbb = 0;
@@ -201,5 +183,5 @@ VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo()))
totalWeight *= 0.5F;
- li.weight = normalizeSpillWeight(totalWeight, li.getSize());
+ li.weight = normalize(totalWeight, li.getSize());
}
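A hypothetical caller sketch (not part of this patch) for the new free function: a pass that previously relied on the removed CalculateSpillWeights pass could now compute the weights directly from its own analyses.

  // Inside some MachineFunctionPass::runOnMachineFunction(MachineFunction &MF):
  //   calculateSpillWeightsAndHints(getAnalysis<LiveIntervals>(), MF,
  //                                 getAnalysis<MachineLoopInfo>(),
  //                                 getAnalysis<MachineBlockFrequencyInfo>());
  // This assumes the declaration supplies a default for the `norm` argument;
  // otherwise pass a normalizing function such as normalizeSpillWeight.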
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index c641991..7430c53 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeBasicTTIPass(Registry);
initializeBranchFolderPassPass(Registry);
- initializeCalculateSpillWeightsPass(Registry);
initializeDeadMachineInstructionElimPass(Registry);
initializeEarlyIfConverterPass(Registry);
initializeExpandPostRAPass(Registry);
@@ -60,7 +59,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeStackProtectorPass(Registry);
initializeStackColoringPass(Registry);
initializeStackSlotColoringPass(Registry);
- initializeStrongPHIEliminationPass(Registry);
initializeTailDuplicatePassPass(Registry);
initializeTargetPassConfigPass(Registry);
initializeTwoAddressInstructionPassPass(Registry);
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index 840a101..6619bcf 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -160,7 +160,8 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
MachineBasicBlock::iterator EndItr) {
assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
VLIWScheduler->startBlock(MBB);
- VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size());
+ VLIWScheduler->enterRegion(MBB, BeginItr, EndItr,
+ std::distance(BeginItr, EndItr));
VLIWScheduler->schedule();
// Generate MI -> SU map.
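// Editor's note, not part of the patch: enterRegion() expects the number of
// instructions in the half-open region [BeginItr, EndItr), which may be only
// part of the block, so MBB->size() can over-count. A self-contained sketch
// of the distinction, using plain pointers as stand-in iterators:
#include <cassert>
#include <iterator>

static void regionCountVersusBlockSize() {
  int Block[5] = {0, 1, 2, 3, 4};   // stands in for a basic block
  int *BeginItr = Block + 1;        // region starts after one instruction
  int *EndItr = Block + 5;          // region runs to the end of the block
  assert(std::distance(BeginItr, EndItr) == 4);   // the new argument
  assert(sizeof(Block) / sizeof(Block[0]) == 5);  // the old MBB->size() value
}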
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index e277f5c..031f19c 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -23,6 +23,7 @@
#define DEBUG_TYPE "execution-fix"
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Allocator.h"
@@ -136,6 +137,12 @@ class ExeDepsFix : public MachineFunctionPass {
typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
LiveOutMap LiveOuts;
+ /// List of undefined register reads in this block in forward order.
+ std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
+
+ /// Storage for register unit liveness.
+ LiveRegUnits LiveUnits;
+
/// Current instruction number.
/// The first instruction in each basic block is 0.
int CurInstr;
@@ -185,6 +192,8 @@ private:
void processDefs(MachineInstr*, bool Kill);
void visitSoftInstr(MachineInstr*, unsigned mask);
void visitHardInstr(MachineInstr*, unsigned domain);
+ bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
+ void processUndefReads(MachineBasicBlock*);
};
}
@@ -341,6 +350,10 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
// Reset instruction counter in each basic block.
CurInstr = 0;
+ // Set up UndefReads to track undefined register reads.
+ UndefReads.clear();
+ LiveUnits.clear();
+
// Set up LiveRegs to represent registers entering MBB.
if (!LiveRegs)
LiveRegs = new LiveReg[NumRegs];
@@ -448,10 +461,46 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
processDefs(MI, !DomP.first);
}
+/// \brief Return true if it makes sense to break dependence on a partial def
+/// or undef use.
+bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
+ int rx = regIndex(MI->getOperand(OpIdx).getReg());
+ if (rx < 0)
+ return false;
+
+ unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+ DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+
+ if (Pref > Clearance) {
+ DEBUG(dbgs() << ": Break dependency.\n");
+ return true;
+ }
+ // The current clearance seems OK, but we may be ignoring a def from a
+ // back-edge.
+ if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+ DEBUG(dbgs() << ": OK .\n");
+ return false;
+ }
+ // A def from an unprocessed back-edge may make us break this dependency.
+ DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+ return false;
+}
+
// Update def-ages for registers defined by MI.
// If Kill is set, also kill off DomainValues clobbered by the defs.
+//
+// Also break dependencies on partial defs and undef uses.
void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
assert(!MI->isDebugValue() && "Won't process debug values");
+
+ // Break dependence on undef uses. Do this before updating LiveRegs below.
+ unsigned OpNum;
+ unsigned Pref = TII->getUndefRegClearance(MI, OpNum, TRI);
+ if (Pref) {
+ if (shouldBreakDependence(MI, OpNum, Pref))
+ UndefReads.push_back(std::make_pair(MI, OpNum));
+ }
const MCInstrDesc &MCID = MI->getDesc();
for (unsigned i = 0,
e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
@@ -471,37 +520,58 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
<< '\t' << *MI);
+ // Check clearance before partial register updates.
+ // Call breakDependence before setting LiveRegs[rx].Def.
+ unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+ if (Pref && shouldBreakDependence(MI, i, Pref))
+ TII->breakPartialRegDependency(MI, i, TRI);
+
// How many instructions since rx was last written?
- unsigned Clearance = CurInstr - LiveRegs[rx].Def;
LiveRegs[rx].Def = CurInstr;
// Kill off domains redefined by generic instructions.
if (Kill)
kill(rx);
+ }
+ ++CurInstr;
+}
- // Verify clearance before partial register updates.
- unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
- if (!Pref)
- continue;
- DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
- if (Pref > Clearance) {
- DEBUG(dbgs() << ": Break dependency.\n");
- TII->breakPartialRegDependency(MI, i, TRI);
- continue;
- }
-
- // The current clearance seems OK, but we may be ignoring a def from a
- // back-edge.
- if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
- DEBUG(dbgs() << ": OK.\n");
- continue;
- }
+/// \brief Break false dependencies on undefined register reads.
+///
+/// Walk the block backward computing precise liveness. This is expensive, so we
+/// only do it on demand. Note that the occurrence of undefined register reads
+/// that should be broken is very rare, but when they occur we may have many in
+/// a single block.
+void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+ if (UndefReads.empty())
+ return;
- // A def from an unprocessed back-edge may make us break this dependency.
- DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+ // Collect this block's live out register units.
+ LiveUnits.init(TRI);
+ for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ LiveUnits.addLiveIns(*SI, *TRI);
}
+ MachineInstr *UndefMI = UndefReads.back().first;
+ unsigned OpIdx = UndefReads.back().second;
- ++CurInstr;
+ for (MachineBasicBlock::reverse_iterator I = MBB->rbegin(), E = MBB->rend();
+ I != E; ++I) {
+ // Update liveness, including the current instruction's defs.
+ LiveUnits.stepBackward(*I, *TRI);
+
+ if (UndefMI == &*I) {
+ if (!LiveUnits.contains(UndefMI->getOperand(OpIdx).getReg(), *TRI))
+ TII->breakPartialRegDependency(UndefMI, OpIdx, TRI);
+
+ UndefReads.pop_back();
+ if (UndefReads.empty())
+ return;
+
+ UndefMI = UndefReads.back().first;
+ OpIdx = UndefReads.back().second;
+ }
+ }
}
// A hard instruction only works in one domain. All input registers will be
@@ -549,7 +619,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
// Is it possible to use this collapsed register for free?
if (dv->isCollapsed()) {
// Restrict available domains to the ones in common with the operand.
- // If there are no common domains, we must pay the cross-domain
+ // If there are no common domains, we must pay the cross-domain
// penalty for this operand.
if (common) available = common;
} else if (common)
@@ -686,6 +756,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
++I)
visitInstr(I);
+ processUndefReads(MBB);
leaveBasicBlock(MBB);
}
@@ -698,6 +769,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
++I)
if (!I->isDebugValue())
processDefs(I, false);
+ processUndefReads(MBB);
leaveBasicBlock(MBB);
}
@@ -713,6 +785,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
delete[] FI->second;
}
LiveOuts.clear();
+ UndefReads.clear();
Avail.clear();
Allocator.DestroyAll();
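// Editor's note, not part of the patch: the shape of the on-demand backward
// liveness scan that processUndefReads() above performs, shown in isolation.
// The helper name and single (instruction, register) query are illustrative
// only; the LiveRegUnits calls mirror those in the hunk.
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

// Hypothetical helper: is Reg dead immediately before MI executes in MBB?
static bool isDeadAtInstr(MachineBasicBlock *MBB, MachineInstr *MI,
                          unsigned Reg, const TargetRegisterInfo *TRI) {
  LiveRegUnits Units;
  Units.init(TRI);
  // Seed with everything live out of the block.
  for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
       SE = MBB->succ_end(); SI != SE; ++SI)
    Units.addLiveIns(*SI, *TRI);
  // Walk backward, updating liveness, until the query instruction is reached.
  for (MachineBasicBlock::reverse_iterator I = MBB->rbegin(), E = MBB->rend();
       I != E; ++I) {
    Units.stepBackward(*I, *TRI);
    if (&*I == MI)
      return !Units.contains(Reg, *TRI);
  }
  return false;  // MI not found in MBB; conservatively report "live".
}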
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 1611db8..6c73fff 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -104,7 +104,7 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
}
if (DstSubReg == InsReg) {
- // No need to insert an identify copy instruction.
+ // No need to insert an identity copy instruction.
// Watch out for case like this:
// %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
// We must leave %RAX live.
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 1ae7e3b..e2d0eb4 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -22,6 +22,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -31,6 +33,8 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
using namespace llvm;
// Hidden options for help debugging.
@@ -150,14 +154,17 @@ namespace {
/// BBAnalysis - Results of if-conversion feasibility analysis indexed by
/// basic block number.
std::vector<BBInfo> BBAnalysis;
+ TargetSchedModel SchedModel;
const TargetLoweringBase *TLI;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
- const InstrItineraryData *InstrItins;
const MachineBranchProbabilityInfo *MBPI;
MachineRegisterInfo *MRI;
+ LiveRegUnits Redefs;
+ LiveRegUnits DontKill;
+
bool PreRegAlloc;
bool MadeChange;
int FnNum;
@@ -198,11 +205,9 @@ namespace {
void PredicateBlock(BBInfo &BBI,
MachineBasicBlock::iterator E,
SmallVectorImpl<MachineOperand> &Cond,
- SmallSet<unsigned, 4> &Redefs,
SmallSet<unsigned, 4> *LaterRedefs = 0);
void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
SmallVectorImpl<MachineOperand> &Cond,
- SmallSet<unsigned, 4> &Redefs,
bool IgnoreBr = false);
void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);
@@ -267,7 +272,11 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getTarget().getRegisterInfo();
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
MRI = &MF.getRegInfo();
- InstrItins = MF.getTarget().getInstrItineraryData();
+
+ const TargetSubtargetInfo &ST =
+ MF.getTarget().getSubtarget<TargetSubtargetInfo>();
+ SchedModel.init(*ST.getSchedModel(), &ST, TII);
+
if (!TII) return false;
PreRegAlloc = MRI->isSSA();
@@ -666,32 +675,28 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
bool isPredicated = TII->isPredicated(I);
bool isCondBr = BBI.IsBrAnalyzable && I->isConditionalBranch();
- if (!isCondBr) {
- if (!isPredicated) {
- BBI.NonPredSize++;
- unsigned ExtraPredCost = 0;
- unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I,
- &ExtraPredCost);
- if (NumCycles > 1)
- BBI.ExtraCost += NumCycles-1;
- BBI.ExtraCost2 += ExtraPredCost;
- } else if (!AlreadyPredicated) {
- // FIXME: This instruction is already predicated before the
- // if-conversion pass. It's probably something like a conditional move.
- // Mark this block unpredicable for now.
- BBI.IsUnpredicable = true;
- return;
- }
+ // A conditional branch is not predicable, but it may be eliminated.
+ if (isCondBr)
+ continue;
+
+ if (!isPredicated) {
+ BBI.NonPredSize++;
+ unsigned ExtraPredCost = TII->getPredicationCost(&*I);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
+ if (NumCycles > 1)
+ BBI.ExtraCost += NumCycles-1;
+ BBI.ExtraCost2 += ExtraPredCost;
+ } else if (!AlreadyPredicated) {
+ // FIXME: This instruction is already predicated before the
+ // if-conversion pass. It's probably something like a conditional move.
+ // Mark this block unpredicable for now.
+ BBI.IsUnpredicable = true;
+ return;
}
if (BBI.ClobbersPred && !isPredicated) {
// Predicate modification instruction should end the block (except for
// already predicated instructions and end of block branches).
- if (isCondBr) {
- // A conditional branch is not predicable, but it may be eliminated.
- continue;
- }
-
// Predicate may have been modified, the subsequent (currently)
// unpredicated instructions cannot be correctly predicated.
BBI.IsUnpredicable = true;
@@ -961,64 +966,58 @@ void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
}
-/// InitPredRedefs / UpdatePredRedefs - Defs by predicated instructions are
-/// modeled as read + write (sort like two-address instructions). These
-/// routines track register liveness and add implicit uses to if-converted
-/// instructions to conform to the model.
-static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
- const TargetRegisterInfo *TRI) {
- for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
- E = BB->livein_end(); I != E; ++I) {
- unsigned Reg = *I;
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
- SubRegs.isValid(); ++SubRegs)
- Redefs.insert(*SubRegs);
- }
-}
-
-static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
- const TargetRegisterInfo *TRI,
- bool AddImpUse = false) {
- SmallVector<unsigned, 4> Defs;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
+/// Behaves like LiveRegUnits::stepForward() but also adds implicit uses to all
+/// values defined in MI which are not live/used by MI.
+static void UpdatePredRedefs(MachineInstr *MI, LiveRegUnits &Redefs,
+ const TargetRegisterInfo *TRI) {
+ for (ConstMIBundleOperands Ops(MI); Ops.isValid(); ++Ops) {
+ if (!Ops->isReg() || !Ops->isKill())
continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
+ unsigned Reg = Ops->getReg();
+ if (Reg == 0)
continue;
- if (MO.isDef())
- Defs.push_back(Reg);
- else if (MO.isKill()) {
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
- SubRegs.isValid(); ++SubRegs)
- Redefs.erase(*SubRegs);
- }
+ Redefs.removeReg(Reg, *TRI);
}
- MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
- for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
- unsigned Reg = Defs[i];
- if (!Redefs.insert(Reg)) {
- if (AddImpUse)
- // Treat predicated update as read + write.
- MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
- } else {
- for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
- Redefs.insert(*SubRegs);
- }
+ for (MIBundleOperands Ops(MI); Ops.isValid(); ++Ops) {
+ if (!Ops->isReg() || !Ops->isDef())
+ continue;
+ unsigned Reg = Ops->getReg();
+ if (Reg == 0 || Redefs.contains(Reg, *TRI))
+ continue;
+ Redefs.addReg(Reg, *TRI);
+
+ MachineOperand &Op = *Ops;
+ MachineInstr *MI = Op.getParent();
+ MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
}
}
-static void UpdatePredRedefs(MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator E,
- SmallSet<unsigned,4> &Redefs,
- const TargetRegisterInfo *TRI) {
- while (I != E) {
- UpdatePredRedefs(I, Redefs, TRI);
- ++I;
+/**
+ * Remove kill flags from operands with registers in the @p DontKill set.
+ */
+static void RemoveKills(MachineInstr &MI, const LiveRegUnits &DontKill,
+ const MCRegisterInfo &MCRI) {
+ for (MIBundleOperands O(&MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->isKill())
+ continue;
+ if (DontKill.contains(O->getReg(), MCRI))
+ O->setIsKill(false);
}
}
+/**
+ * Walks a range of machine instructions and removes kill flags for registers
+ * in the @p DontKill set.
+ */
+static void RemoveKills(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E,
+ const LiveRegUnits &DontKill,
+ const MCRegisterInfo &MCRI) {
+ for ( ; I != E; ++I)
+ RemoveKills(*I, DontKill, MCRI);
+}
+
/// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
///
bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
@@ -1049,21 +1048,27 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
// Initialize liveins to the first BB. These are potentially redefined by
// predicated instructions.
- SmallSet<unsigned, 4> Redefs;
- InitPredRedefs(CvtBBI->BB, Redefs, TRI);
- InitPredRedefs(NextBBI->BB, Redefs, TRI);
+ Redefs.init(TRI);
+ Redefs.addLiveIns(CvtBBI->BB, *TRI);
+ Redefs.addLiveIns(NextBBI->BB, *TRI);
+
+ // Compute a set of registers which must not be killed by instructions in
+ // BB1: This is everything live-in to BB2.
+ DontKill.init(TRI);
+ DontKill.addLiveIns(NextBBI->BB, *TRI);
if (CvtBBI->BB->pred_size() > 1) {
BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
- CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs);
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
// RemoveExtraEdges won't work if the block has an unanalyzable branch, so
// explicitly remove CvtBBI as a successor.
BBI.BB->removeSuccessor(CvtBBI->BB);
} else {
- PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+ RemoveKills(CvtBBI->BB->begin(), CvtBBI->BB->end(), DontKill, *TRI);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
// Merge converted block into entry block.
BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1148,16 +1153,18 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
// Initialize liveins to the first BB. These are potentially redefined by
// predicated instructions.
- SmallSet<unsigned, 4> Redefs;
- InitPredRedefs(CvtBBI->BB, Redefs, TRI);
- InitPredRedefs(NextBBI->BB, Redefs, TRI);
+ Redefs.init(TRI);
+ Redefs.addLiveIns(CvtBBI->BB, *TRI);
+ Redefs.addLiveIns(NextBBI->BB, *TRI);
+
+ DontKill.clear();
bool HasEarlyExit = CvtBBI->FalseBB != NULL;
if (CvtBBI->BB->pred_size() > 1) {
BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
- CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs, true);
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
// RemoveExtraEdges won't work if the block has an unanalyzable branch, so
// explicitly remove CvtBBI as a successor.
@@ -1165,7 +1172,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
} else {
// Predicate the 'true' block after removing its branch.
CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
- PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
// Now merge the entry of the triangle with the true block.
BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1276,8 +1283,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
// Initialize liveins to the first BB. These are potentially redefined by
// predicated instructions.
- SmallSet<unsigned, 4> Redefs;
- InitPredRedefs(BBI1->BB, Redefs, TRI);
+ Redefs.init(TRI);
+ Redefs.addLiveIns(BBI1->BB, *TRI);
// Remove the duplicated instructions at the beginnings of both paths.
MachineBasicBlock::iterator DI1 = BBI1->BB->begin();
@@ -1304,7 +1311,19 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
--NumDups1;
}
- UpdatePredRedefs(BBI1->BB->begin(), DI1, Redefs, TRI);
+ // Compute a set of registers which must not be killed by instructions in BB1:
+ // This is everything used+live in BB2 after the duplicated instructions. We
+ // can compute this set by simulating liveness backwards from the end of BB2.
+ DontKill.init(TRI);
+ for (MachineBasicBlock::reverse_iterator I = BBI2->BB->rbegin(),
+ E = MachineBasicBlock::reverse_iterator(DI2); I != E; ++I) {
+ DontKill.stepBackward(*I, *TRI);
+ }
+
+ for (MachineBasicBlock::const_iterator I = BBI1->BB->begin(), E = DI1; I != E;
+ ++I) {
+ Redefs.stepForward(*I, *TRI);
+ }
BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
BBI2->BB->erase(BBI2->BB->begin(), DI2);
@@ -1322,6 +1341,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
}
BBI1->BB->erase(DI1, BBI1->BB->end());
+ // Kill flags in the true block for registers living into the false block
+ // must be removed.
+ RemoveKills(BBI1->BB->begin(), BBI1->BB->end(), DontKill, *TRI);
+
// Remove 'false' block branch and find the last instruction to predicate.
BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
DI2 = BBI2->BB->end();
@@ -1380,10 +1403,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
}
// Predicate the 'true' block.
- PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, Redefs, &RedefsByFalse);
+ PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, &RedefsByFalse);
// Predicate the 'false' block.
- PredicateBlock(*BBI2, DI2, *Cond2, Redefs);
+ PredicateBlock(*BBI2, DI2, *Cond2);
// Merge the true block into the entry of the diamond.
MergeBlocks(BBI, *BBI1, TailBB == 0);
@@ -1458,7 +1481,6 @@ static bool MaySpeculate(const MachineInstr *MI,
void IfConverter::PredicateBlock(BBInfo &BBI,
MachineBasicBlock::iterator E,
SmallVectorImpl<MachineOperand> &Cond,
- SmallSet<unsigned, 4> &Redefs,
SmallSet<unsigned, 4> *LaterRedefs) {
bool AnyUnpred = false;
bool MaySpec = LaterRedefs != 0;
@@ -1484,7 +1506,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
// If the predicated instruction now redefines a register as the result of
// if-conversion, add an implicit kill.
- UpdatePredRedefs(I, Redefs, TRI, true);
+ UpdatePredRedefs(I, Redefs, TRI);
}
std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate));
@@ -1501,7 +1523,6 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
/// the destination block. Skip end of block branches if IgnoreBr is true.
void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
SmallVectorImpl<MachineOperand> &Cond,
- SmallSet<unsigned, 4> &Redefs,
bool IgnoreBr) {
MachineFunction &MF = *ToBBI.BB->getParent();
@@ -1514,8 +1535,8 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
MachineInstr *MI = MF.CloneMachineInstr(I);
ToBBI.BB->insert(ToBBI.BB->end(), MI);
ToBBI.NonPredSize++;
- unsigned ExtraPredCost = 0;
- unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost);
+ unsigned ExtraPredCost = TII->getPredicationCost(&*I);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
if (NumCycles > 1)
ToBBI.ExtraCost += NumCycles-1;
ToBBI.ExtraCost2 += ExtraPredCost;
@@ -1531,7 +1552,11 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
// If the predicated instruction now redefines a register as the result of
// if-conversion, add an implicit kill.
- UpdatePredRedefs(MI, Redefs, TRI, true);
+ UpdatePredRedefs(MI, Redefs, TRI);
+
+ // Some kill flags may not be correct anymore.
+ if (!DontKill.empty())
+ RemoveKills(*MI, DontKill, *TRI);
}
if (!IgnoreBr) {
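// Editor's note, not part of the patch: a condensed sketch of the liveness
// bookkeeping the rewritten if-converter performs, using the same
// LiveRegUnits and MIBundleOperands calls as the hunks above. The function
// and block names are illustrative, not part of the IfConverter interface.
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

static void trackPredicationLiveness(MachineBasicBlock *TrueBB,
                                     MachineBasicBlock *FalseBB,
                                     const TargetRegisterInfo *TRI) {
  // Registers live into either converted block may be redefined by
  // predicated instructions; defs of such registers later get an implicit
  // undef use so they behave as read-modify-write.
  LiveRegUnits Redefs;
  Redefs.init(TRI);
  Redefs.addLiveIns(TrueBB, *TRI);
  Redefs.addLiveIns(FalseBB, *TRI);

  // Anything live into the false block must not be killed by the true block.
  LiveRegUnits DontKill;
  DontKill.init(TRI);
  DontKill.addLiveIns(FalseBB, *TRI);

  for (MachineBasicBlock::iterator I = TrueBB->begin(), E = TrueBB->end();
       I != E; ++I) {
    Redefs.stepForward(*I, *TRI);  // advance liveness across each instruction
    // Drop kill flags on registers the false block still needs.
    for (MIBundleOperands O(&*I); O.isValid(); ++O)
      if (O->isReg() && O->isKill() && DontKill.contains(O->getReg(), *TRI))
        O->setIsKill(false);
  }
}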
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 8910652..bb0e642 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -179,10 +179,8 @@ private:
bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> >,
MachineInstr *LoadMI = 0);
- void insertReload(LiveInterval &NewLI, SlotIndex,
- MachineBasicBlock::iterator MI);
- void insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
- SlotIndex, MachineBasicBlock::iterator MI);
+ void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI);
+ void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI);
void spillAroundUses(unsigned Reg);
void spillAll();
@@ -580,7 +578,7 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
if (isSibling(SrcReg)) {
LiveInterval &SrcLI = LIS.getInterval(SrcReg);
- LiveRangeQuery SrcQ(SrcLI, VNI->def);
+ LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
assert(SrcQ.valueIn() && "Copy from non-existing value");
// Check if this COPY kills its source.
SVI->second.KillsSource = SrcQ.isKill();
@@ -885,12 +883,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
}
// Allocate a new register for the remat.
- LiveInterval &NewLI = Edit->createFrom(Original);
- NewLI.markNotSpillable();
+ unsigned NewVReg = Edit->createFrom(Original);
// Finally we can rematerialize OrigMI before MI.
- SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewLI.reg, RM,
+ SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewVReg, RM,
TRI);
+ (void)DefIdx;
DEBUG(dbgs() << "\tremat: " << DefIdx << '\t'
<< *LIS.getInstructionFromIndex(DefIdx));
@@ -898,15 +896,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(Ops[i].second);
if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) {
- MO.setReg(NewLI.reg);
+ MO.setReg(NewVReg);
MO.setIsKill();
}
}
- DEBUG(dbgs() << "\t " << UseIdx << '\t' << *MI);
+ DEBUG(dbgs() << "\t " << UseIdx << '\t' << *MI << '\n');
- VNInfo *DefVNI = NewLI.getNextValue(DefIdx, LIS.getVNInfoAllocator());
- NewLI.addRange(LiveRange(DefIdx, UseIdx.getRegSlot(), DefVNI));
- DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
++NumRemats;
return true;
}
@@ -1009,6 +1004,40 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
return true;
}
+#if !defined(NDEBUG)
+// Dump the range of instructions from B to E with their slot indexes.
+static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
+ MachineBasicBlock::iterator E,
+ LiveIntervals const &LIS,
+ const char *const header,
+ unsigned VReg = 0) {
+ char NextLine = '\n';
+ char SlotIndent = '\t';
+
+ if (llvm::next(B) == E) {
+ NextLine = ' ';
+ SlotIndent = ' ';
+ }
+
+ dbgs() << '\t' << header << ": " << NextLine;
+
+ for (MachineBasicBlock::iterator I = B; I != E; ++I) {
+ SlotIndex Idx = LIS.getInstructionIndex(I).getRegSlot();
+
+ // If a register was passed in and this instruction has it as a
+ // destination that is marked as an early clobber, print the
+ // early-clobber slot index.
+ if (VReg) {
+ MachineOperand *MO = I->findRegisterDefOperand(VReg);
+ if (MO && MO->isEarlyClobber())
+ Idx = Idx.getRegSlot(true);
+ }
+
+ dbgs() << SlotIndent << Idx << '\t' << *I;
+ }
+}
+#endif
+
/// foldMemoryOperand - Try folding stack slot references in Ops into their
/// instructions.
///
@@ -1028,6 +1057,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
bool WasCopy = MI->isCopy();
unsigned ImpReg = 0;
+ bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI->getOpcode() == TargetOpcode::STACKMAP);
+
// TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
// operands.
SmallVector<unsigned, 8> FoldOps;
@@ -1039,7 +1071,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
continue;
}
// FIXME: Teach targets to deal with subregs.
- if (MO.getSubReg())
+ if (!SpillSubRegs && MO.getSubReg())
return false;
// We cannot fold a load instruction into a def.
if (LoadMI && MO.isDef())
@@ -1049,6 +1081,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
FoldOps.push_back(Idx);
}
+ MachineInstrSpan MIS(MI);
+
MachineInstr *FoldMI =
LoadMI ? TII.foldMemoryOperand(MI, FoldOps, LoadMI)
: TII.foldMemoryOperand(MI, FoldOps, StackSlot);
@@ -1075,16 +1109,24 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
// FoldMI does not define this physreg. Remove the LI segment.
assert(MO->isDead() && "Cannot fold physreg def");
for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
- if (LiveInterval *LI = LIS.getCachedRegUnit(*Units)) {
+ if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
- if (VNInfo *VNI = LI->getVNInfoAt(Idx))
- LI->removeValNo(VNI);
+ if (VNInfo *VNI = LR->getVNInfoAt(Idx))
+ LR->removeValNo(VNI);
}
}
}
+
LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
MI->eraseFromParent();
+ // Insert any new instructions other than FoldMI into the LIS maps.
+ assert(!MIS.empty() && "Unexpected empty span of instructions!");
+ for (MachineBasicBlock::iterator MII = MIS.begin(), End = MIS.end();
+ MII != End; ++MII)
+ if (&*MII != FoldMI)
+ LIS.InsertMachineInstrInMaps(&*MII);
+
// TII.foldMemoryOperand may have left some implicit operands on the
// instruction. Strip them.
if (ImpReg)
@@ -1096,8 +1138,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
FoldMI->RemoveOperand(i - 1);
}
- DEBUG(dbgs() << "\tfolded: " << LIS.getInstructionIndex(FoldMI) << '\t'
- << *FoldMI);
+ DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
+ "folded"));
+
if (!WasCopy)
++NumFolded;
else if (Ops.front().second == 0)
@@ -1107,36 +1150,35 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
return true;
}
-/// insertReload - Insert a reload of NewLI.reg before MI.
-void InlineSpiller::insertReload(LiveInterval &NewLI,
+void InlineSpiller::insertReload(unsigned NewVReg,
SlotIndex Idx,
MachineBasicBlock::iterator MI) {
MachineBasicBlock &MBB = *MI->getParent();
- TII.loadRegFromStackSlot(MBB, MI, NewLI.reg, StackSlot,
- MRI.getRegClass(NewLI.reg), &TRI);
- --MI; // Point to load instruction.
- SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
- // Some (out-of-tree) targets have EC reload instructions.
- if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg))
- if (MO->isEarlyClobber())
- LoadIdx = LoadIdx.getRegSlot(true);
- DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI);
- VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator());
- NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+
+ MachineInstrSpan MIS(MI);
+ TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI);
+
+ LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
+
+ DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
+ NewVReg));
++NumReloads;
}
-/// insertSpill - Insert a spill of NewLI.reg after MI.
-void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
- SlotIndex Idx, MachineBasicBlock::iterator MI) {
+/// insertSpill - Insert a spill of NewVReg after MI.
+void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
+ MachineBasicBlock::iterator MI) {
MachineBasicBlock &MBB = *MI->getParent();
- TII.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, StackSlot,
- MRI.getRegClass(NewLI.reg), &TRI);
- --MI; // Point to store instruction.
- SlotIndex StoreIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
- DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
- VNInfo *StoreVNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
- NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+
+ MachineInstrSpan MIS(MI);
+ TII.storeRegToStackSlot(MBB, llvm::next(MI), NewVReg, isKill, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI);
+
+ LIS.InsertMachineInstrRangeInMaps(llvm::next(MI), MIS.end());
+
+ DEBUG(dumpMachineInstrRangeWithSlotIndex(llvm::next(MI), MIS.end(), LIS,
+ "spill"));
++NumSpills;
}
@@ -1152,7 +1194,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
// Debug values are not allowed to affect codegen.
if (MI->isDebugValue()) {
// Modify DBG_VALUE now that the value is in a spill slot.
- bool IsIndirect = MI->getOperand(1).isImm();
+ bool IsIndirect = MI->isIndirectDebugValue();
uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
const MDNode *MDPtr = MI->getOperand(2).getMetadata();
DebugLoc DL = MI->getDebugLoc();
@@ -1212,19 +1254,18 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
if (foldMemoryOperand(Ops))
continue;
- // Allocate interval around instruction.
+ // Create a new virtual register for spill/fill.
// FIXME: Infer regclass from instruction alone.
- LiveInterval &NewLI = Edit->createFrom(Reg);
- NewLI.markNotSpillable();
+ unsigned NewVReg = Edit->createFrom(Reg);
if (RI.Reads)
- insertReload(NewLI, Idx, MI);
+ insertReload(NewVReg, Idx, MI);
// Rewrite instruction operands.
bool hasLiveDef = false;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second);
- MO.setReg(NewLI.reg);
+ MO.setReg(NewVReg);
if (MO.isUse()) {
if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second))
MO.setIsKill();
@@ -1233,21 +1274,12 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
hasLiveDef = true;
}
}
- DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI);
+ DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
// FIXME: Use a second vreg if instruction has no tied ops.
- if (RI.Writes) {
+ if (RI.Writes)
if (hasLiveDef)
- insertSpill(NewLI, OldLI, Idx, MI);
- else {
- // This instruction defines a dead value. We don't need to spill it,
- // but do create a live range for the dead value.
- VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
- NewLI.addRange(LiveRange(Idx, Idx.getDeadSlot(), VNI));
- }
- }
-
- DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+ insertSpill(NewVReg, true, MI);
}
}
@@ -1266,8 +1298,8 @@ void InlineSpiller::spillAll() {
assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i)
- StackInt->MergeRangesInAsValue(LIS.getInterval(RegsToSpill[i]),
- StackInt->getValNumInfo(0));
+ StackInt->MergeSegmentsInAsValue(LIS.getInterval(RegsToSpill[i]),
+ StackInt->getValNumInfo(0));
DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
// Spill around uses of all RegsToSpill.
@@ -1308,8 +1340,8 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
DEBUG(dbgs() << "Inline spilling "
<< MRI.getRegClass(edit.getReg())->getName()
- << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent()
- << "\nFrom original " << LIS.getInterval(Original) << '\n');
+ << ':' << edit.getParent()
+ << "\nFrom original " << PrintReg(Original) << '\n');
assert(edit.getParent().isSpillable() &&
"Attempting to spill already spilled value.");
assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
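// Editor's note, not part of the patch: the MachineInstrSpan idiom used by
// insertReload()/insertSpill() and foldMemoryOperand() above, restated in
// isolation. The span captures whatever instructions a target hook inserts
// before MI so they can all be fed to the slot-index maps; the helper name
// and parameter list are illustrative.
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

static void reloadBefore(MachineBasicBlock::iterator MI, unsigned VReg,
                         int StackSlot, const TargetInstrInfo &TII,
                         const TargetRegisterInfo &TRI,
                         MachineRegisterInfo &MRI, LiveIntervals &LIS) {
  MachineBasicBlock &MBB = *MI->getParent();
  MachineInstrSpan MIS(MI);  // remembers the neighborhood around MI
  // The target may expand this into several instructions; the span sees them.
  TII.loadRegFromStackSlot(MBB, MI, VReg, StackSlot,
                           MRI.getRegClass(VReg), &TRI);
  // Everything now in [MIS.begin(), MI) is newly created; give it slot indexes.
  LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
}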
diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp
index a8e711e..427225d 100644
--- a/lib/CodeGen/InterferenceCache.cpp
+++ b/lib/CodeGen/InterferenceCache.cpp
@@ -204,11 +204,11 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
// Fixed interference.
for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
LiveInterval::iterator &I = RegUnits[i].FixedI;
- LiveInterval *LI = RegUnits[i].Fixed;
- if (I == LI->end() || I->start >= Stop)
+ LiveRange *LR = RegUnits[i].Fixed;
+ if (I == LR->end() || I->start >= Stop)
continue;
- I = LI->advanceTo(I, Stop);
- bool Backup = I == LI->end() || I->start >= Stop;
+ I = LR->advanceTo(I, Stop);
+ bool Backup = I == LR->end() || I->start >= Stop;
if (Backup)
--I;
SlotIndex StopI = I->end;
diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index c02fb9a..800f705 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h
@@ -72,7 +72,7 @@ class InterferenceCache {
unsigned VirtTag;
/// Fixed interference in RegUnit.
- LiveInterval *Fixed;
+ LiveRange *Fixed;
/// Iterator pointing into the fixed RegUnit interference.
LiveInterval::iterator FixedI;
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index d894f66..c38d4fb 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -485,11 +485,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break;
}
case Intrinsic::memset: {
- Type *IntPtr = TD.getIntPtrType(Context);
+ Value *Op0 = CI->getArgOperand(0);
+ Type *IntPtr = TD.getIntPtrType(Op0->getType());
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
- Ops[0] = CI->getArgOperand(0);
+ Ops[0] = Op0;
// Extend the amount to i32.
Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1),
Type::getInt32Ty(Context),
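// Editor's note, not part of the patch: the memset change above matters
// because pointer width can differ per address space, so the size cast must
// use the integer type matching the destination pointer's type rather than
// the default address-space-0 width. A minimal sketch of the DataLayout
// query; the helper name is illustrative.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static Type *intPtrTypeForDest(const DataLayout &TD, Value *Dest) {
  // With a layout such as "p:64:64-p1:32:32", a plain i8* yields i64 here
  // while an addrspace(1) pointer yields i32; getIntPtrType(Context) would
  // always return the address-space-0 width.
  return TD.getIntPtrType(Dest->getType());
}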
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 6c9b2e5..ad2c553 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -175,12 +175,11 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
// Create a code emitter if asked to show the encoding.
MCCodeEmitter *MCE = 0;
- MCAsmBackend *MAB = 0;
- if (ShowMCEncoding) {
+ if (ShowMCEncoding)
MCE = getTarget().createMCCodeEmitter(MII, MRI, STI, *Context);
- MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
- }
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
+ TargetCPU);
MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
getVerboseAsm(),
hasMCUseLoc(),
@@ -197,7 +196,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
// emission fails.
MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, STI,
*Context);
- MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(),
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
TargetCPU);
if (MCE == 0 || MAB == 0)
return true;
@@ -232,7 +231,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
/// get machine code emitted. This uses a JITCodeEmitter object to handle
/// actually outputting the machine code and resolving things like the address
-/// of functions. This method should returns true if machine code emission is
+/// of functions. This method should return true if machine code emission is
/// not supported.
///
bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
@@ -271,7 +270,8 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
STI, *Ctx);
- MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
+ TargetCPU);
if (MCE == 0 || MAB == 0)
return true;
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 85bed46..25645e0 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -131,7 +131,8 @@ class UserValue {
/// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
/// is live. Returns true if any changes were made.
- bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+ bool splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS);
public:
/// UserValue - Create a new UserValue.
@@ -219,13 +220,13 @@ public:
/// End points where VNI is no longer live are added to Kills.
/// @param Idx Starting point for the definition.
/// @param LocNo Location number to propagate.
- /// @param LI Restrict liveness to where LI has the value VNI. May be null.
- /// @param VNI When LI is not null, this is the value to restrict to.
+ /// @param LR Restrict liveness to where LR has the value VNI. May be null.
+ /// @param VNI When LR is not null, this is the value to restrict to.
/// @param Kills Append end points of VNI's live range to Kills.
/// @param LIS Live intervals analysis.
/// @param MDT Dominator tree.
void extendDef(SlotIndex Idx, unsigned LocNo,
- LiveInterval *LI, const VNInfo *VNI,
+ LiveRange *LR, const VNInfo *VNI,
SmallVectorImpl<SlotIndex> *Kills,
LiveIntervals &LIS, MachineDominatorTree &MDT,
UserValueScopes &UVS);
@@ -251,7 +252,8 @@ public:
/// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
/// live. Returns true if any changes were made.
- bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+ bool splitRegister(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS);
/// rewriteLocations - Rewrite virtual register locations according to the
/// provided virtual register map.
@@ -345,7 +347,7 @@ public:
void mapVirtReg(unsigned VirtReg, UserValue *EC);
/// splitRegister - Replace all references to OldReg with NewRegs.
- void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+ void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
/// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
void emitDebugValues(VirtRegMap *VRM);
@@ -455,9 +457,10 @@ bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) {
}
// Get or create the UserValue for (variable,offset).
- bool IsIndirect = MI->getOperand(1).isImm();
+ bool IsIndirect = MI->isIndirectDebugValue();
unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
const MDNode *Var = MI->getOperand(2).getMetadata();
UserValue *UV = getUserValue(Var, Offset, IsIndirect, MI->getDebugLoc());
UV->addDef(Idx, MI->getOperand(0));
return true;
@@ -492,7 +495,7 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
}
void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
- LiveInterval *LI, const VNInfo *VNI,
+ LiveRange *LR, const VNInfo *VNI,
SmallVectorImpl<SlotIndex> *Kills,
LiveIntervals &LIS, MachineDominatorTree &MDT,
UserValueScopes &UVS) {
@@ -506,15 +509,15 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
// Limit to VNI's live range.
bool ToEnd = true;
- if (LI && VNI) {
- LiveRange *Range = LI->getLiveRangeContaining(Start);
- if (!Range || Range->valno != VNI) {
+ if (LR && VNI) {
+ LiveInterval::Segment *Segment = LR->getSegmentContaining(Start);
+ if (!Segment || Segment->valno != VNI) {
if (Kills)
Kills->push_back(Start);
continue;
}
- if (Range->end < Stop)
- Stop = Range->end, ToEnd = false;
+ if (Segment->end < Stop)
+ Stop = Segment->end, ToEnd = false;
}
// There could already be a short def at Start.
@@ -666,10 +669,10 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
// For physregs, use the live range of the first regunit as a guide.
unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI);
- LiveInterval *LI = &LIS.getRegUnit(Unit);
- const VNInfo *VNI = LI->getVNInfoAt(Idx);
+ LiveRange *LR = &LIS.getRegUnit(Unit);
+ const VNInfo *VNI = LR->getVNInfoAt(Idx);
// Don't track copies from physregs; it is too expensive.
- extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS);
+ extendDef(Idx, LocNo, LR, VNI, 0, LIS, MDT, UVS);
}
// Finally, erase all the undefs.
@@ -729,7 +732,8 @@ LiveDebugVariables::~LiveDebugVariables() {
//===----------------------------------------------------------------------===//
bool
-UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ LiveIntervals& LIS) {
DEBUG({
dbgs() << "Splitting Loc" << OldLocNo << '\t';
print(dbgs(), 0);
@@ -738,7 +742,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
LocMap::iterator LocMapI;
LocMapI.setMap(locInts);
for (unsigned i = 0; i != NewRegs.size(); ++i) {
- LiveInterval *LI = NewRegs[i];
+ LiveInterval *LI = &LIS.getInterval(NewRegs[i]);
if (LI->empty())
continue;
@@ -827,7 +831,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
}
bool
-UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS) {
bool DidChange = false;
// Split locations referring to OldReg. Iterate backwards so splitLocation can
// safely erase unused locations.
@@ -836,15 +841,15 @@ UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
const MachineOperand *Loc = &locations[LocNo];
if (!Loc->isReg() || Loc->getReg() != OldReg)
continue;
- DidChange |= splitLocation(LocNo, NewRegs);
+ DidChange |= splitLocation(LocNo, NewRegs, LIS);
}
return DidChange;
}
-void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
bool DidChange = false;
for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
- DidChange |= UV->splitRegister(OldReg, NewRegs);
+ DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
if (!DidChange)
return;
@@ -852,11 +857,11 @@ void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
// Map all of the new virtual registers.
UserValue *UV = lookupVirtReg(OldReg);
for (unsigned i = 0; i != NewRegs.size(); ++i)
- mapVirtReg(NewRegs[i]->reg, UV);
+ mapVirtReg(NewRegs[i], UV);
}
void LiveDebugVariables::
-splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs, LiveIntervals &LIS) {
if (pImpl)
static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
}
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 3ce3c39..58a3f0f 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -27,6 +27,7 @@
namespace llvm {
class LiveInterval;
+class LiveIntervals;
class VirtRegMap;
class LiveDebugVariables : public MachineFunctionPass {
@@ -47,7 +48,8 @@ public:
/// splitRegister - Move any user variables in OldReg to the live ranges in
/// NewRegs where they are live. Mark the values as unavailable where no new
/// register is live.
- void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+ void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS);
/// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
/// that happened during register allocation.
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 6be6bf3..2b8feb8 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -9,12 +9,12 @@
//
// This file implements the LiveRange and LiveInterval classes. Given some
// numbering of each the machine instructions an interval [i, j) is said to be a
-// live interval for register v if there is no instruction with number j' > j
+// live range for register v if there is no instruction with number j' >= j
// such that v is live at j' and there is no instruction with number i' < i such
-// that v is live at i'. In this implementation intervals can have holes,
-// i.e. an interval might look like [1,20), [50,65), [1000,1001). Each
-// individual range is represented as an instance of LiveRange, and the whole
-// interval is represented as an instance of LiveInterval.
+// that v is live at i'. In this implementation ranges can have holes,
+// i.e. a range might look like [1,20), [50,65), [1000,1001). Each
+// individual segment is represented as an instance of LiveRange::Segment,
+// and the whole range is represented as an instance of LiveRange.
//
//===----------------------------------------------------------------------===//
@@ -31,14 +31,14 @@
#include <algorithm>
using namespace llvm;
-LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
+LiveRange::iterator LiveRange::find(SlotIndex Pos) {
// This algorithm is basically std::upper_bound.
// Unfortunately, std::upper_bound cannot be used with mixed types until we
// adopt C++0x. Many libraries can do it, but not all.
if (empty() || Pos >= endIndex())
return end();
iterator I = begin();
- size_t Len = ranges.size();
+ size_t Len = size();
do {
size_t Mid = Len >> 1;
if (Pos < I[Mid].end)
@@ -49,13 +49,13 @@ LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
return I;
}
-VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
- VNInfo::Allocator &VNInfoAllocator) {
+VNInfo *LiveRange::createDeadDef(SlotIndex Def,
+ VNInfo::Allocator &VNInfoAllocator) {
assert(!Def.isDead() && "Cannot define a value at the dead slot");
iterator I = find(Def);
if (I == end()) {
VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
- ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI));
+ segments.push_back(Segment(Def, Def.getDeadSlot(), VNI));
return VNI;
}
if (SlotIndex::isSameInstr(Def, I->start)) {
@@ -73,11 +73,11 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
}
assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
- ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI));
+ segments.insert(I, Segment(Def, Def.getDeadSlot(), VNI));
return VNI;
}
-// overlaps - Return true if the intersection of the two live intervals is
+// overlaps - Return true if the intersection of the two live ranges is
// not empty.
//
// An example for overlaps():
@@ -86,7 +86,7 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
// 4: B = ...
// 8: C = A + B ;; last use of A
//
-// The live intervals should look like:
+// The live ranges should look like:
//
// A = [3, 11)
// B = [7, x)
@@ -95,9 +95,9 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
// A->overlaps(C) should return false since we want to be able to join
// A and C.
//
-bool LiveInterval::overlapsFrom(const LiveInterval& other,
- const_iterator StartPos) const {
- assert(!empty() && "empty interval");
+bool LiveRange::overlapsFrom(const LiveRange& other,
+ const_iterator StartPos) const {
+ assert(!empty() && "empty range");
const_iterator i = begin();
const_iterator ie = end();
const_iterator j = StartPos;
@@ -108,13 +108,13 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
if (i->start < j->start) {
i = std::upper_bound(i, ie, j->start);
- if (i != ranges.begin()) --i;
+ if (i != begin()) --i;
} else if (j->start < i->start) {
++StartPos;
if (StartPos != other.end() && StartPos->start <= i->start) {
assert(StartPos < other.end() && i < end());
j = std::upper_bound(j, je, i->start);
- if (j != other.ranges.begin()) --j;
+ if (j != other.begin()) --j;
}
} else {
return true;
@@ -136,10 +136,9 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
return false;
}
-bool LiveInterval::overlaps(const LiveInterval &Other,
- const CoalescerPair &CP,
- const SlotIndexes &Indexes) const {
- assert(!empty() && "empty interval");
+bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP,
+ const SlotIndexes &Indexes) const {
+ assert(!empty() && "empty range");
if (Other.empty())
return false;
@@ -178,9 +177,9 @@ bool LiveInterval::overlaps(const LiveInterval &Other,
}
}
-/// overlaps - Return true if the live interval overlaps a range specified
+/// overlaps - Return true if the live range overlaps an interval specified
/// by [Start, End).
-bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
+bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
assert(Start < End && "Invalid range");
const_iterator I = std::lower_bound(begin(), end(), End);
return I != begin() && (--I)->end > Start;
@@ -190,7 +189,7 @@ bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
/// ValNo is dead, remove it. If it is the largest value number, just nuke it
/// (and any other deleted values neighboring it), otherwise mark it as ~1U so
/// it can be nuked later.
-void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
+void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
if (ValNo->id == getNumValNums()-1) {
do {
valnos.pop_back();
@@ -202,137 +201,135 @@ void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
/// RenumberValues - Renumber all values in order of appearance and delete the
/// remaining unused values.
-void LiveInterval::RenumberValues(LiveIntervals &lis) {
+void LiveRange::RenumberValues() {
SmallPtrSet<VNInfo*, 8> Seen;
valnos.clear();
for (const_iterator I = begin(), E = end(); I != E; ++I) {
VNInfo *VNI = I->valno;
if (!Seen.insert(VNI))
continue;
- assert(!VNI->isUnused() && "Unused valno used by live range");
+ assert(!VNI->isUnused() && "Unused valno used by live segment");
VNI->id = (unsigned)valnos.size();
valnos.push_back(VNI);
}
}
-/// extendIntervalEndTo - This method is used when we want to extend the range
-/// specified by I to end at the specified endpoint. To do this, we should
-/// merge and eliminate all ranges that this will overlap with. The iterator is
-/// not invalidated.
-void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) {
- assert(I != ranges.end() && "Not a valid interval!");
+/// This method is used when we want to extend the segment specified by I to end
+/// at the specified endpoint. To do this, we should merge and eliminate all
+/// segments that this will overlap with. The iterator is not invalidated.
+void LiveRange::extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+ assert(I != end() && "Not a valid segment!");
VNInfo *ValNo = I->valno;
- // Search for the first interval that we can't merge with.
- Ranges::iterator MergeTo = llvm::next(I);
- for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) {
+ // Search for the first segment that we can't merge with.
+ iterator MergeTo = llvm::next(I);
+ for (; MergeTo != end() && NewEnd >= MergeTo->end; ++MergeTo) {
assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
}
- // If NewEnd was in the middle of an interval, make sure to get its endpoint.
+ // If NewEnd was in the middle of a segment, make sure to get its endpoint.
I->end = std::max(NewEnd, prior(MergeTo)->end);
- // If the newly formed range now touches the range after it and if they have
- // the same value number, merge the two ranges into one range.
- if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+ // If the newly formed segment now touches the segment after it and if they
+ // have the same value number, merge the two segments into one segment.
+ if (MergeTo != end() && MergeTo->start <= I->end &&
MergeTo->valno == ValNo) {
I->end = MergeTo->end;
++MergeTo;
}
- // Erase any dead ranges.
- ranges.erase(llvm::next(I), MergeTo);
+ // Erase any dead segments.
+ segments.erase(llvm::next(I), MergeTo);
}
-/// extendIntervalStartTo - This method is used when we want to extend the range
-/// specified by I to start at the specified endpoint. To do this, we should
-/// merge and eliminate all ranges that this will overlap with.
-LiveInterval::Ranges::iterator
-LiveInterval::extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStart) {
- assert(I != ranges.end() && "Not a valid interval!");
+/// This method is used when we want to extend the segment specified by I to
+/// start at the specified endpoint. To do this, we should merge and eliminate
+/// all segments that this will overlap with.
+LiveRange::iterator
+LiveRange::extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+ assert(I != end() && "Not a valid segment!");
VNInfo *ValNo = I->valno;
- // Search for the first interval that we can't merge with.
- Ranges::iterator MergeTo = I;
+ // Search for the first segment that we can't merge with.
+ iterator MergeTo = I;
do {
- if (MergeTo == ranges.begin()) {
+ if (MergeTo == begin()) {
I->start = NewStart;
- ranges.erase(MergeTo, I);
+ segments.erase(MergeTo, I);
return I;
}
assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
--MergeTo;
} while (NewStart <= MergeTo->start);
- // If we start in the middle of another interval, just delete a range and
- // extend that interval.
+ // If we start in the middle of another segment, just delete a range and
+ // extend that segment.
if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
MergeTo->end = I->end;
} else {
- // Otherwise, extend the interval right after.
+ // Otherwise, extend the segment right after.
++MergeTo;
MergeTo->start = NewStart;
MergeTo->end = I->end;
}
- ranges.erase(llvm::next(MergeTo), llvm::next(I));
+ segments.erase(llvm::next(MergeTo), llvm::next(I));
return MergeTo;
}
-LiveInterval::iterator
-LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
- SlotIndex Start = LR.start, End = LR.end;
- iterator it = std::upper_bound(From, ranges.end(), Start);
+LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) {
+ SlotIndex Start = S.start, End = S.end;
+ iterator it = std::upper_bound(From, end(), Start);
- // If the inserted interval starts in the middle or right at the end of
- // another interval, just extend that interval to contain the range of LR.
- if (it != ranges.begin()) {
+ // If the inserted segment starts in the middle or right at the end of
+ // another segment, just extend that segment to contain the segment of S.
+ if (it != begin()) {
iterator B = prior(it);
- if (LR.valno == B->valno) {
+ if (S.valno == B->valno) {
if (B->start <= Start && B->end >= Start) {
- extendIntervalEndTo(B, End);
+ extendSegmentEndTo(B, End);
return B;
}
} else {
- // Check to make sure that we are not overlapping two live ranges with
+ // Check to make sure that we are not overlapping two live segments with
// different valno's.
assert(B->end <= Start &&
- "Cannot overlap two LiveRanges with differing ValID's"
+ "Cannot overlap two segments with differing ValID's"
" (did you def the same reg twice in a MachineInstr?)");
}
}
- // Otherwise, if this range ends in the middle of, or right next to, another
- // interval, merge it into that interval.
- if (it != ranges.end()) {
- if (LR.valno == it->valno) {
+ // Otherwise, if this segment ends in the middle of, or right next to, another
+ // segment, merge it into that segment.
+ if (it != end()) {
+ if (S.valno == it->valno) {
if (it->start <= End) {
- it = extendIntervalStartTo(it, Start);
+ it = extendSegmentStartTo(it, Start);
- // If LR is a complete superset of an interval, we may need to grow its
+ // If S is a complete superset of a segment, we may need to grow its
// endpoint as well.
if (End > it->end)
- extendIntervalEndTo(it, End);
+ extendSegmentEndTo(it, End);
return it;
}
} else {
- // Check to make sure that we are not overlapping two live ranges with
+ // Check to make sure that we are not overlapping two live segments with
// different valno's.
assert(it->start >= End &&
- "Cannot overlap two LiveRanges with differing ValID's");
+ "Cannot overlap two segments with differing ValID's");
}
}
- // Otherwise, this is just a new range that doesn't interact with anything.
+ // Otherwise, this is just a new segment that doesn't interact with anything.
// Insert it.
- return ranges.insert(it, LR);
+ return segments.insert(it, S);
}
-/// extendInBlock - If this interval is live before Kill in the basic
+/// extendInBlock - If this range is live before Kill in the basic
/// block that starts at StartIdx, extend it to be live up to Kill and return
/// the value. If there is no live range before Kill, return NULL.
-VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
+VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
if (empty())
return 0;
iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
@@ -342,20 +339,21 @@ VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
if (I->end <= StartIdx)
return 0;
if (I->end < Kill)
- extendIntervalEndTo(I, Kill);
+ extendSegmentEndTo(I, Kill);
return I->valno;
}
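[Editorial sketch, not part of the patch] The caller pattern this API is built for appears later in this diff, in LiveIntervals::shrinkToUses: each use either extends an in-block definition via extendInBlock, or, when the value is live-in, gets an explicit segment. A minimal condensation of that pattern under the renamed LiveRange API above; the helper name is hypothetical.

#include "llvm/CodeGen/LiveInterval.h"  // assumed to declare LiveRange et al.
using namespace llvm;

// Make VNI's value live up to Idx inside the block that starts at BlockStart.
// extendInBlock() covers the case where the value is already defined earlier
// in the block; otherwise the value is live-in and we add a segment covering
// [BlockStart, Idx) explicitly.
static void extendValueInBlock(LiveRange &LR, SlotIndex BlockStart,
                               SlotIndex Idx, VNInfo *VNI) {
  if (VNInfo *ExtVNI = LR.extendInBlock(BlockStart, Idx)) {
    (void)ExtVNI;
    assert(ExtVNI == VNI && "Unexpected existing value number");
    return;
  }
  LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
}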
-/// removeRange - Remove the specified range from this interval. Note that
-/// the range must be in a single LiveRange in its entirety.
-void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
- bool RemoveDeadValNo) {
- // Find the LiveRange containing this span.
- Ranges::iterator I = find(Start);
- assert(I != ranges.end() && "Range is not in interval!");
- assert(I->containsRange(Start, End) && "Range is not entirely in interval!");
+/// Remove the specified segment from this range. Note that the segment must
+/// be in a single Segment in its entirety.
+void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
+ bool RemoveDeadValNo) {
+ // Find the Segment containing this span.
+ iterator I = find(Start);
+ assert(I != end() && "Segment is not in range!");
+ assert(I->containsInterval(Start, End)
+ && "Segment is not entirely in range!");
- // If the span we are removing is at the start of the LiveRange, adjust it.
+ // If the span we are removing is at the start of the Segment, adjust it.
VNInfo *ValNo = I->valno;
if (I->start == Start) {
if (I->end == End) {
@@ -373,54 +371,50 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
}
}
- ranges.erase(I); // Removed the whole LiveRange.
+ segments.erase(I); // Removed the whole Segment.
} else
I->start = End;
return;
}
- // Otherwise if the span we are removing is at the end of the LiveRange,
+ // Otherwise if the span we are removing is at the end of the Segment,
// adjust the other way.
if (I->end == End) {
I->end = Start;
return;
}
- // Otherwise, we are splitting the LiveRange into two pieces.
+ // Otherwise, we are splitting the Segment into two pieces.
SlotIndex OldEnd = I->end;
- I->end = Start; // Trim the old interval.
+ I->end = Start; // Trim the old segment.
// Insert the new one.
- ranges.insert(llvm::next(I), LiveRange(End, OldEnd, ValNo));
+ segments.insert(llvm::next(I), Segment(End, OldEnd, ValNo));
}
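A worked example may make the three branches above easier to follow (hypothetical slot indexes, written in the [start,end:valno-id) notation used by operator<< further down in this file):

// Starting from a range that holds the single segment [8,40:0):
//   removeSegment(8, 16)   leaves [16,40:0)           (trim at the start)
//   removeSegment(32, 40)  leaves [8,32:0)            (trim at the end)
//   removeSegment(16, 24)  leaves [8,16:0)[24,40:0)   (split into two)
// In every case [Start,End) must lie entirely inside one existing segment,
// as the assertions at the top of removeSegment enforce.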
-/// removeValNo - Remove all the ranges defined by the specified value#.
+/// removeValNo - Remove all the segments defined by the specified value#.
/// Also remove the value# from value# list.
-void LiveInterval::removeValNo(VNInfo *ValNo) {
+void LiveRange::removeValNo(VNInfo *ValNo) {
if (empty()) return;
- Ranges::iterator I = ranges.end();
- Ranges::iterator E = ranges.begin();
+ iterator I = end();
+ iterator E = begin();
do {
--I;
if (I->valno == ValNo)
- ranges.erase(I);
+ segments.erase(I);
} while (I != E);
// Now that ValNo is dead, remove it.
markValNoForDeletion(ValNo);
}
-/// join - Join two live intervals (this, and other) together. This applies
-/// mappings to the value numbers in the LHS/RHS intervals as specified. If
-/// the intervals are not joinable, this aborts.
-void LiveInterval::join(LiveInterval &Other,
- const int *LHSValNoAssignments,
- const int *RHSValNoAssignments,
- SmallVectorImpl<VNInfo *> &NewVNInfo,
- MachineRegisterInfo *MRI) {
+void LiveRange::join(LiveRange &Other,
+ const int *LHSValNoAssignments,
+ const int *RHSValNoAssignments,
+ SmallVectorImpl<VNInfo *> &NewVNInfo) {
verify();
- // Determine if any of our live range values are mapped. This is uncommon, so
- // we want to avoid the interval scan if not.
+ // Determine if any of our values are mapped. This is uncommon, so we want
+ // to avoid the range scan if not.
bool MustMapCurValNos = false;
unsigned NumVals = getNumValNums();
unsigned NumNewVals = NewVNInfo.size();
@@ -433,8 +427,7 @@ void LiveInterval::join(LiveInterval &Other,
}
}
- // If we have to apply a mapping to our base interval assignment, rewrite it
- // now.
+ // If we have to apply a mapping to our base range assignment, rewrite it now.
if (MustMapCurValNos && !empty()) {
// Map the first live range.
@@ -445,12 +438,12 @@ void LiveInterval::join(LiveInterval &Other,
assert(nextValNo != 0 && "Huh?");
// If this live range has the same value # as its immediate predecessor,
- // and if they are neighbors, remove one LiveRange. This happens when we
+ // and if they are neighbors, remove one Segment. This happens when we
// have [0,4:0)[4,7:1) and map 0/1 onto the same value #.
if (OutIt->valno == nextValNo && OutIt->end == I->start) {
OutIt->end = I->end;
} else {
- // Didn't merge. Move OutIt to the next interval,
+ // Didn't merge. Move OutIt to the next segment,
++OutIt;
OutIt->valno = nextValNo;
if (OutIt != I) {
@@ -459,9 +452,9 @@ void LiveInterval::join(LiveInterval &Other,
}
}
}
- // If we merge some live ranges, chop off the end.
+ // If we merge some segments, chop off the end.
++OutIt;
- ranges.erase(OutIt, end());
+ segments.erase(OutIt, end());
}
// Rewrite Other values before changing the VNInfo ids.
@@ -472,7 +465,7 @@ void LiveInterval::join(LiveInterval &Other,
I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
// Update val# info. Renumber them and make sure they all belong to this
- // LiveInterval now. Also remove dead val#'s.
+ // LiveRange now. Also remove dead val#'s.
unsigned NumValNos = 0;
for (unsigned i = 0; i < NumNewVals; ++i) {
VNInfo *VNI = NewVNInfo[i];
@@ -487,31 +480,31 @@ void LiveInterval::join(LiveInterval &Other,
if (NumNewVals < NumVals)
valnos.resize(NumNewVals); // shrinkify
- // Okay, now insert the RHS live ranges into the LHS.
+ // Okay, now insert the RHS live segments into the LHS.
LiveRangeUpdater Updater(this);
for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
Updater.add(*I);
}
-/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
-/// interval as the specified value number. The LiveRanges in RHS are
-/// allowed to overlap with LiveRanges in the current interval, but only if
-/// the overlapping LiveRanges have the specified value number.
-void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS,
- VNInfo *LHSValNo) {
+/// Merge all of the segments in RHS into this live range as the specified
+/// value number. The segments in RHS are allowed to overlap with segments in
+/// the current range, but only if the overlapping segments have the
+/// specified value number.
+void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
+ VNInfo *LHSValNo) {
LiveRangeUpdater Updater(this);
for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
Updater.add(I->start, I->end, LHSValNo);
}
-/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
-/// in RHS into this live interval as the specified value number.
-/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
-/// current interval, it will replace the value numbers of the overlaped
-/// live ranges with the specified value number.
-void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
- const VNInfo *RHSValNo,
- VNInfo *LHSValNo) {
+/// MergeValueInAsValue - Merge all of the live segments of a specific val#
+/// in RHS into this live range as the specified value number.
+/// The segments in RHS are allowed to overlap with segments in the
+/// current range; it will replace the value numbers of the overlapped
+/// segments with the specified value number.
+void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
+ const VNInfo *RHSValNo,
+ VNInfo *LHSValNo) {
LiveRangeUpdater Updater(this);
for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
if (I->valno == RHSValNo)
@@ -520,9 +513,9 @@ void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
/// MergeValueNumberInto - This method is called when two value numbers
/// are found to be equivalent. This eliminates V1, replacing all
-/// LiveRanges with the V1 value number with the V2 value number. This can
+/// segments with the V1 value number with the V2 value number. This can
/// cause merging of V1/V2 values numbers and compaction of the value space.
-VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
assert(V1 != V2 && "Identical value#'s are always equivalent!");
// This code actually merges the (numerically) larger value number into the
@@ -536,37 +529,37 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
std::swap(V1, V2);
}
- // Merge V1 live ranges into V2.
+ // Merge V1 segments into V2.
for (iterator I = begin(); I != end(); ) {
- iterator LR = I++;
- if (LR->valno != V1) continue; // Not a V1 LiveRange.
+ iterator S = I++;
+ if (S->valno != V1) continue; // Not a V1 Segment.
// Okay, we found a V1 live range. If it had a previous, touching, V2 live
// range, extend it.
- if (LR != begin()) {
- iterator Prev = LR-1;
- if (Prev->valno == V2 && Prev->end == LR->start) {
- Prev->end = LR->end;
+ if (S != begin()) {
+ iterator Prev = S-1;
+ if (Prev->valno == V2 && Prev->end == S->start) {
+ Prev->end = S->end;
// Erase this live-range.
- ranges.erase(LR);
+ segments.erase(S);
I = Prev+1;
- LR = Prev;
+ S = Prev;
}
}
// Okay, now we have a V1 or V2 live range that is maximally merged forward.
// Ensure that it is a V2 live-range.
- LR->valno = V2;
+ S->valno = V2;
- // If we can merge it into later V2 live ranges, do so now. We ignore any
- // following V1 live ranges, as they will be merged in subsequent iterations
+ // If we can merge it into later V2 segments, do so now. We ignore any
+ // following V1 segments, as they will be merged in subsequent iterations
// of the loop.
if (I != end()) {
- if (I->start == LR->end && I->valno == V2) {
- LR->end = I->end;
- ranges.erase(I);
- I = LR+1;
+ if (I->start == S->end && I->valno == V2) {
+ S->end = I->end;
+ segments.erase(I);
+ I = S+1;
}
}
}
@@ -584,22 +577,21 @@ unsigned LiveInterval::getSize() const {
return Sum;
}
-raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
- return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
+raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange::Segment &S) {
+ return os << '[' << S.start << ',' << S.end << ':' << S.valno->id << ")";
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveRange::dump() const {
+void LiveRange::Segment::dump() const {
dbgs() << *this << "\n";
}
#endif
-void LiveInterval::print(raw_ostream &OS) const {
+void LiveRange::print(raw_ostream &OS) const {
if (empty())
OS << "EMPTY";
else {
- for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
- E = ranges.end(); I != E; ++I) {
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
OS << *I;
assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
}
@@ -625,19 +617,29 @@ void LiveInterval::print(raw_ostream &OS) const {
}
}
+void LiveInterval::print(raw_ostream &OS) const {
+ OS << PrintReg(reg) << ' ';
+ super::print(OS);
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveRange::dump() const {
+ dbgs() << *this << "\n";
+}
+
void LiveInterval::dump() const {
dbgs() << *this << "\n";
}
#endif
#ifndef NDEBUG
-void LiveInterval::verify() const {
+void LiveRange::verify() const {
for (const_iterator I = begin(), E = end(); I != E; ++I) {
assert(I->start.isValid());
assert(I->end.isValid());
assert(I->start < I->end);
assert(I->valno != 0);
+ assert(I->valno->id < valnos.size());
assert(I->valno == valnos[I->valno->id]);
if (llvm::next(I) != E) {
assert(I->end <= llvm::next(I)->start);
@@ -649,10 +651,6 @@ void LiveInterval::verify() const {
#endif
-void LiveRange::print(raw_ostream &os) const {
- os << *this;
-}
-
//===----------------------------------------------------------------------===//
// LiveRangeUpdater class
//===----------------------------------------------------------------------===//
@@ -665,11 +663,11 @@ void LiveRange::print(raw_ostream &os) const {
//
// Otherwise, segments are kept in three separate areas:
//
-// 1. [begin; WriteI) at the front of LI.
-// 2. [ReadI; end) at the back of LI.
+// 1. [begin; WriteI) at the front of LR.
+// 2. [ReadI; end) at the back of LR.
// 3. Spills.
//
-// - LI.begin() <= WriteI <= ReadI <= LI.end().
+// - LR.begin() <= WriteI <= ReadI <= LR.end().
// - Segments in all three areas are fully ordered and coalesced.
// - Segments in area 1 precede and can't coalesce with segments in area 2.
// - Segments in Spills precede and can't coalesce with segments in area 2.
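[Editorial sketch, not part of the patch] A minimal usage example of LiveRangeUpdater, essentially the body of MergeSegmentsInAsValue above; like that function, it assumes the updater's destructor flushes pending segments, and it assumes the same header as the earlier sketch.

// Copy every segment of Src into Dst under one value number. Feeding the
// updater in order of increasing start index keeps it in its fast appending
// mode (add() only flushes when a start index moves backwards).
void copySegmentsAs(LiveRange &Dst, const LiveRange &Src, VNInfo *DstValNo) {
  LiveRangeUpdater Updater(&Dst);
  for (LiveRange::const_iterator I = Src.begin(), E = Src.end(); I != E; ++I)
    Updater.add(I->start, I->end, DstValNo);
}  // Updater is destroyed here and any pending segments land in Dst.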
@@ -684,23 +682,23 @@ void LiveRange::print(raw_ostream &os) const {
void LiveRangeUpdater::print(raw_ostream &OS) const {
if (!isDirty()) {
- if (LI)
- OS << "Clean " << PrintReg(LI->reg) << " updater: " << *LI << '\n';
+ if (LR)
+ OS << "Clean updater: " << *LR << '\n';
else
OS << "Null updater.\n";
return;
}
- assert(LI && "Can't have null LI in dirty updater.");
- OS << PrintReg(LI->reg) << " updater with gap = " << (ReadI - WriteI)
+ assert(LR && "Can't have null LR in dirty updater.");
+ OS << " updater with gap = " << (ReadI - WriteI)
<< ", last start = " << LastStart
<< ":\n Area 1:";
- for (LiveInterval::const_iterator I = LI->begin(); I != WriteI; ++I)
+ for (LiveRange::const_iterator I = LR->begin(); I != WriteI; ++I)
OS << ' ' << *I;
OS << "\n Spills:";
for (unsigned I = 0, E = Spills.size(); I != E; ++I)
OS << ' ' << Spills[I];
OS << "\n Area 2:";
- for (LiveInterval::const_iterator I = ReadI, E = LI->end(); I != E; ++I)
+ for (LiveRange::const_iterator I = ReadI, E = LR->end(); I != E; ++I)
OS << ' ' << *I;
OS << '\n';
}
@@ -711,8 +709,9 @@ void LiveRangeUpdater::dump() const
}
// Determine if A and B should be coalesced.
-static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
- assert(A.start <= B.start && "Unordered live ranges.");
+static inline bool coalescable(const LiveRange::Segment &A,
+ const LiveRange::Segment &B) {
+ assert(A.start <= B.start && "Unordered live segments.");
if (A.end == B.start)
return A.valno == B.valno;
if (A.end < B.start)
@@ -721,8 +720,8 @@ static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
return true;
}
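A few concrete outcomes, with hypothetical slot indexes in the [start,end:valno-id) notation:

//   coalescable([4,8:0), [8,12:0))   -> true   (touching, same value number)
//   coalescable([4,8:0), [8,12:1))   -> false  (touching, different values)
//   coalescable([4,8:0), [10,12:0))  -> false  (gap between the segments)
//   coalescable([4,8:0), [6,12:0))   -> true   (overlap falls through to the
//                                               final return above)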
-void LiveRangeUpdater::add(LiveRange Seg) {
- assert(LI && "Cannot add to a null destination");
+void LiveRangeUpdater::add(LiveRange::Segment Seg) {
+ assert(LR && "Cannot add to a null destination");
// Flush the state if Start moves backwards.
if (!LastStart.isValid() || LastStart > Seg.start) {
@@ -730,21 +729,21 @@ void LiveRangeUpdater::add(LiveRange Seg) {
flush();
// This brings us to an uninitialized state. Reinitialize.
assert(Spills.empty() && "Leftover spilled segments");
- WriteI = ReadI = LI->begin();
+ WriteI = ReadI = LR->begin();
}
// Remember start for next time.
LastStart = Seg.start;
// Advance ReadI until it ends after Seg.start.
- LiveInterval::iterator E = LI->end();
+ LiveRange::iterator E = LR->end();
if (ReadI != E && ReadI->end <= Seg.start) {
// First try to close the gap between WriteI and ReadI with spills.
if (ReadI != WriteI)
mergeSpills();
// Then advance ReadI.
if (ReadI == WriteI)
- ReadI = WriteI = LI->find(Seg.start);
+ ReadI = WriteI = LR->find(Seg.start);
else
while (ReadI != E && ReadI->end <= Seg.start)
*WriteI++ = *ReadI++;
@@ -777,7 +776,7 @@ void LiveRangeUpdater::add(LiveRange Seg) {
}
// Try coalescing Seg into WriteI[-1].
- if (WriteI != LI->begin() && coalescable(WriteI[-1], Seg)) {
+ if (WriteI != LR->begin() && coalescable(WriteI[-1], Seg)) {
WriteI[-1].end = std::max(WriteI[-1].end, Seg.end);
return;
}
@@ -788,10 +787,10 @@ void LiveRangeUpdater::add(LiveRange Seg) {
return;
}
- // Finally, append to LI or Spills.
+ // Finally, append to LR or Spills.
if (WriteI == E) {
- LI->ranges.push_back(Seg);
- WriteI = ReadI = LI->ranges.end();
+ LR->segments.push_back(Seg);
+ WriteI = ReadI = LR->end();
} else
Spills.push_back(Seg);
}
@@ -802,10 +801,10 @@ void LiveRangeUpdater::mergeSpills() {
// Perform a backwards merge of Spills and [SpillI;WriteI).
size_t GapSize = ReadI - WriteI;
size_t NumMoved = std::min(Spills.size(), GapSize);
- LiveInterval::iterator Src = WriteI;
- LiveInterval::iterator Dst = Src + NumMoved;
- LiveInterval::iterator SpillSrc = Spills.end();
- LiveInterval::iterator B = LI->begin();
+ LiveRange::iterator Src = WriteI;
+ LiveRange::iterator Dst = Src + NumMoved;
+ LiveRange::iterator SpillSrc = Spills.end();
+ LiveRange::iterator B = LR->begin();
// This is the new WriteI position after merging spills.
WriteI = Dst;
@@ -827,12 +826,12 @@ void LiveRangeUpdater::flush() {
// Clear the dirty state.
LastStart = SlotIndex();
- assert(LI && "Cannot add to a null destination");
+ assert(LR && "Cannot add to a null destination");
// Nothing to merge?
if (Spills.empty()) {
- LI->ranges.erase(WriteI, ReadI);
- LI->verify();
+ LR->segments.erase(WriteI, ReadI);
+ LR->verify();
return;
}
@@ -840,17 +839,17 @@ void LiveRangeUpdater::flush() {
size_t GapSize = ReadI - WriteI;
if (GapSize < Spills.size()) {
// The gap is too small. Make some room.
- size_t WritePos = WriteI - LI->begin();
- LI->ranges.insert(ReadI, Spills.size() - GapSize, LiveRange());
+ size_t WritePos = WriteI - LR->begin();
+ LR->segments.insert(ReadI, Spills.size() - GapSize, LiveRange::Segment());
// This also invalidated ReadI, but it is recomputed below.
- WriteI = LI->ranges.begin() + WritePos;
+ WriteI = LR->begin() + WritePos;
} else {
// Shrink the gap if necessary.
- LI->ranges.erase(WriteI + Spills.size(), ReadI);
+ LR->segments.erase(WriteI + Spills.size(), ReadI);
}
ReadI = WriteI + Spills.size();
mergeSpills();
- LI->verify();
+ LR->verify();
}
unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) {
@@ -918,7 +917,7 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
Idx = LIS.getSlotIndexes()->getIndexBefore(MI);
else
Idx = LIS.getInstructionIndex(MI);
- LiveRangeQuery LRQ(LI, Idx);
+ LiveQueryResult LRQ = LI.Query(Idx);
const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
// In the case of an <undef> use that isn't tied to any def, VNI will be
// NULL. If the use is tied to a def, VNI will be the defined value.
@@ -935,11 +934,11 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
if (unsigned eq = EqClass[I->valno->id]) {
assert((LIV[eq]->empty() || LIV[eq]->expiredAt(I->start)) &&
"New intervals should be empty");
- LIV[eq]->ranges.push_back(*I);
+ LIV[eq]->segments.push_back(*I);
} else
*J++ = *I;
}
- LI.ranges.erase(J, E);
+ LI.segments.erase(J, E);
// Transfer VNInfos to their new owners and renumber them.
unsigned j = 0, e = LI.getNumValNums();
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 3680943..e1c3217 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -95,15 +95,15 @@ void LiveIntervals::releaseMemory() {
RegMaskBits.clear();
RegMaskBlocks.clear();
- for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
- delete RegUnitIntervals[i];
- RegUnitIntervals.clear();
+ for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+ delete RegUnitRanges[i];
+ RegUnitRanges.clear();
// Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
VNInfoAllocator.Reset();
}
-/// runOnMachineFunction - Register allocate the whole function
+/// runOnMachineFunction - Calculates LiveIntervals.
///
bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
MF = &fn;
@@ -139,15 +139,15 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << "********** INTERVALS **********\n";
// Dump the regunits.
- for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
- if (LiveInterval *LI = RegUnitIntervals[i])
- OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n';
+ for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+ if (LiveRange *LR = RegUnitRanges[i])
+ OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n';
// Dump the virtregs.
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (hasInterval(Reg))
- OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
+ OS << getInterval(Reg) << '\n';
}
OS << "RegMasks:";
@@ -170,16 +170,17 @@ void LiveIntervals::dumpInstrs() const {
#endif
LiveInterval* LiveIntervals::createInterval(unsigned reg) {
- float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F;
+ float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ?
+ llvm::huge_valf : 0.0F;
return new LiveInterval(reg, Weight);
}
/// computeVirtRegInterval - Compute the live interval of a virtual register,
/// based on defs and uses.
-void LiveIntervals::computeVirtRegInterval(LiveInterval *LI) {
+void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
assert(LRCalc && "LRCalc not initialized.");
- assert(LI->empty() && "Should only compute empty intervals.");
+ assert(LI.empty() && "Should only compute empty intervals.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
LRCalc->createDeadDefs(LI);
LRCalc->extendToUses(LI);
@@ -190,9 +191,7 @@ void LiveIntervals::computeVirtRegs() {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (MRI->reg_nodbg_empty(Reg))
continue;
- LiveInterval *LI = createInterval(Reg);
- VirtRegIntervals[Reg] = LI;
- computeVirtRegInterval(LI);
+ createAndComputeVirtRegInterval(Reg);
}
}
@@ -229,12 +228,10 @@ void LiveIntervals::computeRegMasks() {
// interference.
//
-/// computeRegUnitInterval - Compute the live interval of a register unit, based
-/// on the uses and defs of aliasing registers. The interval should be empty,
+/// computeRegUnitRange - Compute the live range of a register unit, based
+/// on the uses and defs of aliasing registers. The range should be empty,
/// or contain only dead phi-defs from ABI blocks.
-void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
- unsigned Unit = LI->reg;
-
+void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
assert(LRCalc && "LRCalc not initialized.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
@@ -247,18 +244,18 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
Supers.isValid(); ++Supers) {
if (!MRI->reg_empty(*Supers))
- LRCalc->createDeadDefs(LI, *Supers);
+ LRCalc->createDeadDefs(LR, *Supers);
}
}
- // Now extend LI to reach all uses.
+ // Now extend LR to reach all uses.
// Ignore uses of reserved registers. We only track defs of those.
for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
Supers.isValid(); ++Supers) {
unsigned Reg = *Supers;
if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
- LRCalc->extendToUses(LI, Reg);
+ LRCalc->extendToUses(LR, Reg);
}
}
}
@@ -269,11 +266,11 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
/// without a corresponding def when entering the entry block or a landing pad.
///
void LiveIntervals::computeLiveInRegUnits() {
- RegUnitIntervals.resize(TRI->getNumRegUnits());
+ RegUnitRanges.resize(TRI->getNumRegUnits());
DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
- // Keep track of the intervals allocated.
- SmallVector<LiveInterval*, 8> NewIntvs;
+ // Keep track of the register units whose live ranges were newly allocated.
+ SmallVector<unsigned, 8> NewRanges;
// Check all basic blocks for live-ins.
for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
@@ -291,23 +288,25 @@ void LiveIntervals::computeLiveInRegUnits() {
LIE = MBB->livein_end(); LII != LIE; ++LII) {
for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) {
unsigned Unit = *Units;
- LiveInterval *Intv = RegUnitIntervals[Unit];
- if (!Intv) {
- Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF);
- NewIntvs.push_back(Intv);
+ LiveRange *LR = RegUnitRanges[Unit];
+ if (!LR) {
+ LR = RegUnitRanges[Unit] = new LiveRange();
+ NewRanges.push_back(Unit);
}
- VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator());
+ VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
(void)VNI;
DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id);
}
}
DEBUG(dbgs() << '\n');
}
- DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n");
+ DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
- // Compute the 'normal' part of the intervals.
- for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i)
- computeRegUnitInterval(NewIntvs[i]);
+ // Compute the 'normal' part of the ranges.
+ for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) {
+ unsigned Unit = NewRanges[i];
+ computeRegUnitRange(*RegUnitRanges[Unit], Unit);
+ }
}
@@ -331,7 +330,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
continue;
SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
- LiveRangeQuery LRQ(*li, Idx);
+ LiveQueryResult LRQ = li->Query(Idx);
VNInfo *VNI = LRQ.valueIn();
if (!VNI) {
// This shouldn't happen: readsVirtualRegister returns true, but there is
@@ -350,14 +349,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
WorkList.push_back(std::make_pair(Idx, VNI));
}
- // Create a new live interval with only minimal live segments per def.
- LiveInterval NewLI(li->reg, 0);
+ // Create a new live range with only minimal live segments per def.
+ LiveRange NewLR;
for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
I != E; ++I) {
VNInfo *VNI = *I;
if (VNI->isUnused())
continue;
- NewLI.addRange(LiveRange(VNI->def, VNI->def.getDeadSlot(), VNI));
+ NewLR.addSegment(LiveRange::Segment(VNI->def, VNI->def.getDeadSlot(), VNI));
}
// Keep track of the PHIs that are in use.
@@ -372,7 +371,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
SlotIndex BlockStart = getMBBStartIdx(MBB);
// Extend the live range for VNI to be live at Idx.
- if (VNInfo *ExtVNI = NewLI.extendInBlock(BlockStart, Idx)) {
+ if (VNInfo *ExtVNI = NewLR.extendInBlock(BlockStart, Idx)) {
(void)ExtVNI;
assert(ExtVNI == VNI && "Unexpected existing value number");
// Is this a PHIDef we haven't seen before?
@@ -393,7 +392,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// VNI is live-in to MBB.
DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
- NewLI.addRange(LiveRange(BlockStart, Idx, VNI));
+ NewLR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
// Make sure VNI is live-out from the predecessors.
for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
@@ -414,14 +413,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
VNInfo *VNI = *I;
if (VNI->isUnused())
continue;
- LiveInterval::iterator LII = NewLI.FindLiveRangeContaining(VNI->def);
- assert(LII != NewLI.end() && "Missing live range for PHI");
- if (LII->end != VNI->def.getDeadSlot())
+ LiveRange::iterator LRI = NewLR.FindSegmentContaining(VNI->def);
+ assert(LRI != NewLR.end() && "Missing segment for PHI");
+ if (LRI->end != VNI->def.getDeadSlot())
continue;
if (VNI->isPHIDef()) {
// This is a dead PHI. Remove it.
VNI->markUnused();
- NewLI.removeRange(*LII);
+ NewLR.removeSegment(LRI->start, LRI->end);
DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
CanSeparate = true;
} else {
@@ -436,23 +435,23 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
}
}
- // Move the trimmed ranges back.
- li->ranges.swap(NewLI.ranges);
+ // Move the trimmed segments back.
+ li->segments.swap(NewLR.segments);
DEBUG(dbgs() << "Shrunk: " << *li << '\n');
return CanSeparate;
}
-void LiveIntervals::extendToIndices(LiveInterval *LI,
+void LiveIntervals::extendToIndices(LiveRange &LR,
ArrayRef<SlotIndex> Indices) {
assert(LRCalc && "LRCalc not initialized.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
for (unsigned i = 0, e = Indices.size(); i != e; ++i)
- LRCalc->extend(LI, Indices[i]);
+ LRCalc->extend(LR, Indices[i]);
}
void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
SmallVectorImpl<SlotIndex> *EndPoints) {
- LiveRangeQuery LRQ(*LI, Kill);
+ LiveQueryResult LRQ = LI->Query(Kill);
VNInfo *VNI = LRQ.valueOut();
if (!VNI)
return;
@@ -463,13 +462,13 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
// If VNI isn't live out from KillMBB, the value is trivially pruned.
if (LRQ.endPoint() < MBBEnd) {
- LI->removeRange(Kill, LRQ.endPoint());
+ LI->removeSegment(Kill, LRQ.endPoint());
if (EndPoints) EndPoints->push_back(LRQ.endPoint());
return;
}
// VNI is live out of KillMBB.
- LI->removeRange(Kill, MBBEnd);
+ LI->removeSegment(Kill, MBBEnd);
if (EndPoints) EndPoints->push_back(MBBEnd);
// Find all blocks that are reachable from KillMBB without leaving VNI's live
@@ -487,23 +486,23 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
// Check if VNI is live in to MBB.
tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
- LiveRangeQuery LRQ(*LI, MBBStart);
+ LiveQueryResult LRQ = LI->Query(MBBStart);
if (LRQ.valueIn() != VNI) {
- // This block isn't part of the VNI live range. Prune the search.
+ // This block isn't part of the VNI segment. Prune the search.
I.skipChildren();
continue;
}
// Prune the search if VNI is killed in MBB.
if (LRQ.endPoint() < MBBEnd) {
- LI->removeRange(MBBStart, LRQ.endPoint());
+ LI->removeSegment(MBBStart, LRQ.endPoint());
if (EndPoints) EndPoints->push_back(LRQ.endPoint());
I.skipChildren();
continue;
}
// VNI is live through MBB.
- LI->removeRange(MBBStart, MBBEnd);
+ LI->removeSegment(MBBStart, MBBEnd);
if (EndPoints) EndPoints->push_back(MBBEnd);
++I;
}
@@ -516,7 +515,7 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
// Keep track of regunit ranges.
- SmallVector<std::pair<LiveInterval*, LiveInterval::iterator>, 8> RU;
+ SmallVector<std::pair<LiveRange*, LiveRange::iterator>, 8> RU;
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
@@ -531,13 +530,14 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
RU.clear();
for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
++Units) {
- LiveInterval *RUInt = &getRegUnit(*Units);
- if (RUInt->empty())
+ LiveRange &RURanges = getRegUnit(*Units);
+ if (RURanges.empty())
continue;
- RU.push_back(std::make_pair(RUInt, RUInt->find(LI->begin()->end)));
+ RU.push_back(std::make_pair(&RURanges, RURanges.find(LI->begin()->end)));
}
- // Every instruction that kills Reg corresponds to a live range end point.
+ // Every instruction that kills Reg corresponds to a segment end point.
for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
++RI) {
// A block index indicates an MBB edge.
@@ -547,7 +547,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
if (!MI)
continue;
- // Check if any of the reguints are live beyond the end of RI. That could
+ // Check if any of the regunits are live beyond the end of RI. That could
// happen when a physreg is defined as a copy of a virtreg:
//
// %EAX = COPY %vreg5
@@ -557,12 +557,12 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
// There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
bool CancelKill = false;
for (unsigned u = 0, e = RU.size(); u != e; ++u) {
- LiveInterval *RInt = RU[u].first;
- LiveInterval::iterator &I = RU[u].second;
- if (I == RInt->end())
+ LiveRange &RRanges = *RU[u].first;
+ LiveRange::iterator &I = RU[u].second;
+ if (I == RRanges.end())
continue;
- I = RInt->advanceTo(I, RI->end);
- if (I == RInt->end() || I->start >= RI->end)
+ I = RRanges.advanceTo(I, RI->end);
+ if (I == RRanges.end() || I->start >= RI->end)
continue;
// I is overlapping RI.
CancelKill = true;
@@ -625,18 +625,18 @@ LiveIntervals::getSpillWeight(bool isDef, bool isUse, BlockFrequency freq) {
return (isDef + isUse) * (freq.getFrequency() * Scale);
}
-LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
- MachineInstr* startInst) {
- LiveInterval& Interval = getOrCreateInterval(reg);
+LiveRange::Segment
+LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr* startInst) {
+ LiveInterval& Interval = createEmptyInterval(reg);
VNInfo* VN = Interval.getNextValue(
SlotIndex(getInstructionIndex(startInst).getRegSlot()),
getVNInfoAllocator());
- LiveRange LR(
+ LiveRange::Segment S(
SlotIndex(getInstructionIndex(startInst).getRegSlot()),
getMBBEndIdx(startInst->getParent()), VN);
- Interval.addRange(LR);
+ Interval.addSegment(S);
- return LR;
+ return S;
}
@@ -711,7 +711,7 @@ private:
const TargetRegisterInfo& TRI;
SlotIndex OldIdx;
SlotIndex NewIdx;
- SmallPtrSet<LiveInterval*, 8> Updated;
+ SmallPtrSet<LiveRange*, 8> Updated;
bool UpdateFlags;
public:
@@ -725,7 +725,7 @@ public:
// physregs, even those that aren't needed for regalloc, in order to update
// kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
// flags, and postRA passes will use a live register utility instead.
- LiveInterval *getRegUnitLI(unsigned Unit) {
+ LiveRange *getRegUnitLI(unsigned Unit) {
if (UpdateFlags)
return &LIS.getRegUnit(Unit);
return LIS.getCachedRegUnit(Unit);
@@ -750,15 +750,16 @@ public:
if (!Reg)
continue;
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- updateRange(LIS.getInterval(Reg));
+ LiveInterval &LI = LIS.getInterval(Reg);
+ updateRange(LI, Reg);
continue;
}
// For physregs, only update the regunits that actually have a
// precomputed live range.
for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
- if (LiveInterval *LI = getRegUnitLI(*Units))
- updateRange(*LI);
+ if (LiveRange *LR = getRegUnitLI(*Units))
+ updateRange(*LR, *Units);
}
if (hasRegMask)
updateRegMaskSlots();
@@ -767,26 +768,26 @@ public:
private:
/// Update a single live range, assuming an instruction has been moved from
/// OldIdx to NewIdx.
- void updateRange(LiveInterval &LI) {
- if (!Updated.insert(&LI))
+ void updateRange(LiveRange &LR, unsigned Reg) {
+ if (!Updated.insert(&LR))
return;
DEBUG({
dbgs() << " ";
- if (TargetRegisterInfo::isVirtualRegister(LI.reg))
- dbgs() << PrintReg(LI.reg);
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ dbgs() << PrintReg(Reg);
else
- dbgs() << PrintRegUnit(LI.reg, &TRI);
- dbgs() << ":\t" << LI << '\n';
+ dbgs() << PrintRegUnit(Reg, &TRI);
+ dbgs() << ":\t" << LR << '\n';
});
if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
- handleMoveDown(LI);
+ handleMoveDown(LR);
else
- handleMoveUp(LI);
- DEBUG(dbgs() << " -->\t" << LI << '\n');
- LI.verify();
+ handleMoveUp(LR, Reg);
+ DEBUG(dbgs() << " -->\t" << LR << '\n');
+ LR.verify();
}
- /// Update LI to reflect an instruction has been moved downwards from OldIdx
+ /// Update LR to reflect an instruction has been moved downwards from OldIdx
/// to NewIdx.
///
/// 1. Live def at OldIdx:
@@ -800,17 +801,17 @@ private:
/// Move def to NewIdx, possibly across another live value.
///
/// 4. Def at OldIdx AND at NewIdx:
- /// Remove live range [OldIdx;NewIdx) and value defined at OldIdx.
+ /// Remove segment [OldIdx;NewIdx) and value defined at OldIdx.
/// (Happens when bundling multiple defs together).
///
/// 5. Value read at OldIdx, killed before NewIdx:
/// Extend kill to NewIdx.
///
- void handleMoveDown(LiveInterval &LI) {
+ void handleMoveDown(LiveRange &LR) {
// First look for a kill at OldIdx.
- LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
- LiveInterval::iterator E = LI.end();
- // Is LI even live at OldIdx?
+ LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
+ LiveRange::iterator E = LR.end();
+ // Is LR even live at OldIdx?
if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
return;
@@ -827,7 +828,7 @@ private:
for (MIBundleOperands MO(KillMI); MO.isValid(); ++MO)
if (MO->isReg() && MO->isUse())
MO->setIsKill(false);
- // Adjust I->end to reach NewIdx. This may temporarily make LI invalid by
+ // Adjust I->end to reach NewIdx. This may temporarily make LR invalid by
// overlapping ranges. Case 5 above.
I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
// If this was a kill, there may also be a def. Otherwise we're done.
@@ -856,24 +857,25 @@ private:
assert((I->end == OldIdx.getDeadSlot() ||
SlotIndex::isSameInstr(I->end, NewIdx)) &&
"Cannot move def below kill");
- LiveInterval::iterator NewI = LI.advanceTo(I, NewIdx.getRegSlot());
+ LiveRange::iterator NewI = LR.advanceTo(I, NewIdx.getRegSlot());
if (NewI != E && SlotIndex::isSameInstr(NewI->start, NewIdx)) {
// There is an existing def at NewIdx, case 4 above. The def at OldIdx is
// coalesced into that value.
assert(NewI->valno != DefVNI && "Multiple defs of value?");
- LI.removeValNo(DefVNI);
+ LR.removeValNo(DefVNI);
return;
}
// There was no existing def at NewIdx. Turn *I into a dead def at NewIdx.
- // If the def at OldIdx was dead, we allow it to be moved across other LI
+ // If the def at OldIdx was dead, we allow it to be moved across other LR
// values. The new range should be placed immediately before NewI, move any
// intermediate ranges up.
assert(NewI != I && "Inconsistent iterators");
std::copy(llvm::next(I), NewI, I);
- *llvm::prior(NewI) = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+ *llvm::prior(NewI)
+ = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
}
- /// Update LI to reflect an instruction has been moved upwards from OldIdx
+ /// Update LR to reflect an instruction has been moved upwards from OldIdx
/// to NewIdx.
///
/// 1. Live def at OldIdx:
@@ -893,11 +895,11 @@ private:
/// Hoist kill to NewIdx, then scan for last kill between NewIdx and
/// OldIdx.
///
- void handleMoveUp(LiveInterval &LI) {
+ void handleMoveUp(LiveRange &LR, unsigned Reg) {
// First look for a kill at OldIdx.
- LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
- LiveInterval::iterator E = LI.end();
- // Is LI even live at OldIdx?
+ LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
+ LiveRange::iterator E = LR.end();
+ // Is LR even live at OldIdx?
if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
return;
@@ -914,7 +916,7 @@ private:
if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
// No def, search for the new kill.
// This can never be an early clobber kill since there is no def.
- llvm::prior(I)->end = findLastUseBefore(LI.reg).getRegSlot();
+ llvm::prior(I)->end = findLastUseBefore(Reg).getRegSlot();
return;
}
}
@@ -926,18 +928,18 @@ private:
DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
// Check for an existing def at NewIdx.
- LiveInterval::iterator NewI = LI.find(NewIdx.getRegSlot());
+ LiveRange::iterator NewI = LR.find(NewIdx.getRegSlot());
if (SlotIndex::isSameInstr(NewI->start, NewIdx)) {
assert(NewI->valno != DefVNI && "Same value defined more than once?");
// There is an existing def at NewIdx.
if (I->end.isDead()) {
// Case 3: Remove the dead def at OldIdx.
- LI.removeValNo(DefVNI);
+ LR.removeValNo(DefVNI);
return;
}
// Case 4: Replace def at NewIdx with live def at OldIdx.
I->start = DefVNI->def;
- LI.removeValNo(NewI->valno);
+ LR.removeValNo(NewI->valno);
return;
}
@@ -948,10 +950,10 @@ private:
return;
}
- // DefVNI is a dead def. It may have been moved across other values in LI,
+ // DefVNI is a dead def. It may have been moved across other values in LR,
// so move I up to NewI. Slide [NewI;I) down one position.
std::copy_backward(NewI, I, llvm::next(I));
- *NewI = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+ *NewI = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
}
void updateRegMaskSlots() {
@@ -1074,8 +1076,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
if (MOI->isReg() &&
TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
!hasInterval(MOI->getReg())) {
- LiveInterval &LI = getOrCreateInterval(MOI->getReg());
- computeVirtRegInterval(&LI);
+ createAndComputeVirtRegInterval(MOI->getReg());
}
}
}
@@ -1122,9 +1123,9 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
if (LII != LI.begin())
prevStart = llvm::prior(LII)->start;
- // FIXME: This could be more efficient if there was a removeRange
- // method that returned an iterator.
- LI.removeRange(*LII, true);
+ // FIXME: This could be more efficient if there was a
+ // removeSegment method that returned an iterator.
+ LI.removeSegment(*LII, true);
if (prevStart.isValid())
LII = LI.find(prevStart);
else
@@ -1143,13 +1144,14 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
if (!lastUseIdx.isValid()) {
VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
VNInfoAllocator);
- LiveRange LR(instrIdx.getRegSlot(), instrIdx.getDeadSlot(), VNI);
- LII = LI.addRange(LR);
+ LiveRange::Segment S(instrIdx.getRegSlot(),
+ instrIdx.getDeadSlot(), VNI);
+ LII = LI.addSegment(S);
} else if (LII->start != instrIdx.getRegSlot()) {
VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
VNInfoAllocator);
- LiveRange LR(instrIdx.getRegSlot(), lastUseIdx, VNI);
- LII = LI.addRange(LR);
+ LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+ LII = LI.addSegment(S);
}
if (MO.getSubReg() && !MO.isUndef())
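[Editorial sketch, not part of the patch] A recurring change in the LiveIntervalAnalysis hunks above is replacing LiveRangeQuery construction with the LiveQueryResult returned by Query(). A small illustration using only the accessors this patch actually exercises (valueIn, valueOut, valueDefined, endPoint, isKill); the helper name is hypothetical and llvm/CodeGen/LiveInterval.h is assumed to provide the types.

#include "llvm/CodeGen/LiveInterval.h"
using namespace llvm;

// Does LI carry a value into the instruction at Idx? Mirrors the pattern
// "LiveQueryResult LRQ = LI.Query(Idx); ... LRQ.valueIn()" used above in
// shrinkToUses, pruneValue and ConnectedVNInfoEqClasses::Distribute.
static bool hasValueLiveIn(LiveInterval &LI, SlotIndex Idx) {
  LiveQueryResult LRQ = LI.Query(Idx);
  return LRQ.valueIn() != 0;
}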
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index dede490..ae086bc 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -36,11 +36,11 @@ void LiveRangeCalc::reset(const MachineFunction *mf,
}
-void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
+void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
assert(MRI && Indexes && "call reset() first");
// Visit all def operands. If the same instruction has multiple defs of Reg,
- // LI->createDeadDef() will deduplicate.
+ // LR.createDeadDef() will deduplicate.
for (MachineRegisterInfo::def_iterator
I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) {
const MachineInstr *MI = &*I;
@@ -54,13 +54,13 @@ void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
Idx = Indexes->getInstructionIndex(MI)
.getRegSlot(I.getOperand().isEarlyClobber());
- // Create the def in LI. This may find an existing def.
- LI->createDeadDef(Idx, *Alloc);
+ // Create the def in LR. This may find an existing def.
+ LR.createDeadDef(Idx, *Alloc);
}
}
-void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg) {
assert(MRI && Indexes && "call reset() first");
// Visit all operands that read Reg. This may include partial defs.
@@ -99,7 +99,7 @@ void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
Idx = Idx.getRegSlot(true);
}
}
- extend(LI, Idx, Reg);
+ extend(LR, Idx, Reg);
}
}
@@ -125,17 +125,14 @@ void LiveRangeCalc::updateLiveIns() {
assert(Seen.test(MBB->getNumber()));
LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)0);
}
- Updater.setDest(I->LI);
+ Updater.setDest(&I->LR);
Updater.add(Start, End, I->Value);
}
LiveIn.clear();
}
-void LiveRangeCalc::extend(LiveInterval *LI,
- SlotIndex Kill,
- unsigned PhysReg) {
- assert(LI && "Missing live range");
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg) {
assert(Kill.isValid() && "Invalid SlotIndex");
assert(Indexes && "Missing SlotIndexes");
assert(DomTree && "Missing dominator tree");
@@ -144,14 +141,14 @@ void LiveRangeCalc::extend(LiveInterval *LI,
assert(KillMBB && "No MBB at Kill");
// Is there a def in the same MBB we can extend?
- if (LI->extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+ if (LR.extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
return;
// Find the single reaching def, or determine if Kill is jointly dominated by
// multiple values, and we may need to create even more phi-defs to preserve
// VNInfo SSA form. Perform a search for all predecessor blocks where we
// know the dominating VNInfo.
- if (findReachingDefs(LI, KillMBB, Kill, PhysReg))
+ if (findReachingDefs(LR, *KillMBB, Kill, PhysReg))
return;
// When there were multiple different values, we may need new PHIs.
@@ -170,13 +167,11 @@ void LiveRangeCalc::calculateValues() {
}
-bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
- MachineBasicBlock *KillMBB,
- SlotIndex Kill,
- unsigned PhysReg) {
- unsigned KillMBBNum = KillMBB->getNumber();
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+ SlotIndex Kill, unsigned PhysReg) {
+ unsigned KillMBBNum = KillMBB.getNumber();
- // Block numbers where LI should be live-in.
+ // Block numbers where LR should be live-in.
SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
// Remember if we have seen more than one value.
@@ -203,7 +198,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
#endif
for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
+ PE = MBB->pred_end(); PI != PE; ++PI) {
MachineBasicBlock *Pred = *PI;
// Is this a known live-out block?
@@ -221,7 +216,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
// First time we see Pred. Try to determine the live-out value, but set
// it as null if Pred is live-through with an unknown value.
- VNInfo *VNI = LI->extendInBlock(Start, End);
+ VNInfo *VNI = LR.extendInBlock(Start, End);
setLiveOutValue(Pred, VNI);
if (VNI) {
if (TheVNI && TheVNI != VNI)
@@ -231,7 +226,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
}
// No, we need a live-in value for Pred as well
- if (Pred != KillMBB)
+ if (Pred != &KillMBB)
WorkList.push_back(Pred->getNumber());
else
// Loopback to KillMBB, so value is really live through.
@@ -248,9 +243,9 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
// If a unique reaching def was found, blit in the live ranges immediately.
if (UniqueVNI) {
- LiveRangeUpdater Updater(LI);
- for (SmallVectorImpl<unsigned>::const_iterator
- I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+ LiveRangeUpdater Updater(&LR);
+ for (SmallVectorImpl<unsigned>::const_iterator I = WorkList.begin(),
+ E = WorkList.end(); I != E; ++I) {
SlotIndex Start, End;
tie(Start, End) = Indexes->getMBBRange(*I);
// Trim the live range in KillMBB.
@@ -270,8 +265,8 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
for (SmallVectorImpl<unsigned>::const_iterator
I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
- addLiveInBlock(LI, DomTree->getNode(MBB));
- if (MBB == KillMBB)
+ addLiveInBlock(LR, DomTree->getNode(MBB));
+ if (MBB == &KillMBB)
LiveIn.back().Kill = Kill;
}
@@ -348,16 +343,17 @@ void LiveRangeCalc::updateSSA() {
assert(Alloc && "Need VNInfo allocator to create PHI-defs");
SlotIndex Start, End;
tie(Start, End) = Indexes->getMBBRange(MBB);
- VNInfo *VNI = I->LI->getNextValue(Start, *Alloc);
+ LiveRange &LR = I->LR;
+ VNInfo *VNI = LR.getNextValue(Start, *Alloc);
I->Value = VNI;
// This block is done, we know the final value.
I->DomNode = 0;
// Add liveness since updateLiveIns now skips this node.
if (I->Kill.isValid())
- I->LI->addRange(LiveRange(Start, I->Kill, VNI));
+ LR.addSegment(LiveInterval::Segment(Start, I->Kill, VNI));
else {
- I->LI->addRange(LiveRange(Start, End, VNI));
+ LR.addSegment(LiveInterval::Segment(Start, End, VNI));
LOP = LiveOutPair(VNI, Node);
}
} else if (IDomValue.first) {
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 57cab7b..a3a3fbb 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -75,9 +75,9 @@ class LiveRangeCalc {
/// LiveInBlock - Information about a basic block where a live range is known
/// to be live-in, but the value has not yet been determined.
struct LiveInBlock {
- // LI - The live range that is live-in to this block. The algorithms can
+ // The live range set that is live-in to this block. The algorithms can
// handle multiple non-overlapping live ranges simultaneously.
- LiveInterval *LI;
+ LiveRange &LR;
// DomNode - Dominator tree node for the block.
// Cleared when the final value has been determined and LI has been updated.
@@ -91,8 +91,8 @@ class LiveRangeCalc {
// Live-in value filled in by updateSSA once it is known.
VNInfo *Value;
- LiveInBlock(LiveInterval *li, MachineDomTreeNode *node, SlotIndex kill)
- : LI(li), DomNode(node), Kill(kill), Value(0) {}
+ LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill)
+ : LR(LR), DomNode(node), Kill(kill), Value(0) {}
};
/// LiveIn - Work list of blocks where the live-in value has yet to be
@@ -111,10 +111,8 @@ class LiveRangeCalc {
/// are added to the LiveIn array, and the function returns false.
///
/// PhysReg, when set, is used to verify live-in lists on basic blocks.
- bool findReachingDefs(LiveInterval *LI,
- MachineBasicBlock *KillMBB,
- SlotIndex Kill,
- unsigned PhysReg);
+ bool findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+ SlotIndex Kill, unsigned PhysReg);
/// updateSSA - Compute the values that will be live in to all requested
/// blocks in LiveIn. Create PHI-def values as required to preserve SSA form.
@@ -146,10 +144,6 @@ public:
MachineDominatorTree*,
VNInfo::Allocator*);
- /// calculate - Calculate the live range of a virtual register from its defs
- /// and uses. LI must be empty with no values.
- void calculate(LiveInterval *LI);
-
//===--------------------------------------------------------------------===//
// Mid-level interface.
//===--------------------------------------------------------------------===//
@@ -165,27 +159,27 @@ public:
/// single existing value, Alloc may be null.
///
/// PhysReg, when set, is used to verify live-in lists on basic blocks.
- void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+ void extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg = 0);
/// createDeadDefs - Create a dead def in LR for every def operand of Reg.
/// Each instruction defining Reg gets a new VNInfo with a corresponding
/// minimal live range.
- void createDeadDefs(LiveInterval *LI, unsigned Reg);
+ void createDeadDefs(LiveRange &LR, unsigned Reg);
/// createDeadDefs - Create a dead def in LI for every def of LI->reg.
- void createDeadDefs(LiveInterval *LI) {
- createDeadDefs(LI, LI->reg);
+ void createDeadDefs(LiveInterval &LI) {
+ createDeadDefs(LI, LI.reg);
}
/// extendToUses - Extend LR to reach all uses of Reg.
///
/// All uses must be jointly dominated by existing liveness. PHI-defs are
/// inserted as needed to preserve SSA form.
- void extendToUses(LiveInterval *LI, unsigned Reg);
+ void extendToUses(LiveRange &LR, unsigned Reg);
/// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
- void extendToUses(LiveInterval *LI) {
- extendToUses(LI, LI->reg);
+ void extendToUses(LiveInterval &LI) {
+ extendToUses(LI, LI.reg);
}
//===--------------------------------------------------------------------===//
@@ -216,15 +210,15 @@ public:
/// function can only be called once per basic block. Once the live-in value
/// has been determined, calculateValues() will add liveness to LI.
///
- /// @param LI The live range that is live-in to the block.
+ /// @param LR The live range that is live-in to the block.
/// @param DomNode The domtree node for the block.
/// @param Kill Index in block where LR is killed. If the value is
/// live-through, set Kill = SlotIndex() and also call
/// setLiveOutValue(MBB, 0).
- void addLiveInBlock(LiveInterval *LI,
+ void addLiveInBlock(LiveRange &LR,
MachineDomTreeNode *DomNode,
SlotIndex Kill = SlotIndex()) {
- LiveIn.push_back(LiveInBlock(LI, DomNode, Kill));
+ LiveIn.push_back(LiveInBlock(LR, DomNode, Kill));
}
/// calculateValues - Calculate the value that will be live-in to each block
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 792ef54..cb70c43 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -30,17 +30,23 @@ STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE");
void LiveRangeEdit::Delegate::anchor() { }
-LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
if (VRM) {
- VRM->grow();
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
- LiveInterval &LI = LIS.getOrCreateInterval(VReg);
- NewRegs.push_back(&LI);
+ LiveInterval &LI = LIS.createEmptyInterval(VReg);
return LI;
}
+unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
+ unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ if (VRM) {
+ VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+ }
+ return VReg;
+}
+
bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
const MachineInstr *DefMI,
AliasAnalysis *aa) {
@@ -256,9 +262,9 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
else if (MOI->isDef()) {
for (MCRegUnitIterator Units(Reg, MRI.getTargetRegisterInfo());
Units.isValid(); ++Units) {
- if (LiveInterval *LI = LIS.getCachedRegUnit(*Units)) {
- if (VNInfo *VNI = LI->getVNInfoAt(Idx))
- LI->removeValNo(VNI);
+ if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
+ if (VNInfo *VNI = LR->getVNInfoAt(Idx))
+ LR->removeValNo(VNI);
}
}
}
@@ -272,7 +278,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
// Always shrink COPY uses that probably come from live range splitting.
if (MI->readsVirtualRegister(Reg) &&
(MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) ||
- LI.killedAt(Idx)))
+ LI.Query(Idx).isKill()))
ToShrink.insert(&LI);
// Remove defined value.
@@ -360,7 +366,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
if (BeingSpilled) continue;
// LI may have been separated, create new intervals.
- LI->RenumberValues(LIS);
+ LI->RenumberValues();
ConnectedVNInfoEqClasses ConEQ(LIS);
unsigned NumComp = ConEQ.Classify(LI);
if (NumComp <= 1)
@@ -370,7 +376,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
DEBUG(dbgs() << NumComp << " components: " << *LI << '\n');
SmallVector<LiveInterval*, 8> Dups(1, LI);
for (unsigned i = 1; i != NumComp; ++i) {
- Dups.push_back(&createFrom(LI->reg));
+ Dups.push_back(&createEmptyIntervalFrom(LI->reg));
// If LI is an original interval that hasn't been split yet, make the new
// intervals their own originals instead of referring to LI. The original
// interval must contain all the split products, and LI doesn't.
@@ -387,16 +393,27 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
}
}
+// Keep track of new virtual registers created via
+// MachineRegisterInfo::createVirtualRegister.
+void LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg) {
+ if (VRM)
+ VRM->grow();
+
+ NewRegs.push_back(VReg);
+}
+
void
LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
const MachineLoopInfo &Loops,
const MachineBlockFrequencyInfo &MBFI) {
VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI);
- for (iterator I = begin(), E = end(); I != E; ++I) {
- LiveInterval &LI = **I;
+ for (unsigned I = 0, Size = size(); I < Size; ++I) {
+ LiveInterval &LI = LIS.getInterval(get(I));
if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
<< MRI.getRegClass(LI.reg)->getName() << '\n');
- VRAI.CalculateWeightAndHint(LI);
+ VRAI.calculateSpillWeightAndHint(LI);
}
}
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index 0ef069f..1d801ac 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -119,9 +119,11 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
if (VirtReg.empty())
return false;
CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
- for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
- if (VirtReg.overlaps(LIS->getRegUnit(*Units), CP, *LIS->getSlotIndexes()))
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ const LiveRange &UnitRange = LIS->getRegUnit(*Units);
+ if (VirtReg.overlaps(UnitRange, CP, *LIS->getSlotIndexes()))
return true;
+ }
return false;
}
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 0000000..6221ca2
--- /dev/null
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,111 @@
+//===-- LiveRegUnits.cpp - Register Unit Set ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRegUnits utility for tracking liveness of
+// physical register units across machine instructions in forward or backward
+// order.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+/// Return true if the given MachineOperand clobbers the given register unit.
+/// A register unit is only clobbered if all its super-registers are clobbered.
+static bool operClobbersUnit(const MachineOperand *MO, unsigned Unit,
+ const MCRegisterInfo *MCRI) {
+ for (MCRegUnitRootIterator RI(Unit, MCRI); RI.isValid(); ++RI) {
+ for (MCSuperRegIterator SI(*RI, MCRI, true); SI.isValid(); ++SI) {
+ if (!MO->clobbersPhysReg(*SI))
+ return false;
+ }
+ }
+ return true;
+}
+
+/// We assume the high bits of a physical super register are not preserved
+/// unless the instruction has an implicit-use operand reading the
+/// super-register or a register unit for the upper bits is available.
+void LiveRegUnits::removeRegsInMask(const MachineOperand &Op,
+ const MCRegisterInfo &MCRI) {
+ SparseSet<unsigned>::iterator LUI = LiveUnits.begin();
+ while (LUI != LiveUnits.end()) {
+ if (operClobbersUnit(&Op, *LUI, &MCRI))
+ LUI = LiveUnits.erase(LUI);
+ else
+ ++LUI;
+ }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI,
+ const MCRegisterInfo &MCRI) {
+ // Remove defined registers and regmask kills from the set.
+ for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ if (!O->isDef())
+ continue;
+ unsigned Reg = O->getReg();
+ if (Reg == 0)
+ continue;
+ removeReg(Reg, MCRI);
+ } else if (O->isRegMask()) {
+ removeRegsInMask(*O, MCRI);
+ }
+ }
+ // Add uses to the set.
+ for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->readsReg() || O->isUndef())
+ continue;
+ unsigned Reg = O->getReg();
+ if (Reg == 0)
+ continue;
+ addReg(Reg, MCRI);
+ }
+}
+
+/// Uses with a kill flag get removed from the set and defs are added. If
+/// possible, use stepBackward() instead of this function because some kill
+/// flags may be missing.
+void LiveRegUnits::stepForward(const MachineInstr &MI,
+ const MCRegisterInfo &MCRI) {
+ SmallVector<unsigned, 4> Defs;
+ // Remove killed registers from the set.
+ for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ unsigned Reg = O->getReg();
+ if (Reg == 0)
+ continue;
+ if (O->isDef()) {
+ if (!O->isDead())
+ Defs.push_back(Reg);
+ } else {
+ if (!O->isKill())
+ continue;
+ assert(O->isUse());
+ removeReg(Reg, MCRI);
+ }
+ } else if (O->isRegMask()) {
+ removeRegsInMask(*O, MCRI);
+ }
+ }
+ // Add defs to the set.
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ addReg(Defs[i], MCRI);
+ }
+}
+
+/// Adds all registers in the live-in list of block @p MBB.
+void LiveRegUnits::addLiveIns(const MachineBasicBlock *MBB,
+ const MCRegisterInfo &MCRI) {
+ for (MachineBasicBlock::livein_iterator L = MBB->livein_begin(),
+ LE = MBB->livein_end(); L != LE; ++L) {
+ addReg(*L, MCRI);
+ }
+}
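
The file above only defines the update primitives; as a rough illustration of how they compose (a sketch, assuming a LiveRegUnits object that has already been sized for the target's register units, which this file does not show), a forward scan over a block could look like:

#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

// Hedged sketch, not part of the commit: seed with the block's live-ins and
// walk forward, letting stepForward() drop kills and add defs.
static void scanBlockForward(LiveRegUnits &LiveUnits,
                             const MachineBasicBlock &MBB,
                             const MCRegisterInfo &MCRI) {
  LiveUnits.addLiveIns(&MBB, MCRI);
  for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
       I != E; ++I) {
    LiveUnits.stepForward(*I, MCRI);
    // ... query LiveUnits here to decide whether a physreg is free ...
  }
}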
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 5633271..ca71e3b 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -861,7 +861,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
LiveInterval &LI = LIS->getInterval(Reg);
VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
assert(VNI && "PHI sources should be live out of their predecessors.");
- LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+ LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
}
}
}
@@ -880,9 +880,9 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
if (isLiveOut && isLastMBB) {
VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
assert(VNI && "LiveInterval should have VNInfo where it is live.");
- LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+ LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
} else if (!isLiveOut && !isLastMBB) {
- LI.removeRange(StartIndex, EndIndex);
+ LI.removeSegment(StartIndex, EndIndex);
}
}
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 06bb80a..295b450 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -647,12 +647,15 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
}
}
+#ifndef NDEBUG
+ bool isMetaDataOp = Op.getType() == MachineOperand::MO_Metadata;
// OpNo now points at the desired insertion point. Unless this is a variadic
// instruction, only implicit regs are allowed beyond MCID->getNumOperands().
// RegMask operands go between the explicit and implicit operands.
assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
- OpNo < MCID->getNumOperands()) &&
+ OpNo < MCID->getNumOperands() || isMetaDataOp) &&
"Trying to add an operand to a machine instr that is already done!");
+#endif
MachineRegisterInfo *MRI = getRegInfo();
@@ -1702,31 +1705,31 @@ void MachineInstr::clearRegisterKills(unsigned Reg,
}
}
-bool MachineInstr::addRegisterDead(unsigned IncomingReg,
+bool MachineInstr::addRegisterDead(unsigned Reg,
const TargetRegisterInfo *RegInfo,
bool AddIfNotFound) {
- bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+ bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg);
bool hasAliases = isPhysReg &&
- MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+ MCRegAliasIterator(Reg, RegInfo, false).isValid();
bool Found = false;
SmallVector<unsigned,4> DeadOps;
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
MachineOperand &MO = getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
continue;
- if (Reg == IncomingReg) {
+ if (MOReg == Reg) {
MO.setIsDead();
Found = true;
} else if (hasAliases && MO.isDead() &&
- TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ TargetRegisterInfo::isPhysicalRegister(MOReg)) {
// There exists a super-register that's marked dead.
- if (RegInfo->isSuperRegister(IncomingReg, Reg))
+ if (RegInfo->isSuperRegister(Reg, MOReg))
return true;
- if (RegInfo->isSubRegister(IncomingReg, Reg))
+ if (RegInfo->isSubRegister(Reg, MOReg))
DeadOps.push_back(i);
}
}
@@ -1746,7 +1749,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
if (Found || !AddIfNotFound)
return Found;
- addOperand(MachineOperand::CreateReg(IncomingReg,
+ addOperand(MachineOperand::CreateReg(Reg,
true /*IsDef*/,
true /*IsImp*/,
false /*IsKill*/,
@@ -1754,21 +1757,21 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
return true;
}
-void MachineInstr::addRegisterDefined(unsigned IncomingReg,
+void MachineInstr::addRegisterDefined(unsigned Reg,
const TargetRegisterInfo *RegInfo) {
- if (TargetRegisterInfo::isPhysicalRegister(IncomingReg)) {
- MachineOperand *MO = findRegisterDefOperand(IncomingReg, false, RegInfo);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ MachineOperand *MO = findRegisterDefOperand(Reg, false, RegInfo);
if (MO)
return;
} else {
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
const MachineOperand &MO = getOperand(i);
- if (MO.isReg() && MO.getReg() == IncomingReg && MO.isDef() &&
+ if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
MO.getSubReg() == 0)
return;
}
}
- addOperand(MachineOperand::CreateReg(IncomingReg,
+ addOperand(MachineOperand::CreateReg(Reg,
true /*IsDef*/,
true /*IsImp*/));
}
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 6ad4e39..104eacd 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -468,12 +468,12 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
if (PhysRegDefs.test(*AS))
PhysRegClobbers.set(*AS);
- if (PhysRegClobbers.test(*AS))
- // MI defined register is seen defined by another instruction in
- // the loop, it cannot be a LICM candidate.
- RuledOut = true;
PhysRegDefs.set(*AS);
}
+ if (PhysRegClobbers.test(Reg))
+ // MI defined register is seen defined by another instruction in
+ // the loop, it cannot be a LICM candidate.
+ RuledOut = true;
}
// Only consider reloads for now and remats which do not have register
@@ -502,7 +502,7 @@ void MachineLICM::HoistRegionPostRA() {
// Walk the entire region, count number of defs for each register, and
// collect potential LICM candidates.
- const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+ const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *BB = Blocks[i];
@@ -584,7 +584,7 @@ void MachineLICM::HoistRegionPostRA() {
/// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current
/// loop, and make sure it is not killed by any instructions in the loop.
void MachineLICM::AddToLiveIns(unsigned Reg) {
- const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+ const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *BB = Blocks[i];
if (!BB->isLiveIn(Reg))
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 7f2c0ca..f8b8796 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -19,8 +19,11 @@
using namespace llvm;
+// Pin the vtable to this file.
+void MachineRegisterInfo::Delegate::anchor() {}
+
MachineRegisterInfo::MachineRegisterInfo(const TargetMachine &TM)
- : TM(TM), IsSSA(true), TracksLiveness(true) {
+ : TM(TM), TheDelegate(0), IsSSA(true), TracksLiveness(true) {
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
@@ -108,6 +111,8 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
VRegInfo.grow(Reg);
VRegInfo[Reg].first = RegClass;
RegAllocHints.grow(Reg);
+ if (TheDelegate)
+ TheDelegate->MRI_NoteNewVirtualRegister(Reg);
return Reg;
}
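
This hunk, together with LiveRangeEdit::MRI_NoteNewVirtualRegister above, wires up a simple delegate: MachineRegisterInfo notifies an optional observer each time a virtual register is created, and LiveRangeEdit uses that to collect its NewRegs. A minimal standalone sketch of the pattern (hypothetical names, not LLVM's API):

#include <cstdio>
#include <vector>

// Hypothetical stand-in for MachineRegisterInfo::Delegate.
struct RegInfoDelegate {
  virtual ~RegInfoDelegate() {}
  virtual void noteNewVirtualRegister(unsigned VReg) = 0;
};

class RegInfo {
  RegInfoDelegate *TheDelegate;
  unsigned NextVReg;
public:
  RegInfo() : TheDelegate(0), NextVReg(0) {}
  void setDelegate(RegInfoDelegate *D) { TheDelegate = D; }
  unsigned createVirtualRegister() {
    unsigned Reg = NextVReg++;
    if (TheDelegate)                  // the observer is optional
      TheDelegate->noteNewVirtualRegister(Reg);
    return Reg;
  }
};

// Plays the role of LiveRangeEdit: it just records every new register.
struct NewRegCollector : RegInfoDelegate {
  std::vector<unsigned> NewRegs;
  virtual void noteNewVirtualRegister(unsigned VReg) { NewRegs.push_back(VReg); }
};

int main() {
  RegInfo MRI;
  NewRegCollector Edit;
  MRI.setDelegate(&Edit);
  MRI.createVirtualRegister();
  MRI.createVirtualRegister();
  std::printf("collected %u new vregs\n", (unsigned)Edit.NewRegs.size()); // 2
  return 0;
}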
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index a6c5a9f..e71c4df 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -53,6 +53,12 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
static bool ViewMISchedDAGs = false;
#endif // NDEBUG
+static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
+ cl::desc("Enable register pressure scheduling."), cl::init(true));
+
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+ cl::desc("Enable cyclic critical path analysis."), cl::init(true));
+
static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
cl::desc("Enable load clustering."), cl::init(true));
@@ -66,6 +72,10 @@ static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
// DAG subtrees must have at least this many nodes.
static const unsigned MinSubtreeSize = 8;
+// Pin the vtables to this file.
+void MachineSchedStrategy::anchor() {}
+void ScheduleDAGMutation::anchor() {}
+
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
@@ -95,6 +105,9 @@ public:
virtual void print(raw_ostream &O, const Module* = 0) const;
static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createMachineScheduler();
};
} // namespace
@@ -149,12 +162,13 @@ DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
/// Forward declare the standard machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
+static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C);
/// Decrement this iterator until reaching the top or a non-debug instr.
-static MachineBasicBlock::iterator
-priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+static MachineBasicBlock::const_iterator
+priorNonDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator Beg) {
assert(I != Beg && "reached the top of the region, cannot decrement");
while (--I != Beg) {
if (!I->isDebugValue())
@@ -163,10 +177,19 @@ priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
return I;
}
+/// Non-const version.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator Beg) {
+ return const_cast<MachineInstr*>(
+ &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
+}
+
/// If this iterator is a debug value, increment until reaching the End or a
/// non-debug instruction.
-static MachineBasicBlock::iterator
-nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator End) {
for(; I != End; ++I) {
if (!I->isDebugValue())
break;
@@ -174,6 +197,34 @@ nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
return I;
}
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator End) {
+ // Cast the return value to nonconst MachineInstr, then cast to an
+ // instr_iterator, which does not check for null, finally return a
+ // bundle_iterator.
+ return MachineBasicBlock::instr_iterator(
+ const_cast<MachineInstr*>(
+ &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
+ // Select the scheduler, or set the default.
+ MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+ if (Ctor != useDefaultMachineSched)
+ return Ctor(this);
+
+ // Get the default scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSched(this);
+}
+
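Strategies other than the built-in ones reach createMachineScheduler() through MachineSchedRegistry, the same mechanism used for the "converge" entry at the bottom of this file. A hedged sketch of registering one (MySchedStrategy and the option name are made up; only the registry pattern comes from this file):

// Hypothetical registration; would select this DAG builder via -misched=my-sched.
static ScheduleDAGInstrs *createMySched(MachineSchedContext *C) {
  return new ScheduleDAGMI(C, new MySchedStrategy(C));
}
static MachineSchedRegistry
MySchedRegistry("my-sched", "Illustrative custom scheduler.", createMySched);
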
/// Top-level MachineScheduler pass driver.
///
/// Visit blocks in function order. Divide each block into scheduling regions
@@ -209,18 +260,9 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
}
RegClassInfo->runOnMachineFunction(*MF);
- // Select the scheduler, or set the default.
- MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
- if (Ctor == useDefaultMachineSched) {
- // Get the default scheduler set by the target.
- Ctor = MachineSchedRegistry::getDefault();
- if (!Ctor) {
- Ctor = createConvergingSched;
- MachineSchedRegistry::setDefault(Ctor);
- }
- }
- // Instantiate the selected scheduler.
- OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ OwningPtr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
// Visit all machine basic blocks.
//
@@ -255,14 +297,15 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// The next region starts above the previous region. Look backward in the
// instruction stream until we find the nearest boundary.
+ unsigned NumRegionInstrs = 0;
MachineBasicBlock::iterator I = RegionEnd;
- for(;I != MBB->begin(); --I, --RemainingInstrs) {
+ for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
break;
}
// Notify the scheduler of the region, even if we may skip scheduling
// it. Perhaps it still needs to be bundled.
- Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs);
+ Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
@@ -277,7 +320,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
<< "\n From: " << *I << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
- dbgs() << " Remaining: " << RemainingInstrs << "\n");
+ dbgs() << " RegionInstrs: " << NumRegionInstrs
+ << " Remaining: " << RemainingInstrs << "\n");
// Schedule a region: possibly reorder instructions.
// This invalidates 'RegionEnd' and 'I'.
@@ -446,13 +490,19 @@ bool ScheduleDAGMI::checkSchedLimit() {
void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
- unsigned endcount)
+ unsigned regioninstrs)
{
- ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
// For convenience remember the end of the liveness region.
LiveRegionEnd =
(RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+
+ SUPressureDiffs.clear();
+
+ SchedImpl->initPolicy(begin, end, regioninstrs);
+
+ ShouldTrackPressure = SchedImpl->shouldTrackPressure();
}
// Setup the register pressure trackers for the top scheduled top and bottom
@@ -483,9 +533,16 @@ void ScheduleDAGMI::initRegPressure() {
dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
};
+ // For each live out vreg reduce the pressure change associated with other
+ // uses of the same vreg below the live-out reaching def.
+ updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
+
// Account for liveness generated by the region boundary.
- if (LiveRegionEnd != RegionEnd)
- BotRPTracker.recede();
+ if (LiveRegionEnd != RegionEnd) {
+ SmallVector<unsigned, 8> LiveUses;
+ BotRPTracker.recede(&LiveUses);
+ updatePressureDiffs(LiveUses);
+ }
assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
@@ -500,34 +557,83 @@ void ScheduleDAGMI::initRegPressure() {
DEBUG(dbgs() << TRI->getRegPressureSetName(i)
<< " Limit " << Limit
<< " Actual " << RegionPressure[i] << "\n");
- RegionCriticalPSets.push_back(PressureElement(i, 0));
+ RegionCriticalPSets.push_back(PressureChange(i));
}
}
DEBUG(dbgs() << "Excess PSets: ";
for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
dbgs() << TRI->getRegPressureSetName(
- RegionCriticalPSets[i].PSetID) << " ";
+ RegionCriticalPSets[i].getPSet()) << " ";
dbgs() << "\n");
}
-// FIXME: When the pressure tracker deals in pressure differences then we won't
-// iterate over all RegionCriticalPSets[i].
void ScheduleDAGMI::
-updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
- for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
- unsigned ID = RegionCriticalPSets[i].PSetID;
- int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
- if ((int)NewMaxPressure[ID] > MaxUnits)
- MaxUnits = NewMaxPressure[ID];
+updateScheduledPressure(const SUnit *SU,
+ const std::vector<unsigned> &NewMaxPressure) {
+ const PressureDiff &PDiff = getPressureDiff(SU);
+ unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
+ for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
+ I != E; ++I) {
+ if (!I->isValid())
+ break;
+ unsigned ID = I->getPSet();
+ while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
+ ++CritIdx;
+ if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
+ if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
+ && NewMaxPressure[ID] <= INT16_MAX)
+ RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
+ }
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
+ if (NewMaxPressure[ID] >= Limit - 2) {
+ DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
+ << NewMaxPressure[ID] << " > " << Limit << "(+ "
+ << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
+ }
}
- DEBUG(
- for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) {
- unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
- if (NewMaxPressure[i] > Limit ) {
- dbgs() << " " << TRI->getRegPressureSetName(i) << ": "
- << NewMaxPressure[i] << " > " << Limit << "\n";
+}
+
+/// Update the PressureDiff array for liveness after scheduling this
+/// instruction.
+void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
+ for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
+ /// FIXME: Currently assuming single-use physregs.
+ unsigned Reg = LiveUses[LUIdx];
+ DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
+ if (!TRI->isVirtualRegister(Reg))
+ continue;
+
+ // This may be called before CurrentBottom has been initialized. However,
+ // BotRPTracker must have a valid position. We want the value live into the
+ // instruction or live out of the block, so ask for the previous
+ // instruction's live-out.
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI;
+ MachineBasicBlock::const_iterator I =
+ nextIfDebug(BotRPTracker.getPos(), BB->end());
+ if (I == BB->end())
+ VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ else {
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I));
+ VNI = LRQ.valueIn();
+ }
+ // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+ assert(VNI && "No live value at use.");
+ for (VReg2UseMap::iterator
+ UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+ SUnit *SU = UI->SU;
+ DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+ // If this use comes before the reaching def, it cannot be a last use, so
+ // decrease its pressure change.
+ if (!SU->isScheduled && SU != &ExitSU) {
+ LiveQueryResult LRQ
+ = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
+ if (LRQ.valueIn() == VNI)
+ getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
}
- });
+ }
+ }
}
/// schedule - Called back from MachineScheduler::runOnMachineFunction
@@ -585,6 +691,13 @@ void ScheduleDAGMI::schedule() {
/// Build the DAG and setup three register pressure trackers.
void ScheduleDAGMI::buildDAGWithRegPressure() {
+ if (!ShouldTrackPressure) {
+ RPTracker.reset();
+ RegionCriticalPSets.clear();
+ buildSchedGraph(AA);
+ return;
+ }
+
// Initialize the register pressure tracker used by buildSchedGraph.
RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
/*TrackUntiedDefs=*/true);
@@ -594,7 +707,7 @@ void ScheduleDAGMI::buildDAGWithRegPressure() {
RPTracker.recede();
// Build the DAG, and compute current register pressure.
- buildSchedGraph(AA, &RPTracker);
+ buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
// Initialize top/bottom trackers after computing region pressure.
initRegPressure();
@@ -637,6 +750,91 @@ void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
ExitSU.biasCriticalPath();
}
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an eponymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
+ // This only applies to single-block loops.
+ if (!BB->isSuccessor(BB))
+ return 0;
+
+ unsigned MaxCyclicLatency = 0;
+ // Visit each live out vreg def to find def/use pairs that cross iterations.
+ ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
+ for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
+ RI != RE; ++RI) {
+ unsigned Reg = *RI;
+ if (!TRI->isVirtualRegister(Reg))
+ continue;
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ if (!DefVNI)
+ continue;
+
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+ const SUnit *DefSU = getSUnit(DefMI);
+ if (!DefSU)
+ continue;
+
+ unsigned LiveOutHeight = DefSU->getHeight();
+ unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+ // Visit all local users of the vreg def.
+ for (VReg2UseMap::iterator
+ UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+ if (UI->SU == &ExitSU)
+ continue;
+
+ // Only consider uses of the phi.
+ LiveQueryResult LRQ =
+ LI.Query(LIS->getInstructionIndex(UI->SU->getInstr()));
+ if (!LRQ.valueIn()->isPHIDef())
+ continue;
+
+ // Assume that a path spanning two iterations is a cycle, which could
+ // overestimate in strange cases. This allows cyclic latency to be
+ // estimated as the minimum slack of the vreg's depth or height.
+ unsigned CyclicLatency = 0;
+ if (LiveOutDepth > UI->SU->getDepth())
+ CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+
+ unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+ if (LiveInHeight > LiveOutHeight) {
+ if (LiveInHeight - LiveOutHeight < CyclicLatency)
+ CyclicLatency = LiveInHeight - LiveOutHeight;
+ }
+ else
+ CyclicLatency = 0;
+
+ DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+ << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+ if (CyclicLatency > MaxCyclicLatency)
+ MaxCyclicLatency = CyclicLatency;
+ }
+ }
+ DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+ return MaxCyclicLatency;
+}
+
/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
ArrayRef<SUnit*> BotRoots) {
@@ -664,11 +862,13 @@ void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
SchedImpl->registerRoots();
// Advance past initial DebugValues.
- assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
- TopRPTracker.setPos(CurrentTop);
-
CurrentBottom = RegionEnd;
+
+ if (ShouldTrackPressure) {
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+ }
}
/// Move an instruction and update register pressure.
@@ -685,10 +885,12 @@ void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
TopRPTracker.setPos(MI);
}
- // Update top scheduled pressure.
- TopRPTracker.advance();
- assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
- updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+ if (ShouldTrackPressure) {
+ // Update top scheduled pressure.
+ TopRPTracker.advance();
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
+ }
}
else {
assert(SU->isBottomReady() && "node still has unscheduled dependencies");
@@ -704,10 +906,14 @@ void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
moveInstruction(MI, CurrentBottom);
CurrentBottom = MI;
}
- // Update bottom scheduled pressure.
- BotRPTracker.recede();
- assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
- updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+ if (ShouldTrackPressure) {
+ // Update bottom scheduled pressure.
+ SmallVector<unsigned, 8> LiveUses;
+ BotRPTracker.recede(&LiveUses);
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
+ updatePressureDiffs(LiveUses);
+ }
}
}
@@ -1113,13 +1319,13 @@ void CopyConstrain::apply(ScheduleDAGMI *DAG) {
}
//===----------------------------------------------------------------------===//
-// ConvergingScheduler - Implementation of the generic MachineSchedStrategy.
+// GenericScheduler - Implementation of the generic MachineSchedStrategy.
//===----------------------------------------------------------------------===//
namespace {
-/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
+/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
-class ConvergingScheduler : public MachineSchedStrategy {
+class GenericScheduler : public MachineSchedStrategy {
public:
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
@@ -1129,7 +1335,7 @@ public:
TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
#ifndef NDEBUG
- static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+ static const char *getReasonStr(GenericScheduler::CandReason Reason);
#endif
/// Policy for scheduling the next instruction in the candidate's zone.
@@ -1160,7 +1366,7 @@ public:
}
};
- /// Store the state used by ConvergingScheduler heuristics, required for the
+ /// Store the state used by GenericScheduler heuristics, required for the
/// lifetime of one invocation of pickNode().
struct SchedCandidate {
CandPolicy Policy;
@@ -1205,16 +1411,21 @@ public:
struct SchedRemainder {
// Critical path through the DAG in expected latency.
unsigned CriticalPath;
+ unsigned CyclicCritPath;
// Scaled count of micro-ops left to schedule.
unsigned RemIssueCount;
+ bool IsAcyclicLatencyLimited;
+
// Unscheduled resources
SmallVector<unsigned, 16> RemainingCounts;
void reset() {
CriticalPath = 0;
+ CyclicCritPath = 0;
RemIssueCount = 0;
+ IsAcyclicLatencyLimited = false;
RemainingCounts.clear();
}
@@ -1288,13 +1499,16 @@ public:
void reset() {
// A new HazardRec is created for each DAG and owned by SchedBoundary.
- delete HazardRec;
-
+ // Destroying and reconstructing it is very expensive though. So keep
+ // invalid, placeholder HazardRecs.
+ if (HazardRec && HazardRec->isEnabled()) {
+ delete HazardRec;
+ HazardRec = 0;
+ }
Available.clear();
Pending.clear();
CheckPending = false;
NextSUs.clear();
- HazardRec = 0;
CurrCycle = 0;
CurrMOps = 0;
MinReadyCycle = UINT_MAX;
@@ -1316,7 +1530,7 @@ public:
/// PendingFlag set.
SchedBoundary(unsigned ID, const Twine &Name):
DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
- Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+ Pending(ID << GenericScheduler::LogMaxQID, Name+".P"),
HazardRec(0) {
reset();
}
@@ -1327,7 +1541,7 @@ public:
SchedRemainder *rem);
bool isTop() const {
- return Available.getID() == ConvergingScheduler::TopQID;
+ return Available.getID() == GenericScheduler::TopQID;
}
#ifndef NDEBUG
@@ -1399,6 +1613,7 @@ public:
};
private:
+ const MachineSchedContext *Context;
ScheduleDAGMI *DAG;
const TargetSchedModel *SchedModel;
const TargetRegisterInfo *TRI;
@@ -1408,6 +1623,7 @@ private:
SchedBoundary Top;
SchedBoundary Bot;
+ MachineSchedPolicy RegionPolicy;
public:
/// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
enum {
@@ -1416,8 +1632,15 @@ public:
LogMaxQID = 2
};
- ConvergingScheduler():
- DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+ GenericScheduler(const MachineSchedContext *C):
+ Context(C), DAG(0), SchedModel(0), TRI(0),
+ Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+ virtual void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs);
+
+ bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; }
virtual void initialize(ScheduleDAGMI *dag);
@@ -1432,6 +1655,8 @@ public:
virtual void registerRoots();
protected:
+ void checkAcyclicLatency();
+
void tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary &Zone,
@@ -1452,7 +1677,7 @@ protected:
};
} // namespace
-void ConvergingScheduler::SchedRemainder::
+void GenericScheduler::SchedRemainder::
init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
reset();
if (!SchedModel->hasInstrSchedModel())
@@ -1473,7 +1698,7 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
}
}
-void ConvergingScheduler::SchedBoundary::
+void GenericScheduler::SchedBoundary::
init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
reset();
DAG = dag;
@@ -1483,7 +1708,49 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
}
-void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+/// Initialize the per-region scheduling policy.
+void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ const TargetMachine &TM = Context->MF->getTarget();
+
+ // Avoid setting up the register pressure tracker for small regions to save
+ // compile time. As a rough heuristic, only track pressure when the number of
+ // schedulable instructions exceeds half the integer register file.
+ unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
+ TM.getTargetLowering()->getRegClassFor(MVT::i32));
+
+ RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
+
+ // For generic targets, we default to bottom-up, because it's simpler and more
+ // compile-time optimizations have been implemented in that direction.
+ RegionPolicy.OnlyBottomUp = true;
+
+ // Allow the subtarget to override default policy.
+ const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+ ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
+
+ // After subtarget overrides, apply command line options.
+ if (!EnableRegPressure)
+ RegionPolicy.ShouldTrackPressure = false;
+
+ // Check whether -misched-topdown/bottomup can force or unforce the scheduling direction.
+ // e.g. -misched-bottomup=false allows scheduling in both directions.
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ if (ForceBottomUp.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyBottomUp = ForceBottomUp;
+ if (RegionPolicy.OnlyBottomUp)
+ RegionPolicy.OnlyTopDown = false;
+ }
+ if (ForceTopDown.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyTopDown = ForceTopDown;
+ if (RegionPolicy.OnlyTopDown)
+ RegionPolicy.OnlyBottomUp = false;
+ }
+}
+
+void GenericScheduler::initialize(ScheduleDAGMI *dag) {
DAG = dag;
SchedModel = DAG->getSchedModel();
TRI = DAG->TRI;
@@ -1498,14 +1765,17 @@ void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
// are disabled, then these HazardRecs will be disabled.
const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
const TargetMachine &TM = DAG->MF.getTarget();
- Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
- Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-
- assert((!ForceTopDown || !ForceBottomUp) &&
- "-misched-topdown incompatible with -misched-bottomup");
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ }
+ if (!Bot.HazardRec) {
+ Bot.HazardRec =
+ TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ }
}
-void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+void GenericScheduler::releaseTopNode(SUnit *SU) {
if (SU->isScheduled)
return;
@@ -1524,7 +1794,7 @@ void ConvergingScheduler::releaseTopNode(SUnit *SU) {
Top.releaseNode(SU, SU->TopReadyCycle);
}
-void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+void GenericScheduler::releaseBottomNode(SUnit *SU) {
if (SU->isScheduled)
return;
@@ -1545,8 +1815,46 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
Bot.releaseNode(SU, SU->BotReadyCycle);
}
-void ConvergingScheduler::registerRoots() {
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bound on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
+void GenericScheduler::checkAcyclicLatency() {
+ if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+ return;
+
+ // Scaled number of cycles per loop iteration.
+ unsigned IterCount =
+ std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+ Rem.RemIssueCount);
+ // Scaled acyclic critical path.
+ unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+ // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+ unsigned InFlightCount =
+ (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
+ unsigned BufferLimit =
+ SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+
+ Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+ DEBUG(dbgs() << "IssueCycles="
+ << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+ << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+ << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
+ << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+ << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+ if (Rem.IsAcyclicLatencyLimited)
+ dbgs() << " ACYCLIC LATENCY LIMIT\n");
+}
+
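To make the formulas above concrete, the following throwaway snippet replays the arithmetic in checkAcyclicLatency() with made-up numbers (none of them come from a real schedule model):

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical inputs, chosen only to exercise the estimate.
  unsigned CyclicCritPath = 2, CriticalPath = 12;  // cycles
  unsigned LatencyFactor = 1, MicroOpFactor = 1;   // schedule-model scaling
  unsigned RemIssueCount = 6;                      // scaled micro-ops remaining
  unsigned MicroOpBufferSize = 4;

  unsigned IterCount =
      std::max(CyclicCritPath * LatencyFactor, RemIssueCount);               // 6
  unsigned AcyclicCount = CriticalPath * LatencyFactor;                      // 12
  unsigned InFlightCount =
      (AcyclicCount * RemIssueCount + IterCount - 1) / IterCount;            // 12
  unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;                  // 4
  std::printf("InFlight=%u BufferLim=%u limited=%d\n", InFlightCount,
              BufferLimit, InFlightCount > BufferLimit);                     // limited=1
  return 0;
}
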
+void GenericScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
+
// Some roots may not feed into ExitSU. Check all of them in case.
for (std::vector<SUnit*>::const_iterator
I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -1554,6 +1862,11 @@ void ConvergingScheduler::registerRoots() {
Rem.CriticalPath = (*I)->getDepth();
}
DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+
+ if (EnableCyclicPath) {
+ Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+ checkAcyclicLatency();
+ }
}
/// Does this SU have a hazard within the current instruction group.
@@ -1569,7 +1882,7 @@ void ConvergingScheduler::registerRoots() {
/// can dispatch per cycle.
///
/// TODO: Also check whether the SU must start a new group.
-bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+bool GenericScheduler::SchedBoundary::checkHazard(SUnit *SU) {
if (HazardRec->isEnabled())
return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
@@ -1583,7 +1896,7 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
}
// Find the unscheduled node in ReadySUs with the highest latency.
-unsigned ConvergingScheduler::SchedBoundary::
+unsigned GenericScheduler::SchedBoundary::
findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
SUnit *LateSU = 0;
unsigned RemLatency = 0;
@@ -1605,7 +1918,7 @@ findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
// Count resources in this zone and the remaining unscheduled
// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
// resource index, or zero if the zone is issue limited.
-unsigned ConvergingScheduler::SchedBoundary::
+unsigned GenericScheduler::SchedBoundary::
getOtherResourceCount(unsigned &OtherCritIdx) {
OtherCritIdx = 0;
if (!SchedModel->hasInstrSchedModel())
@@ -1633,7 +1946,7 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
/// Set the CandPolicy for this zone given the current resources and latencies
/// inside and outside the zone.
-void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
+void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
SchedBoundary &OtherZone) {
// Now that potential stalls have been considered, apply preemptive heuristics
// based on the total latency and resources inside and outside this
@@ -1692,7 +2005,7 @@ void ConvergingScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
Policy.DemandResIdx = OtherCritIdx;
}
-void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
+void GenericScheduler::SchedBoundary::releaseNode(SUnit *SU,
unsigned ReadyCycle) {
if (ReadyCycle < MinReadyCycle)
MinReadyCycle = ReadyCycle;
@@ -1710,7 +2023,7 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
}
/// Move the boundary of scheduled code by one cycle.
-void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
+void GenericScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
if (SchedModel->getMicroOpBufferSize() == 0) {
assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
if (MinReadyCycle > NextCycle)
@@ -1748,7 +2061,7 @@ void ConvergingScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
}
-void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
+void GenericScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
unsigned Count) {
ExecutedResCounts[PIdx] += Count;
if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
@@ -1762,7 +2075,7 @@ void ConvergingScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
///
/// \return the next cycle at which the instruction may execute without
/// oversubscribing resources.
-unsigned ConvergingScheduler::SchedBoundary::
+unsigned GenericScheduler::SchedBoundary::
countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
unsigned Factor = SchedModel->getResourceFactor(PIdx);
unsigned Count = Factor * Cycles;
@@ -1787,7 +2100,7 @@ countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
}
/// Move the boundary of scheduled code by one SUnit.
-void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
// Update the reservation table.
if (HazardRec->isEnabled()) {
if (!isTop() && SU->isCall) {
@@ -1891,7 +2204,7 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
/// Release pending ready nodes in to the available queue. This makes them
/// visible to heuristics.
-void ConvergingScheduler::SchedBoundary::releasePending() {
+void GenericScheduler::SchedBoundary::releasePending() {
// If the available queue is empty, it is safe to reset MinReadyCycle.
if (Available.empty())
MinReadyCycle = UINT_MAX;
@@ -1921,7 +2234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() {
}
/// Remove SU from the ready set for this boundary.
-void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+void GenericScheduler::SchedBoundary::removeReady(SUnit *SU) {
if (Available.isInQueue(SU))
Available.remove(Available.find(SU));
else {
@@ -1933,7 +2246,7 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
/// If this queue only has one ready candidate, return it. As a side effect,
/// defer any nodes that now hit a hazard, and advance the cycle until at least
/// one node is ready. If multiple instructions are ready, return NULL.
-SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+SUnit *GenericScheduler::SchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
@@ -1962,7 +2275,7 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
#ifndef NDEBUG
// This is useful information to dump after bumpNode.
// Note that the Queue contents are more useful before pickNodeFromQueue.
-void ConvergingScheduler::SchedBoundary::dumpScheduledState() {
+void GenericScheduler::SchedBoundary::dumpScheduledState() {
unsigned ResFactor;
unsigned ResCount;
if (ZoneCritResIdx) {
@@ -1985,7 +2298,7 @@ void ConvergingScheduler::SchedBoundary::dumpScheduledState() {
}
#endif
-void ConvergingScheduler::SchedCandidate::
+void GenericScheduler::SchedCandidate::
initResourceDelta(const ScheduleDAGMI *DAG,
const TargetSchedModel *SchedModel) {
if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
@@ -2005,9 +2318,9 @@ initResourceDelta(const ScheduleDAGMI *DAG,
/// Return true if this heuristic determines order.
static bool tryLess(int TryVal, int CandVal,
- ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::CandReason Reason) {
+ GenericScheduler::SchedCandidate &TryCand,
+ GenericScheduler::SchedCandidate &Cand,
+ GenericScheduler::CandReason Reason) {
if (TryVal < CandVal) {
TryCand.Reason = Reason;
return true;
@@ -2022,9 +2335,9 @@ static bool tryLess(int TryVal, int CandVal,
}
static bool tryGreater(int TryVal, int CandVal,
- ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::CandReason Reason) {
+ GenericScheduler::SchedCandidate &TryCand,
+ GenericScheduler::SchedCandidate &Cand,
+ GenericScheduler::CandReason Reason) {
if (TryVal > CandVal) {
TryCand.Reason = Reason;
return true;
@@ -2038,26 +2351,26 @@ static bool tryGreater(int TryVal, int CandVal,
return false;
}
-static bool tryPressure(const PressureElement &TryP,
- const PressureElement &CandP,
- ConvergingScheduler::SchedCandidate &TryCand,
- ConvergingScheduler::SchedCandidate &Cand,
- ConvergingScheduler::CandReason Reason) {
+static bool tryPressure(const PressureChange &TryP,
+ const PressureChange &CandP,
+ GenericScheduler::SchedCandidate &TryCand,
+ GenericScheduler::SchedCandidate &Cand,
+ GenericScheduler::CandReason Reason) {
+ int TryRank = TryP.getPSetOrMax();
+ int CandRank = CandP.getPSetOrMax();
// If both candidates affect the same set, go with the smallest increase.
- if (TryP.PSetID == CandP.PSetID) {
- return tryLess(TryP.UnitIncrease, CandP.UnitIncrease, TryCand, Cand,
+ if (TryRank == CandRank) {
+ return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
Reason);
}
// If one candidate decreases and the other increases, go with it.
- if (tryLess(TryP.UnitIncrease < 0, CandP.UnitIncrease < 0, TryCand, Cand,
+ // Invalid candidates have UnitInc==0.
+ if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
Reason)) {
return true;
}
- // If TryP has lower Rank, it has a higher priority.
- int TryRank = TryP.PSetRank();
- int CandRank = CandP.PSetRank();
// If the candidates are decreasing pressure, reverse priority.
- if (TryP.UnitIncrease < 0)
+ if (TryP.getUnitInc() < 0)
std::swap(TryRank, CandRank);
return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
}
@@ -2094,6 +2407,32 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
return 0;
}
+static bool tryLatency(GenericScheduler::SchedCandidate &TryCand,
+ GenericScheduler::SchedCandidate &Cand,
+ GenericScheduler::SchedBoundary &Zone) {
+ if (Zone.isTop()) {
+ if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericScheduler::TopDepthReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericScheduler::TopPathReduce))
+ return true;
+ }
+ else {
+ if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericScheduler::BotHeightReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericScheduler::BotPathReduce))
+ return true;
+ }
+ return false;
+}
+
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
/// hierarchical. This may be more efficient than a graduated cost model because
/// we don't need to evaluate all aspects of the model for each node in the
@@ -2105,16 +2444,44 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
/// \param Zone describes the scheduled zone that we are extending.
/// \param RPTracker describes reg pressure within the scheduled zone.
/// \param TempTracker is a scratch pressure tracker to reuse in queries.
-void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
+void GenericScheduler::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary &Zone,
const RegPressureTracker &RPTracker,
RegPressureTracker &TempTracker) {
- // Always initialize TryCand's RPDelta.
- TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
- DAG->getRegionCriticalPSets(),
- DAG->getRegPressure().MaxSetPressure);
+ if (DAG->isTrackingPressure()) {
+ // Always initialize TryCand's RPDelta.
+ if (Zone.isTop()) {
+ TempTracker.getMaxDownwardPressureDelta(
+ TryCand.SU->getInstr(),
+ TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
+ else {
+ if (VerifyScheduling) {
+ TempTracker.getMaxUpwardPressureDelta(
+ TryCand.SU->getInstr(),
+ &DAG->getPressureDiff(TryCand.SU),
+ TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
+ else {
+ RPTracker.getUpwardPressureDelta(
+ TryCand.SU->getInstr(),
+ DAG->getPressureDiff(TryCand.SU),
+ TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
+ }
+ }
+ DEBUG(if (TryCand.RPDelta.Excess.isValid())
+ dbgs() << " SU(" << TryCand.SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
+ << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
// Initialize the candidate if needed.
if (!Cand.isValid()) {
@@ -2129,13 +2496,22 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
// Avoid exceeding the target's limit. If signed PSetID is negative, it is
// invalid; convert it to INT_MAX to give it lowest priority.
- if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
- RegExcess))
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
+ Cand.RPDelta.Excess,
+ TryCand, Cand, RegExcess))
return;
// Avoid increasing the max critical pressure in the scheduled region.
- if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
- TryCand, Cand, RegCritical))
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
+ Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical))
+ return;
+
+ // For loops that are acyclic path limited, aggressively schedule for latency.
+ // This can result in very long dependence chains scheduled in sequence, so
+ // once every cycle (when CurrMOps == 0), switch to normal heuristics.
+ if (Rem.IsAcyclicLatencyLimited && !Zone.CurrMOps
+ && tryLatency(TryCand, Cand, Zone))
return;
// Keep clustered nodes together to encourage downstream peephole
@@ -2157,8 +2533,9 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
return;
}
// Avoid increasing the max pressure of the entire region.
- if (tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax,
- TryCand, Cand, RegMax))
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
+ Cand.RPDelta.CurrentMax,
+ TryCand, Cand, RegMax))
return;
// Avoid critical resource consumption and balance the schedule.
@@ -2172,27 +2549,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
return;
// Avoid serializing long latency dependence chains.
- if (Cand.Policy.ReduceLatency) {
- if (Zone.isTop()) {
- if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
- if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
- TryCand, Cand, TopDepthReduce))
- return;
- }
- if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
- TryCand, Cand, TopPathReduce))
- return;
- }
- else {
- if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
- if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
- TryCand, Cand, BotHeightReduce))
- return;
- }
- if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
- TryCand, Cand, BotPathReduce))
- return;
- }
+ // For acyclic path limited loops, latency was already checked above.
+ if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
+ && tryLatency(TryCand, Cand, Zone)) {
+ return;
}
// Prefer immediate defs/users of the last scheduled instruction. This is a
@@ -2210,8 +2570,8 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
}
#ifndef NDEBUG
-const char *ConvergingScheduler::getReasonStr(
- ConvergingScheduler::CandReason Reason) {
+const char *GenericScheduler::getReasonStr(
+ GenericScheduler::CandReason Reason) {
switch (Reason) {
case NoCand: return "NOCAND ";
case PhysRegCopy: return "PREG-COPY";
@@ -2232,8 +2592,8 @@ const char *ConvergingScheduler::getReasonStr(
llvm_unreachable("Unknown reason!");
}
-void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
- PressureElement P;
+void GenericScheduler::traceCandidate(const SchedCandidate &Cand) {
+ PressureChange P;
unsigned ResIdx = 0;
unsigned Latency = 0;
switch (Cand.Reason) {
@@ -2269,8 +2629,8 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
}
dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
if (P.isValid())
- dbgs() << " " << TRI->getRegPressureSetName(P.PSetID)
- << ":" << P.UnitIncrease << " ";
+ dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
+ << ":" << P.getUnitInc() << " ";
else
dbgs() << " ";
if (ResIdx)
@@ -2285,12 +2645,12 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
}
#endif
-/// Pick the best candidate from the top queue.
+/// Pick the best candidate from the queue.
///
/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
-void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
ReadyQueue &Q = Zone.Available;
@@ -2315,14 +2675,14 @@ void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
}
}
-static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
+static void tracePick(const GenericScheduler::SchedCandidate &Cand,
bool IsTop) {
DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
- << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
+ << GenericScheduler::getReasonStr(Cand.Reason) << '\n');
}
/// Pick the best candidate node from either the top or bottom queue.
-SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -2377,7 +2737,7 @@ SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
}
/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
-SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
@@ -2385,24 +2745,26 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
}
SUnit *SU;
do {
- if (ForceTopDown) {
+ if (RegionPolicy.OnlyTopDown) {
SU = Top.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
SchedCandidate TopCand(NoPolicy);
pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
- assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand, true);
SU = TopCand.SU;
}
IsTopNode = true;
}
- else if (ForceBottomUp) {
+ else if (RegionPolicy.OnlyBottomUp) {
SU = Bot.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
SchedCandidate BotCand(NoPolicy);
pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
- assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(BotCand, false);
SU = BotCand.SU;
}
IsTopNode = false;
@@ -2421,7 +2783,7 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
return SU;
}
-void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
+void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
MachineBasicBlock::iterator InsertPos = SU->getInstr();
if (!isTop)
@@ -2452,7 +2814,7 @@ void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
///
/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
/// them here. See comments in biasPhysRegCopy.
-void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
if (IsTopNode) {
SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle);
Top.bumpNode(SU);
@@ -2469,25 +2831,23 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
/// Create the standard converging machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
- assert((!ForceTopDown || !ForceBottomUp) &&
- "-misched-topdown incompatible with -misched-bottomup");
- ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C) {
+ ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new GenericScheduler(C));
// Register DAG post-processors.
//
// FIXME: extend the mutation API to allow earlier mutations to instantiate
// data and pass it to later mutations. Have a single mutation that gathers
// the interesting nodes in one pass.
DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
- if (EnableLoadCluster)
+ if (EnableLoadCluster && DAG->TII->enableClusterLoads())
DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
if (EnableMacroFusion)
DAG->addMutation(new MacroFusion(DAG->TII));
return DAG;
}
static MachineSchedRegistry
-ConvergingSchedRegistry("converge", "Standard converging scheduler.",
- createConvergingSched);
+GenericSchedRegistry("converge", "Standard converging scheduler.",
+ createGenericSched);
//===----------------------------------------------------------------------===//
// ILP Scheduler. Currently for experimental analysis of heuristics.
@@ -2529,15 +2889,6 @@ struct ILPOrder {
/// \brief Schedule based on the ILP metric.
class ILPScheduler : public MachineSchedStrategy {
- /// In case all subtrees are eventually connected to a common root through
- /// data dependence (e.g. reduction), place an upper limit on their size.
- ///
- /// FIXME: A subtree limit is generally good, but in the situation commented
- /// above, where multiple similar subtrees feed a common root, we should
- /// only split at a point where the resulting subtrees will be balanced.
- /// (a motivating test case must be found).
- static const unsigned SubtreeLimit = 16;
-
ScheduleDAGMI *DAG;
ILPOrder Cmp;
@@ -2721,7 +3072,7 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
}
static bool isNodeHidden(const SUnit *Node) {
- return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ return (Node->Preds.size() > 10 || Node->Succs.size() > 10);
}
static bool hasNodeAddressLabel(const SUnit *Node,
@@ -2744,7 +3095,11 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
std::string Str;
raw_string_ostream SS(Str);
- SS << "SU(" << SU->NodeNum << ')';
+ const SchedDFSResult *DFS =
+ static_cast<const ScheduleDAGMI*>(G)->getDFSResult();
+ SS << "SU:" << SU->NodeNum;
+ if (DFS)
+ SS << " I:" << DFS->getNumInstrs(SU);
return SS.str();
}
static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index dacdbdd..105d7c2 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -308,12 +308,29 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
// to be sunk then it's probably worth it.
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg()) continue;
+ if (!MO.isReg() || !MO.isUse())
+ continue;
unsigned Reg = MO.getReg();
- if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Reg == 0)
continue;
- if (MRI->hasOneNonDBGUse(Reg))
- return true;
+
+ // We don't move live definitions of physical registers,
+ // so sinking their uses won't enable any opportunities.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+
+ // If this instruction is the only user of a virtual register,
+ // check if breaking the edge will enable sinking
+ // both this instruction and the defining instruction.
+ if (MRI->hasOneNonDBGUse(Reg)) {
+ // If the definition resides in same MBB,
+ // claim it's likely we can sink these together.
+ // If definition resides elsewhere, we aren't
+ // blocking it from being sunk so don't break the edge.
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (DefMI->getParent() == MI->getParent())
+ return true;
+ }
}
return false;
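As a rough standalone model of the heuristic described in the new comments above (the Operand type, worthBreakingEdge, and SingleUse are invented for this sketch and are not the LLVM API): breaking the critical edge is only claimed worthwhile when a single-use virtual register operand is defined in the same block as the candidate instruction.

// Toy model of the isWorthBreakingCriticalEdge register scan (illustrative only).
#include <vector>

struct Operand {
  bool IsReg, IsUse, IsPhysical;
  unsigned Reg;      // 0 means "no register"
  unsigned DefBlock; // block number of the defining instruction
};

// UseBlock is the block containing the candidate instruction; SingleUse(Reg)
// stands in for MRI->hasOneNonDBGUse(Reg).
bool worthBreakingEdge(const std::vector<Operand> &Ops, unsigned UseBlock,
                       bool (*SingleUse)(unsigned)) {
  for (const Operand &MO : Ops) {
    if (!MO.IsReg || !MO.IsUse || MO.Reg == 0)
      continue;
    if (MO.IsPhysical)
      continue;                           // physreg defs are never moved
    if (SingleUse(MO.Reg) && MO.DefBlock == UseBlock)
      return true;                        // def and use could sink together
  }
  return false;
}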
@@ -615,9 +632,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
- // If the block has multiple predecessors, this would introduce computation on
- // a path that it doesn't already exist. We could split the critical edge,
- // but for now we just punt.
+ // If the block has multiple predecessors, this is a critical edge.
+ // Decide if we can sink along it or need to break the edge.
if (SuccToSinkTo->pred_size() > 1) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index e74bfc8..d61470c 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -213,6 +213,10 @@ namespace {
const LiveInterval &LI);
void report(const char *msg, const MachineBasicBlock *MBB,
const LiveInterval &LI);
+ void report(const char *msg, const MachineFunction *MF,
+ const LiveRange &LR);
+ void report(const char *msg, const MachineBasicBlock *MBB,
+ const LiveRange &LR);
void verifyInlineAsm(const MachineInstr *MI);
@@ -225,9 +229,10 @@ namespace {
void verifyLiveVariables();
void verifyLiveIntervals();
void verifyLiveInterval(const LiveInterval&);
- void verifyLiveIntervalValue(const LiveInterval&, VNInfo*);
- void verifyLiveIntervalSegment(const LiveInterval&,
- LiveInterval::const_iterator);
+ void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned);
+ void verifyLiveRangeSegment(const LiveRange&,
+ const LiveRange::const_iterator I, unsigned);
+ void verifyLiveRange(const LiveRange&, unsigned);
void verifyStackFrame();
};
@@ -414,23 +419,25 @@ void MachineVerifier::report(const char *msg,
void MachineVerifier::report(const char *msg, const MachineFunction *MF,
const LiveInterval &LI) {
report(msg, MF);
- *OS << "- interval: ";
- if (TargetRegisterInfo::isVirtualRegister(LI.reg))
- *OS << PrintReg(LI.reg, TRI);
- else
- *OS << PrintRegUnit(LI.reg, TRI);
- *OS << ' ' << LI << '\n';
+ *OS << "- interval: " << LI << '\n';
}
void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
const LiveInterval &LI) {
report(msg, MBB);
- *OS << "- interval: ";
- if (TargetRegisterInfo::isVirtualRegister(LI.reg))
- *OS << PrintReg(LI.reg, TRI);
- else
- *OS << PrintRegUnit(LI.reg, TRI);
- *OS << ' ' << LI << '\n';
+ *OS << "- interval: " << LI << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
+ const LiveRange &LR) {
+ report(msg, MBB);
+ *OS << "- liverange: " << LR << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF,
+ const LiveRange &LR) {
+ report(msg, MF);
+ *OS << "- liverange: " << LR << "\n";
}
void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
@@ -768,7 +775,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
if (MI->getNumOperands() < MCID.getNumOperands()) {
report("Too few operands", MI);
*OS << MCID.getNumOperands() << " operands expected, but "
- << MI->getNumExplicitOperands() << " given.\n";
+ << MI->getNumOperands() << " given.\n";
}
// Check the tied operands.
@@ -826,7 +833,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
if (MO->isReg() &&
!(MI->isVariadic() && MONum == MCID.getNumOperands()-1)) {
if (MO->isDef() && !MCOI.isOptionalDef())
- report("Explicit operand marked as def", MO, MONum);
+ report("Explicit operand marked as def", MO, MONum);
if (MO->isImplicit())
report("Explicit operand marked as implicit", MO, MONum);
}
@@ -1001,16 +1008,16 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
// Check the cached regunit intervals.
if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
- if (const LiveInterval *LI = LiveInts->getCachedRegUnit(*Units)) {
- LiveRangeQuery LRQ(*LI, UseIdx);
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units)) {
+ LiveQueryResult LRQ = LR->Query(UseIdx);
if (!LRQ.valueIn()) {
- report("No live range at use", MO, MONum);
+ report("No live segment at use", MO, MONum);
*OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
- << ' ' << *LI << '\n';
+ << ' ' << *LR << '\n';
}
if (MO->isKill() && !LRQ.isKill()) {
report("Live range continues after kill flag", MO, MONum);
- *OS << PrintRegUnit(*Units, TRI) << ' ' << *LI << '\n';
+ *OS << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
}
}
}
@@ -1020,9 +1027,9 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
if (LiveInts->hasInterval(Reg)) {
// This is a virtual register interval.
const LiveInterval &LI = LiveInts->getInterval(Reg);
- LiveRangeQuery LRQ(LI, UseIdx);
+ LiveQueryResult LRQ = LI.Query(UseIdx);
if (!LRQ.valueIn()) {
- report("No live range at use", MO, MONum);
+ report("No live segment at use", MO, MONum);
*OS << UseIdx << " is not live in " << LI << '\n';
}
// Check for extra kill flags.
@@ -1071,7 +1078,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
llvm::next(MRI->def_begin(Reg)) != MRI->def_end())
report("Multiple virtual register defs in SSA form", MO, MONum);
- // Check LiveInts for a live range, but only for virtual registers.
+ // Check LiveInts for a live segment, but only for virtual registers.
if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
!LiveInts->isNotInMIMap(MI)) {
SlotIndex DefIdx = LiveInts->getInstructionIndex(MI);
@@ -1086,9 +1093,17 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
<< DefIdx << " in " << LI << '\n';
}
} else {
- report("No live range at def", MO, MONum);
+ report("No live segment at def", MO, MONum);
*OS << DefIdx << " is not live in " << LI << '\n';
}
+ // Check that, if the dead def flag is present, LiveInts agree.
+ if (MO->isDead()) {
+ LiveQueryResult LRQ = LI.Query(DefIdx);
+ if (!LRQ.isDeadDef()) {
+ report("Live range continues after dead def flag", MO, MONum);
+ *OS << "Live range: " << LI << '\n';
+ }
+ }
} else {
report("Virtual register has no Live interval", MO, MONum);
}
@@ -1335,25 +1350,26 @@ void MachineVerifier::verifyLiveIntervals() {
// Verify all the cached regunit intervals.
for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
- if (const LiveInterval *LI = LiveInts->getCachedRegUnit(i))
- verifyLiveInterval(*LI);
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(i))
+ verifyLiveRange(*LR, i);
}
-void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
- VNInfo *VNI) {
+void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
+ const VNInfo *VNI,
+ unsigned Reg) {
if (VNI->isUnused())
return;
- const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def);
+ const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
if (!DefVNI) {
- report("Valno not live at def and not marked unused", MF, LI);
+ report("Valno not live at def and not marked unused", MF, LR);
*OS << "Valno #" << VNI->id << '\n';
return;
}
if (DefVNI != VNI) {
- report("Live range at def has different valno", MF, LI);
+ report("Live segment at def has different valno", MF, LR);
*OS << "Valno #" << VNI->id << " is defined at " << VNI->def
<< " where valno #" << DefVNI->id << " is live\n";
return;
@@ -1361,15 +1377,15 @@ void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
if (!MBB) {
- report("Invalid definition index", MF, LI);
+ report("Invalid definition index", MF, LR);
*OS << "Valno #" << VNI->id << " is defined at " << VNI->def
- << " in " << LI << '\n';
+ << " in " << LR << '\n';
return;
}
if (VNI->isPHIDef()) {
if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
- report("PHIDef value is not defined at MBB start", MBB, LI);
+ report("PHIDef value is not defined at MBB start", MBB, LR);
*OS << "Valno #" << VNI->id << " is defined at " << VNI->def
<< ", not at the beginning of BB#" << MBB->getNumber() << '\n';
}
@@ -1379,161 +1395,154 @@ void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
// Non-PHI def.
const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
if (!MI) {
- report("No instruction at def index", MBB, LI);
+ report("No instruction at def index", MBB, LR);
*OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
return;
}
- bool hasDef = false;
- bool isEarlyClobber = false;
- for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
- if (!MOI->isReg() || !MOI->isDef())
- continue;
- if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
- if (MOI->getReg() != LI.reg)
- continue;
- } else {
- if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
- !TRI->hasRegUnit(MOI->getReg(), LI.reg))
+ if (Reg != 0) {
+ bool hasDef = false;
+ bool isEarlyClobber = false;
+ for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MOI->getReg() != Reg)
+ continue;
+ } else {
+ if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+ !TRI->hasRegUnit(MOI->getReg(), Reg))
+ continue;
+ }
+ hasDef = true;
+ if (MOI->isEarlyClobber())
+ isEarlyClobber = true;
}
- hasDef = true;
- if (MOI->isEarlyClobber())
- isEarlyClobber = true;
- }
- if (!hasDef) {
- report("Defining instruction does not modify register", MI);
- *OS << "Valno #" << VNI->id << " in " << LI << '\n';
- }
+ if (!hasDef) {
+ report("Defining instruction does not modify register", MI);
+ *OS << "Valno #" << VNI->id << " in " << LR << '\n';
+ }
- // Early clobber defs begin at USE slots, but other defs must begin at
- // DEF slots.
- if (isEarlyClobber) {
- if (!VNI->def.isEarlyClobber()) {
- report("Early clobber def must be at an early-clobber slot", MBB, LI);
+ // Early clobber defs begin at USE slots, but other defs must begin at
+ // DEF slots.
+ if (isEarlyClobber) {
+ if (!VNI->def.isEarlyClobber()) {
+ report("Early clobber def must be at an early-clobber slot", MBB, LR);
+ *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+ }
+ } else if (!VNI->def.isRegister()) {
+ report("Non-PHI, non-early clobber def must be at a register slot",
+ MBB, LR);
*OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
}
- } else if (!VNI->def.isRegister()) {
- report("Non-PHI, non-early clobber def must be at a register slot",
- MBB, LI);
- *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
}
}
-void
-MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
- LiveInterval::const_iterator I) {
- const VNInfo *VNI = I->valno;
- assert(VNI && "Live range has no valno");
-
- if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) {
- report("Foreign valno in live range", MF, LI);
- *OS << *I << " has a bad valno\n";
+void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
+ const LiveRange::const_iterator I,
+ unsigned Reg) {
+ const LiveRange::Segment &S = *I;
+ const VNInfo *VNI = S.valno;
+ assert(VNI && "Live segment has no valno");
+
+ if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
+ report("Foreign valno in live segment", MF, LR);
+ *OS << S << " has a bad valno\n";
}
if (VNI->isUnused()) {
- report("Live range valno is marked unused", MF, LI);
- *OS << *I << '\n';
+ report("Live segment valno is marked unused", MF, LR);
+ *OS << S << '\n';
}
- const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start);
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
if (!MBB) {
- report("Bad start of live segment, no basic block", MF, LI);
- *OS << *I << '\n';
+ report("Bad start of live segment, no basic block", MF, LR);
+ *OS << S << '\n';
return;
}
SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
- if (I->start != MBBStartIdx && I->start != VNI->def) {
- report("Live segment must begin at MBB entry or valno def", MBB, LI);
- *OS << *I << '\n';
+ if (S.start != MBBStartIdx && S.start != VNI->def) {
+ report("Live segment must begin at MBB entry or valno def", MBB, LR);
+ *OS << S << '\n';
}
const MachineBasicBlock *EndMBB =
- LiveInts->getMBBFromIndex(I->end.getPrevSlot());
+ LiveInts->getMBBFromIndex(S.end.getPrevSlot());
if (!EndMBB) {
- report("Bad end of live segment, no basic block", MF, LI);
- *OS << *I << '\n';
+ report("Bad end of live segment, no basic block", MF, LR);
+ *OS << S << '\n';
return;
}
// No more checks for live-out segments.
- if (I->end == LiveInts->getMBBEndIdx(EndMBB))
+ if (S.end == LiveInts->getMBBEndIdx(EndMBB))
return;
// RegUnit intervals are allowed dead phis.
- if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && VNI->isPHIDef() &&
- I->start == VNI->def && I->end == VNI->def.getDeadSlot())
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) && VNI->isPHIDef() &&
+ S.start == VNI->def && S.end == VNI->def.getDeadSlot())
return;
// The live segment is ending inside EndMBB
const MachineInstr *MI =
- LiveInts->getInstructionFromIndex(I->end.getPrevSlot());
+ LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
if (!MI) {
- report("Live segment doesn't end at a valid instruction", EndMBB, LI);
- *OS << *I << '\n';
+ report("Live segment doesn't end at a valid instruction", EndMBB, LR);
+ *OS << S << '\n';
return;
}
// The block slot must refer to a basic block boundary.
- if (I->end.isBlock()) {
- report("Live segment ends at B slot of an instruction", EndMBB, LI);
- *OS << *I << '\n';
+ if (S.end.isBlock()) {
+ report("Live segment ends at B slot of an instruction", EndMBB, LR);
+ *OS << S << '\n';
}
- if (I->end.isDead()) {
+ if (S.end.isDead()) {
// Segment ends on the dead slot.
// That means there must be a dead def.
- if (!SlotIndex::isSameInstr(I->start, I->end)) {
- report("Live segment ending at dead slot spans instructions", EndMBB, LI);
- *OS << *I << '\n';
+ if (!SlotIndex::isSameInstr(S.start, S.end)) {
+ report("Live segment ending at dead slot spans instructions", EndMBB, LR);
+ *OS << S << '\n';
}
}
// A live segment can only end at an early-clobber slot if it is being
// redefined by an early-clobber def.
- if (I->end.isEarlyClobber()) {
- if (I+1 == LI.end() || (I+1)->start != I->end) {
+ if (S.end.isEarlyClobber()) {
+ if (I+1 == LR.end() || (I+1)->start != S.end) {
report("Live segment ending at early clobber slot must be "
- "redefined by an EC def in the same instruction", EndMBB, LI);
- *OS << *I << '\n';
+ "redefined by an EC def in the same instruction", EndMBB, LR);
+ *OS << S << '\n';
}
}
// The following checks only apply to virtual registers. Physreg liveness
// is too weird to check.
- if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
- // A live range can end with either a redefinition, a kill flag on a
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // A live segment can end with either a redefinition, a kill flag on a
// use, or a dead flag on a def.
bool hasRead = false;
- bool hasDeadDef = false;
for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
- if (!MOI->isReg() || MOI->getReg() != LI.reg)
+ if (!MOI->isReg() || MOI->getReg() != Reg)
continue;
if (MOI->readsReg())
hasRead = true;
- if (MOI->isDef() && MOI->isDead())
- hasDeadDef = true;
}
-
- if (I->end.isDead()) {
- if (!hasDeadDef) {
- report("Instruction doesn't have a dead def operand", MI);
- I->print(*OS);
- *OS << " in " << LI << '\n';
- }
- } else {
+ if (!S.end.isDead()) {
if (!hasRead) {
- report("Instruction ending live range doesn't read the register", MI);
- *OS << *I << " in " << LI << '\n';
+ report("Instruction ending live segment doesn't read the register", MI);
+ *OS << S << " in " << LR << '\n';
}
}
}
// Now check all the basic blocks in this live segment.
MachineFunction::const_iterator MFI = MBB;
- // Is this live range the beginning of a non-PHIDef VN?
- if (I->start == VNI->def && !VNI->isPHIDef()) {
+ // Is this live segment the beginning of a non-PHIDef VN?
+ if (S.start == VNI->def && !VNI->isPHIDef()) {
// Not live-in to any blocks.
if (MBB == EndMBB)
return;
@@ -1541,9 +1550,9 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
++MFI;
}
for (;;) {
- assert(LiveInts->isLiveInToMBB(LI, MFI));
+ assert(LiveInts->isLiveInToMBB(LR, MFI));
// We don't know how to track physregs into a landing pad.
- if (!TargetRegisterInfo::isVirtualRegister(LI.reg) &&
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) &&
MFI->isLandingPad()) {
if (&*MFI == EndMBB)
break;
@@ -1559,11 +1568,11 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
PE = MFI->pred_end(); PI != PE; ++PI) {
SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
- const VNInfo *PVNI = LI.getVNInfoBefore(PEnd);
+ const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
// All predecessors must have a live-out value.
if (!PVNI) {
- report("Register not marked live out of predecessor", *PI, LI);
+ report("Register not marked live out of predecessor", *PI, LR);
*OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
<< '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
<< PEnd << '\n';
@@ -1572,7 +1581,7 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
// Only PHI-defs can take different predecessor values.
if (!IsPHI && PVNI != VNI) {
- report("Different value live out of predecessor", *PI, LI);
+ report("Different value live out of predecessor", *PI, LR);
*OS << "Valno #" << PVNI->id << " live out of BB#"
<< (*PI)->getNumber() << '@' << PEnd
<< "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
@@ -1585,13 +1594,17 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
}
}
-void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
- for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
- I!=E; ++I)
- verifyLiveIntervalValue(LI, *I);
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg) {
+ for (LiveRange::const_vni_iterator I = LR.vni_begin(), E = LR.vni_end();
+ I != E; ++I)
+ verifyLiveRangeValue(LR, *I, Reg);
+
+ for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
+ verifyLiveRangeSegment(LR, I, Reg);
+}
- for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I)
- verifyLiveIntervalSegment(LI, I);
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+ verifyLiveRange(LI, LI.reg);
// Check the LI only has one connected component.
if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index bf23eca..dcd9072 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -313,14 +313,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (IncomingReg) {
// Add the region from the beginning of MBB to the copy instruction to
// IncomingReg's live interval.
- LiveInterval &IncomingLI = LIS->getOrCreateInterval(IncomingReg);
+ LiveInterval &IncomingLI = LIS->createEmptyInterval(IncomingReg);
VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
if (!IncomingVNI)
IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
LIS->getVNInfoAllocator());
- IncomingLI.addRange(LiveRange(MBBStartIndex,
- DestCopyIndex.getRegSlot(),
- IncomingVNI));
+ IncomingLI.addSegment(LiveInterval::Segment(MBBStartIndex,
+ DestCopyIndex.getRegSlot(),
+ IncomingVNI));
}
LiveInterval &DestLI = LIS->getInterval(DestReg);
@@ -332,14 +332,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// the copy instruction.
VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
assert(OrigDestVNI && "PHI destination should be live at block entry.");
- DestLI.removeRange(MBBStartIndex, MBBStartIndex.getDeadSlot());
+ DestLI.removeSegment(MBBStartIndex, MBBStartIndex.getDeadSlot());
DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
LIS->getVNInfoAllocator());
DestLI.removeValNo(OrigDestVNI);
} else {
// Otherwise, remove the region from the beginning of MBB to the copy
// instruction from DestReg's live interval.
- DestLI.removeRange(MBBStartIndex, DestCopyIndex.getRegSlot());
+ DestLI.removeSegment(MBBStartIndex, DestCopyIndex.getRegSlot());
VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
assert(DestVNI && "PHI destination should be live at its definition.");
DestVNI->def = DestCopyIndex.getRegSlot();
@@ -460,7 +460,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (LIS) {
if (NewSrcInstr) {
LIS->InsertMachineInstrInMaps(NewSrcInstr);
- LIS->addLiveRangeToEndOfBlock(IncomingReg, NewSrcInstr);
+ LIS->addSegmentToEndOfBlock(IncomingReg, NewSrcInstr);
}
if (!SrcUndef &&
@@ -511,8 +511,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
"Cannot find kill instruction");
SlotIndex LastUseIndex = LIS->getInstructionIndex(KillInst);
- SrcLI.removeRange(LastUseIndex.getRegSlot(),
- LIS->getMBBEndIdx(&opBlock));
+ SrcLI.removeSegment(LastUseIndex.getRegSlot(),
+ LIS->getMBBEndIdx(&opBlock));
}
}
}
diff --git a/lib/CodeGen/PHIEliminationUtils.h b/lib/CodeGen/PHIEliminationUtils.h
index 9ac47fb..48234ae 100644
--- a/lib/CodeGen/PHIEliminationUtils.h
+++ b/lib/CodeGen/PHIEliminationUtils.h
@@ -1,4 +1,4 @@
-//=- PHIEliminationUtils.h - Helper functions for PHI elimination *- C++ -*--=//
+//=- PHIEliminationUtils.h - Helper functions for PHI elimination -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index c0861c5..f4ffd03 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -58,8 +58,6 @@ OptimizeRegAlloc("optimize-regalloc", cl::Hidden,
static cl::opt<cl::boolOrDefault>
EnableMachineSched("enable-misched", cl::Hidden,
cl::desc("Enable the machine instruction scheduling pass."));
-static cl::opt<bool> EnableStrongPHIElim("strong-phi-elim", cl::Hidden,
- cl::desc("Use strong PHI elimination."));
static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
cl::Hidden,
cl::desc("Disable Machine LICM"));
@@ -236,7 +234,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
// Temporarily disable experimental passes.
const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
- if (!ST.enableMachineScheduler())
+ if (!ST.useMachineScheduler())
disablePass(&MachineSchedulerID);
}
@@ -675,24 +673,15 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// preferably fix the scavenger to not depend on them).
addPass(&LiveVariablesID);
- // Add passes that move from transformed SSA into conventional SSA. This is a
- // "copy coalescing" problem.
- //
- if (!EnableStrongPHIElim) {
- // Edge splitting is smarter with machine loop info.
- addPass(&MachineLoopInfoID);
- addPass(&PHIEliminationID);
- }
+ // Edge splitting is smarter with machine loop info.
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
// Eventually, we want to run LiveIntervals before PHI elimination.
if (EarlyLiveIntervals)
addPass(&LiveIntervalsID);
addPass(&TwoAddressInstructionPassID);
-
- if (EnableStrongPHIElim)
- addPass(&StrongPHIEliminationID);
-
addPass(&RegisterCoalescerID);
// PreRA instruction scheduling.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index a7439b5..28f2d2f 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -40,20 +40,30 @@
// If the branch instruction can use flag from "sub", then we can replace
// "sub" with "subs" and eliminate the "cmp" instruction.
//
-// - Optimize Bitcast pairs:
-//
-// v1 = bitcast v0
-// v2 = bitcast v1
-// = v2
-// =>
-// v1 = bitcast v0
-// = v0
-//
// - Optimize Loads:
//
// Loads that can be folded into a later instruction. A load is foldable
// if it loads to virtual registers and the virtual register defined has
// a single use.
+//
+// - Optimize Copies and Bitcast:
+//
+// Rewrite copies and bitcasts to avoid cross register bank copies
+// when possible.
+// E.g., consider the following example, where capital and lowercase
+// letters denote different register files:
+// b = copy A <-- cross-bank copy
+// C = copy b <-- cross-bank copy
+// =>
+// b = copy A <-- cross-bank copy
+// C = copy A <-- same-bank copy
+//
+// E.g., for bitcast:
+// b = bitcast A <-- cross-bank copy
+// C = bitcast b <-- cross-bank copy
+// =>
+// b = bitcast A <-- cross-bank copy
+// C = copy A <-- same-bank copy
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "peephole-opt"
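A rough standalone model of the chain-following idea in the header comment above (the Bank and VReg types and findSameBankSource are invented for this sketch; the real pass works on MachineInstr copies and register classes):

// Toy model of the cross-register-bank copy rewrite (illustrative only).
#include <map>
#include <optional>

enum class Bank { GPR, FPR };
struct VReg { unsigned Id; Bank RegBank; };

// Maps a vreg id to the vreg it was copied or bitcast from.
using DefChain = std::map<unsigned, VReg>;

// Walk up the copy chain from Src and return the first source whose bank
// matches DefBank, if any -- that value is what the rewritten COPY should read.
std::optional<VReg> findSameBankSource(Bank DefBank, VReg Src,
                                       const DefChain &Defs) {
  for (;;) {
    if (Src.RegBank == DefBank)
      return Src;                 // same-bank source found; rewrite is useful
    auto It = Defs.find(Src.Id);
    if (It == Defs.end())
      return std::nullopt;        // reached the top of the chain
    Src = It->second;             // keep following copies/bitcasts
  }
}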
@@ -81,11 +91,11 @@ DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
cl::desc("Disable the peephole optimizer"));
STATISTIC(NumReuse, "Number of extension results reused");
-STATISTIC(NumBitcasts, "Number of bitcasts eliminated");
STATISTIC(NumCmps, "Number of compares eliminated");
STATISTIC(NumImmFold, "Number of move immediate folded");
STATISTIC(NumLoadFold, "Number of loads folded");
STATISTIC(NumSelects, "Number of selects optimized");
+STATISTIC(NumCopiesBitcasts, "Number of copies/bitcasts optimized");
namespace {
class PeepholeOptimizer : public MachineFunctionPass {
@@ -112,11 +122,11 @@ namespace {
}
private:
- bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB);
bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
SmallPtrSet<MachineInstr*, 8> &LocalMIs);
bool optimizeSelect(MachineInstr *MI);
+ bool optimizeCopyOrBitcast(MachineInstr *MI);
bool isMoveImmediate(MachineInstr *MI,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
@@ -298,78 +308,6 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
return Changed;
}
-/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that
-/// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast
-/// a value cross register classes), and the source is defined by another
-/// bitcast instruction B. And if the register class of source of B matches
-/// the register class of instruction A, then it is legal to replace all uses
-/// of the def of A with source of B. e.g.
-/// %vreg0<def> = VMOVSR %vreg1
-/// %vreg3<def> = VMOVRS %vreg0
-/// Replace all uses of vreg3 with vreg1.
-
-bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI,
- MachineBasicBlock *MBB) {
- unsigned NumDefs = MI->getDesc().getNumDefs();
- unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs;
- if (NumDefs != 1)
- return false;
-
- unsigned Def = 0;
- unsigned Src = 0;
- for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (MO.isDef())
- Def = Reg;
- else if (Src)
- // Multiple sources?
- return false;
- else
- Src = Reg;
- }
-
- assert(Def && Src && "Malformed bitcast instruction!");
-
- MachineInstr *DefMI = MRI->getVRegDef(Src);
- if (!DefMI || !DefMI->isBitcast())
- return false;
-
- unsigned SrcSrc = 0;
- NumDefs = DefMI->getDesc().getNumDefs();
- NumSrcs = DefMI->getDesc().getNumOperands() - NumDefs;
- if (NumDefs != 1)
- return false;
- for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
- const MachineOperand &MO = DefMI->getOperand(i);
- if (!MO.isReg() || MO.isDef())
- continue;
- unsigned Reg = MO.getReg();
- if (!Reg)
- continue;
- if (!MO.isDef()) {
- if (SrcSrc)
- // Multiple sources?
- return false;
- else
- SrcSrc = Reg;
- }
- }
-
- if (MRI->getRegClass(SrcSrc) != MRI->getRegClass(Def))
- return false;
-
- MRI->replaceRegWith(Def, SrcSrc);
- MRI->clearKillFlags(SrcSrc);
- MI->eraseFromParent();
- ++NumBitcasts;
- return true;
-}
-
/// optimizeCmpInstr - If the instruction is a compare and the previous
/// instruction it's comparing against already sets (or could be modified to
/// set) the same flag as the compare, then we can remove the comparison and use
@@ -411,6 +349,150 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
return true;
}
+/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+/// share the same register file.
+static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) {
+ // Same register class.
+ if (DefRC == SrcRC)
+ return true;
+
+ // Both operands are sub registers. Check if they share a register class.
+ unsigned SrcIdx, DefIdx;
+ if (SrcSubReg && DefSubReg)
+ return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+ SrcIdx, DefIdx) != NULL;
+ // At most one of the register is a sub register, make it Src to avoid
+ // duplicating the test.
+ if (!SrcSubReg) {
+ std::swap(DefSubReg, SrcSubReg);
+ std::swap(DefRC, SrcRC);
+ }
+
+ // One of the registers is a sub register; check if we can get a superclass.
+ if (SrcSubReg)
+ return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != NULL;
+ // Plain copy.
+ return TRI.getCommonSubClass(DefRC, SrcRC) != NULL;
+}
+
+/// \brief Get the index of the definition and source for the \p Copy
+/// instruction.
+/// \pre Copy.isCopy() or Copy.isBitcast().
+/// \return True if the Copy instruction has only one register source
+/// and one register definition. Otherwise, \p DefIdx and \p SrcIdx
+/// are invalid.
+static bool getCopyOrBitcastDefUseIdx(const MachineInstr &Copy,
+ unsigned &DefIdx, unsigned &SrcIdx) {
+ assert((Copy.isCopy() || Copy.isBitcast()) && "Wrong operation type.");
+ if (Copy.isCopy()) {
+ // Copy instructions are supposed to be: Def = Src.
+ if (Copy.getDesc().getNumOperands() != 2)
+ return false;
+ DefIdx = 0;
+ SrcIdx = 1;
+ assert(Copy.getOperand(DefIdx).isDef() && "Use comes before def!");
+ return true;
+ }
+ // Bitcast case.
+ // Bitcasts with more than one def are not supported.
+ if (Copy.getDesc().getNumDefs() != 1)
+ return false;
+ // Initialize SrcIdx to an undefined operand.
+ SrcIdx = Copy.getDesc().getNumOperands();
+ for (unsigned OpIdx = 0, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; ++OpIdx) {
+ const MachineOperand &MO = Copy.getOperand(OpIdx);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef())
+ DefIdx = OpIdx;
+ else if (SrcIdx != EndOpIdx)
+ // Multiple sources?
+ return false;
+ SrcIdx = OpIdx;
+ }
+ return true;
+}
+
+/// \brief Optimize a copy or bitcast instruction to avoid cross
+/// register bank copies. The optimization looks through a chain of
+/// copies and tries to find a source that has a compatible register
+/// class.
+/// Two register classes are considered to be compatible if they share
+/// the same register bank.
+/// New copies issued by this optimization are register allocator
+/// friendly. This optimization does not remove any copy as it may
+/// overconstrain the register allocator, but replaces some when
+/// possible.
+/// \pre \p MI is a Copy or Bitcast (MI->isCopy() or MI->isBitcast() is true).
+/// \return True when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
+ unsigned DefIdx, SrcIdx;
+ if (!MI || !getCopyOrBitcastDefUseIdx(*MI, DefIdx, SrcIdx))
+ return false;
+
+ const MachineOperand &MODef = MI->getOperand(DefIdx);
+ assert(MODef.isReg() && "Copies must be between registers.");
+ unsigned Def = MODef.getReg();
+
+ if (TargetRegisterInfo::isPhysicalRegister(Def))
+ return false;
+
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Def);
+ unsigned DefSubReg = MODef.getSubReg();
+
+ unsigned Src;
+ unsigned SrcSubReg;
+ bool ShouldRewrite = false;
+ MachineInstr *Copy = MI;
+ const TargetRegisterInfo &TRI = *TM->getRegisterInfo();
+
+ // Follow the chain of copies until we reach the top or find a
+ // more suitable source.
+ do {
+ unsigned CopyDefIdx, CopySrcIdx;
+ if (!getCopyOrBitcastDefUseIdx(*Copy, CopyDefIdx, CopySrcIdx))
+ break;
+ const MachineOperand &MO = Copy->getOperand(CopySrcIdx);
+ assert(MO.isReg() && "Copies must be between registers.");
+ Src = MO.getReg();
+
+ if (TargetRegisterInfo::isPhysicalRegister(Src))
+ break;
+
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(Src);
+ SrcSubReg = MO.getSubReg();
+
+ // If this source does not incur a cross register bank copy, use it.
+ ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC,
+ SrcSubReg);
+ // Follow the chain of copies: get the definition of Src.
+ Copy = MRI->getVRegDef(Src);
+ } while (!ShouldRewrite && Copy && (Copy->isCopy() || Copy->isBitcast()));
+
+ // If we did not find a more suitable source, there is nothing to optimize.
+ if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg())
+ return false;
+
+ // Rewrite the copy to avoid a cross register bank penalty.
+ unsigned NewVR = TargetRegisterInfo::isPhysicalRegister(Def) ? Def :
+ MRI->createVirtualRegister(DefRC);
+ MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(Src, 0, SrcSubReg);
+ NewCopy->getOperand(0).setSubReg(DefSubReg);
+
+ MRI->replaceRegWith(Def, NewVR);
+ MRI->clearKillFlags(NewVR);
+ MI->eraseFromParent();
+ ++NumCopiesBitcasts;
+ return true;
+}
+
/// isLoadFoldable - Check whether MI is a candidate for folding into a later
/// instruction. We only fold loads to virtual registers and the virtual
/// register defined has a single use.
@@ -523,7 +605,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (MI->mayStore() || MI->isCall())
FoldAsLoadDefReg = 0;
- if ((MI->isBitcast() && optimizeBitcastInstr(MI, MBB)) ||
+ if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) ||
(MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
(MI->isSelect() && optimizeSelect(MI))) {
// MI is deleted.
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 27f5676..1afc1ec 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -127,6 +127,12 @@ namespace {
/// The schedule. Null SUnit*'s represent noop instructions.
std::vector<SUnit*> Sequence;
+ /// The index in BB of RegionEnd.
+ ///
+ /// This is the instruction number from the top of the current block, not
+ /// the SlotIndex. It is only used by the AntiDepBreaker.
+ unsigned EndIndex;
+
public:
SchedulePostRATDList(
MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
@@ -141,11 +147,14 @@ namespace {
///
void startBlock(MachineBasicBlock *BB);
+ // Set the index of RegionEnd within the current BB.
+ void setEndIndex(unsigned EndIdx) { EndIndex = EndIdx; }
+
/// Initialize the scheduler state for the next scheduling region.
virtual void enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
- unsigned endcount);
+ unsigned regioninstrs);
/// Notify that the scheduler has finished scheduling the current region.
virtual void exitRegion();
@@ -197,7 +206,7 @@ SchedulePostRATDList::SchedulePostRATDList(
TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs)
: ScheduleDAGInstrs(MF, MLI, MDT, /*IsPostRA=*/true), AA(AA),
- LiveRegs(TRI->getNumRegs())
+ LiveRegs(TRI->getNumRegs()), EndIndex(0)
{
const TargetMachine &TM = MF.getTarget();
const InstrItineraryData *InstrItins = TM.getInstrItineraryData();
@@ -223,8 +232,8 @@ SchedulePostRATDList::~SchedulePostRATDList() {
void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
- unsigned endcount) {
- ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+ unsigned regioninstrs) {
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
Sequence.clear();
}
@@ -312,20 +321,21 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
unsigned Count = MBB->size(), CurrentCount = Count;
for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) {
MachineInstr *MI = llvm::prior(I);
+ --Count;
// Calls are not scheduling boundaries before register allocation, but
// post-ra we don't gain anything by scheduling across calls since we
// don't need to worry about register pressure.
if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) {
- Scheduler.enterRegion(MBB, I, Current, CurrentCount);
+ Scheduler.enterRegion(MBB, I, Current, CurrentCount - Count);
+ Scheduler.setEndIndex(CurrentCount);
Scheduler.schedule();
Scheduler.exitRegion();
Scheduler.EmitSchedule();
Current = MI;
- CurrentCount = Count - 1;
+ CurrentCount = Count;
Scheduler.Observe(MI, CurrentCount);
}
I = MI;
- --Count;
if (MI->isBundle())
Count -= MI->getBundleSize();
}
@@ -333,6 +343,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
assert((MBB->begin() == Current || CurrentCount != 0) &&
"Instruction count mismatch!");
Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount);
+ Scheduler.setEndIndex(CurrentCount);
Scheduler.schedule();
Scheduler.exitRegion();
Scheduler.EmitSchedule();
@@ -504,11 +515,11 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
// Examine all used registers and set/clear kill flag. When a
// register is used multiple times we only set the kill flag on
- // the first use.
+ // the first use. Don't set kill flags on undef operands.
killedRegs.reset();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isUse()) continue;
+ if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
unsigned Reg = MO.getReg();
if ((Reg == 0) || MRI.isReserved(Reg)) continue;
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index e4e18c3..0c5173a 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -78,7 +78,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
unsigned Reg = MI->getOperand(0).getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- // For virtual regiusters, mark all uses as <undef>, and convert users to
+ // For virtual registers, mark all uses as <undef>, and convert users to
// implicit-def when possible.
for (MachineRegisterInfo::use_nodbg_iterator UI =
MRI->use_nodbg_begin(Reg),
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 1965188..b0e494f 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -14,9 +14,6 @@
// This pass must be run after register allocation. After this pass is
// executed, it is illegal to construct MO_FrameIndex operands.
//
-// This pass provides an optional shrink wrapping variant of prolog/epilog
-// insertion, enabled via --shrink-wrap. See ShrinkWrapping.cpp.
-//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "pei"
@@ -66,6 +63,38 @@ STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
STATISTIC(NumBytesStackSpace,
"Number of bytes used for stack in all functions");
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+ return (MBB && !MBB->empty() && MBB->back().isReturn());
+}
+
+/// Compute the entry block and the set of return blocks.
+void PEI::calculateSets(MachineFunction &Fn) {
+ // Callee-saved register info used for spill/restore placement.
+ const std::vector<CalleeSavedInfo> &CSI =
+ Fn.getFrameInfo()->getCalleeSavedInfo();
+
+ // If no CSRs used, we are done.
+ if (CSI.empty())
+ return;
+
+ // Save refs to entry and return blocks.
+ EntryBlock = Fn.begin();
+ for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
+ MBB != E; ++MBB)
+ if (isReturnBlock(MBB))
+ ReturnBlocks.push_back(MBB);
+
+ return;
+}
+
/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
/// frame indexes with appropriate references.
///
@@ -93,16 +122,11 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
calculateCalleeSavedRegisters(Fn);
// Determine placement of CSR spill/restore code:
- // - With shrink wrapping, place spills and restores to tightly
- // enclose regions in the Machine CFG of the function where
- // they are used.
- // - Without shink wrapping (default), place all spills in the
- // entry block, all restores in return blocks.
- placeCSRSpillsAndRestores(Fn);
+ // place all spills in the entry block, all restores in return blocks.
+ calculateSets(Fn);
// Add the code to save and restore the callee saved registers
- if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::Naked))
+ if (!F->hasFnAttribute(Attribute::Naked))
insertCSRSpillsAndRestores(Fn);
// Allow the target machine to make final modifications to the function
@@ -117,8 +141,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// called functions. Because of this, calculateCalleeSavedRegisters()
// must be called before this function in order to set the AdjustsStack
// and MaxCallFrameSize variables.
- if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::Naked))
+ if (!F->hasFnAttribute(Attribute::Naked))
insertPrologEpilogCode(Fn);
// Replace all MO_FrameIndex operands with physical register references
@@ -143,7 +166,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
<< ") in " << Fn.getName() << ".\n";
delete RS;
- clearAllSets();
+ ReturnBlocks.clear();
return true;
}
@@ -221,8 +244,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
return;
// In Naked functions we aren't going to save any registers.
- if (F.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::Naked))
+ if (F.getFunction()->hasFnAttribute(Attribute::Naked))
return;
std::vector<CalleeSavedInfo> CSI;
@@ -286,7 +308,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
}
/// insertCSRSpillsAndRestores - Insert spill and restore code for
-/// callee saved registers used in the function, handling shrink wrapping.
+/// callee saved registers used in the function.
///
void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
// Get callee saved register information.
@@ -304,133 +326,33 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
MachineBasicBlock::iterator I;
- if (!ShrinkWrapThisFunction) {
- // Spill using target interface.
- I = EntryBlock->begin();
- if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- // Add the callee-saved register as live-in.
- // It's killed at the spill.
- EntryBlock->addLiveIn(CSI[i].getReg());
-
- // Insert the spill to the stack frame.
- unsigned Reg = CSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(*EntryBlock, I, Reg, true,
- CSI[i].getFrameIdx(), RC, TRI);
- }
- }
-
- // Restore using target interface.
- for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
- MachineBasicBlock* MBB = ReturnBlocks[ri];
- I = MBB->end(); --I;
-
- // Skip over all terminator instructions, which are part of the return
- // sequence.
- MachineBasicBlock::iterator I2 = I;
- while (I2 != MBB->begin() && (--I2)->isTerminator())
- I = I2;
-
- bool AtStart = I == MBB->begin();
- MachineBasicBlock::iterator BeforeI = I;
- if (!AtStart)
- --BeforeI;
-
- // Restore all registers immediately before the return and any
- // terminators that precede it.
- if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(*MBB, I, Reg,
- CSI[i].getFrameIdx(),
- RC, TRI);
- assert(I != MBB->begin() &&
- "loadRegFromStackSlot didn't insert any code!");
- // Insert in reverse order. loadRegFromStackSlot can insert
- // multiple instructions.
- if (AtStart)
- I = MBB->begin();
- else {
- I = BeforeI;
- ++I;
- }
- }
- }
- }
- return;
- }
-
- // Insert spills.
- std::vector<CalleeSavedInfo> blockCSI;
- for (CSRegBlockMap::iterator BI = CSRSave.begin(),
- BE = CSRSave.end(); BI != BE; ++BI) {
- MachineBasicBlock* MBB = BI->first;
- CSRegSet save = BI->second;
-
- if (save.empty())
- continue;
-
- blockCSI.clear();
- for (CSRegSet::iterator RI = save.begin(),
- RE = save.end(); RI != RE; ++RI) {
- blockCSI.push_back(CSI[*RI]);
- }
- assert(blockCSI.size() > 0 &&
- "Could not collect callee saved register info");
-
- I = MBB->begin();
-
- // When shrink wrapping, use stack slot stores/loads.
- for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+ // Spill using target interface.
+ I = EntryBlock->begin();
+ if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
// Add the callee-saved register as live-in.
// It's killed at the spill.
- MBB->addLiveIn(blockCSI[i].getReg());
+ EntryBlock->addLiveIn(CSI[i].getReg());
// Insert the spill to the stack frame.
- unsigned Reg = blockCSI[i].getReg();
+ unsigned Reg = CSI[i].getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(*MBB, I, Reg,
- true,
- blockCSI[i].getFrameIdx(),
+ TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, CSI[i].getFrameIdx(),
RC, TRI);
}
}
- for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
- BE = CSRRestore.end(); BI != BE; ++BI) {
- MachineBasicBlock* MBB = BI->first;
- CSRegSet restore = BI->second;
-
- if (restore.empty())
- continue;
+ // Restore using target interface.
+ for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
+ MachineBasicBlock *MBB = ReturnBlocks[ri];
+ I = MBB->end();
+ --I;
- blockCSI.clear();
- for (CSRegSet::iterator RI = restore.begin(),
- RE = restore.end(); RI != RE; ++RI) {
- blockCSI.push_back(CSI[*RI]);
- }
- assert(blockCSI.size() > 0 &&
- "Could not find callee saved register info");
-
- // If MBB is empty and needs restores, insert at the _beginning_.
- if (MBB->empty()) {
- I = MBB->begin();
- } else {
- I = MBB->end();
- --I;
-
- // Skip over all terminator instructions, which are part of the
- // return sequence.
- if (! I->isTerminator()) {
- ++I;
- } else {
- MachineBasicBlock::iterator I2 = I;
- while (I2 != MBB->begin() && (--I2)->isTerminator())
- I = I2;
- }
- }
+ // Skip over all terminator instructions, which are part of the return
+ // sequence.
+ MachineBasicBlock::iterator I2 = I;
+ while (I2 != MBB->begin() && (--I2)->isTerminator())
+ I = I2;
bool AtStart = I == MBB->begin();
MachineBasicBlock::iterator BeforeI = I;
@@ -439,21 +361,21 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
// Restore all registers immediately before the return and any
// terminators that precede it.
- for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
- unsigned Reg = blockCSI[i].getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(*MBB, I, Reg,
- blockCSI[i].getFrameIdx(),
- RC, TRI);
- assert(I != MBB->begin() &&
- "loadRegFromStackSlot didn't insert any code!");
- // Insert in reverse order. loadRegFromStackSlot can insert
- // multiple instructions.
- if (AtStart)
- I = MBB->begin();
- else {
- I = BeforeI;
- ++I;
+ if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+ assert(I != MBB->begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+ if (AtStart)
+ I = MBB->begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
}
}
}
diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h
index 50f4daf..77cfa2b 100644
--- a/lib/CodeGen/PrologEpilogInserter.h
+++ b/lib/CodeGen/PrologEpilogInserter.h
@@ -14,9 +14,6 @@
// This pass must be run after register allocation. After this pass is
// executed, it is illegal to construct MO_FrameIndex operands.
//
-// This pass also implements a shrink wrapping variant of prolog/epilog
-// insertion.
-//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_PEI_H
@@ -54,74 +51,16 @@ namespace llvm {
// stack frame indexes.
unsigned MinCSFrameIndex, MaxCSFrameIndex;
- // Analysis info for spill/restore placement.
- // "CSR": "callee saved register".
-
- // CSRegSet contains indices into the Callee Saved Register Info
- // vector built by calculateCalleeSavedRegisters() and accessed
- // via MF.getFrameInfo()->getCalleeSavedInfo().
- typedef SparseBitVector<> CSRegSet;
-
- // CSRegBlockMap maps MachineBasicBlocks to sets of callee
- // saved register indices.
- typedef DenseMap<MachineBasicBlock*, CSRegSet> CSRegBlockMap;
-
- // Set and maps for computing CSR spill/restore placement:
- // used in function (UsedCSRegs)
- // used in a basic block (CSRUsed)
- // anticipatable in a basic block (Antic{In,Out})
- // available in a basic block (Avail{In,Out})
- // to be spilled at the entry to a basic block (CSRSave)
- // to be restored at the end of a basic block (CSRRestore)
- CSRegSet UsedCSRegs;
- CSRegBlockMap CSRUsed;
- CSRegBlockMap AnticIn, AnticOut;
- CSRegBlockMap AvailIn, AvailOut;
- CSRegBlockMap CSRSave;
- CSRegBlockMap CSRRestore;
-
// Entry and return blocks of the current function.
MachineBasicBlock* EntryBlock;
SmallVector<MachineBasicBlock*, 4> ReturnBlocks;
- // Map of MBBs to top level MachineLoops.
- DenseMap<MachineBasicBlock*, MachineLoop*> TLLoops;
-
- // Flag to control shrink wrapping per-function:
- // may choose to skip shrink wrapping for certain
- // functions.
- bool ShrinkWrapThisFunction;
-
// Flag to control whether to use the register scavenger to resolve
// frame index materialization registers. Set according to
// TRI->requiresFrameIndexScavenging() for the current function.
bool FrameIndexVirtualScavenging;
-#ifndef NDEBUG
- // Machine function handle.
- MachineFunction* MF;
-
- // Flag indicating that the current function
- // has at least one "short" path in the machine
- // CFG from the entry block to an exit block.
- bool HasFastExitPath;
-#endif
-
- bool calculateSets(MachineFunction &Fn);
- bool calcAnticInOut(MachineBasicBlock* MBB);
- bool calcAvailInOut(MachineBasicBlock* MBB);
- void calculateAnticAvail(MachineFunction &Fn);
- bool addUsesForMEMERegion(MachineBasicBlock* MBB,
- SmallVectorImpl<MachineBasicBlock *> &blks);
- bool addUsesForTopLevelLoops(SmallVectorImpl<MachineBasicBlock *> &blks);
- bool calcSpillPlacements(MachineBasicBlock* MBB,
- SmallVectorImpl<MachineBasicBlock *> &blks,
- CSRegBlockMap &prevSpills);
- bool calcRestorePlacements(MachineBasicBlock* MBB,
- SmallVectorImpl<MachineBasicBlock *> &blks,
- CSRegBlockMap &prevRestores);
- void placeSpillsAndRestores(MachineFunction &Fn);
- void placeCSRSpillsAndRestores(MachineFunction &Fn);
+ void calculateSets(MachineFunction &Fn);
void calculateCallsInformation(MachineFunction &Fn);
void calculateCalleeSavedRegisters(MachineFunction &Fn);
void insertCSRSpillsAndRestores(MachineFunction &Fn);
@@ -132,44 +71,8 @@ namespace llvm {
void scavengeFrameVirtualRegs(MachineFunction &Fn);
void insertPrologEpilogCode(MachineFunction &Fn);
- // Initialize DFA sets, called before iterations.
- void clearAnticAvailSets();
- // Clear all sets constructed by shrink wrapping.
- void clearAllSets();
-
- // Initialize all shrink wrapping data.
- void initShrinkWrappingInfo();
-
- // Convienences for dealing with machine loops.
- MachineBasicBlock* getTopLevelLoopPreheader(MachineLoop* LP);
- MachineLoop* getTopLevelLoopParent(MachineLoop *LP);
-
- // Propgate CSRs used in MBB to all MBBs of loop LP.
- void propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP);
-
// Convenience for recognizing return blocks.
bool isReturnBlock(MachineBasicBlock* MBB);
-
-#ifndef NDEBUG
- // Debugging methods.
-
- // Mark this function as having fast exit paths.
- void findFastExitPath();
-
- // Verify placement of spills/restores.
- void verifySpillRestorePlacement();
-
- std::string getBasicBlockName(const MachineBasicBlock* MBB);
- std::string stringifyCSRegSet(const CSRegSet& s);
- void dumpSet(const CSRegSet& s);
- void dumpUsed(MachineBasicBlock* MBB);
- void dumpAllUsed();
- void dumpSets(MachineBasicBlock* MBB);
- void dumpSets1(MachineBasicBlock* MBB);
- void dumpAllSets();
- void dumpSRSets();
-#endif
-
};
} // End llvm namespace
#endif
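With the shrink-wrapping machinery gone, the pass keeps only the entry block, the return blocks, and a single calculateSets() step. A minimal, hedged sketch of the kind of predicate isReturnBlock() boils down to (illustrative only, not part of this patch; it assumes the usual MachineBasicBlock/MachineInstr API):

    // Hypothetical stand-in for PEI::isReturnBlock: a block "returns" if its
    // last instruction is a return.
    static bool isReturnBlockSketch(const MachineBasicBlock *MBB) {
      return MBB && !MBB->empty() && MBB->back().isReturn();
    }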
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index df3e12a..293e306 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -50,6 +50,9 @@ bool RegAllocBase::VerifyEnabled = false;
// RegAllocBase Implementation
//===----------------------------------------------------------------------===//
+// Pin the vtable to this file.
+void RegAllocBase::anchor() {}
+
void RegAllocBase::init(VirtRegMap &vrm,
LiveIntervals &lis,
LiveRegMatrix &mat) {
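The anchor() added above is LLVM's usual way of pinning a polymorphic class's vtable to a single translation unit. A hedged sketch of the idiom with illustrative names; the point is only that one virtual member is defined out of line:

    // Header: declare exactly one out-of-line virtual method.
    class SomeBase {
      virtual void anchor();     // never called; exists only to anchor the vtable
    public:
      virtual ~SomeBase() {}
    };

    // One .cpp defines it, so the vtable and type info are emitted there once
    // instead of weakly in every translation unit that includes the header.
    void SomeBase::anchor() {}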
@@ -99,14 +102,13 @@ void RegAllocBase::allocatePhysRegs() {
// result from splitting.
DEBUG(dbgs() << "\nselectOrSplit "
<< MRI->getRegClass(VirtReg->reg)->getName()
- << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n');
- typedef SmallVector<LiveInterval*, 4> VirtRegVec;
+ << ':' << *VirtReg << '\n');
+ typedef SmallVector<unsigned, 4> VirtRegVec;
VirtRegVec SplitVRegs;
unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
if (AvailablePhysReg == ~0u) {
// selectOrSplit failed to find a register!
- const char *Msg = "ran out of registers during register allocation";
// Probably caused by an inline asm.
MachineInstr *MI;
for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg);
@@ -114,9 +116,9 @@ void RegAllocBase::allocatePhysRegs() {
if (MI->isInlineAsm())
break;
if (MI)
- MI->emitError(Msg);
+ MI->emitError("inline assembly requires more registers than available");
else
- report_fatal_error(Msg);
+ report_fatal_error("ran out of registers during register allocation");
// Keep going after reporting the error.
VRM->assignVirt2Phys(VirtReg->reg,
RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg)).front());
@@ -128,7 +130,7 @@ void RegAllocBase::allocatePhysRegs() {
for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
I != E; ++I) {
- LiveInterval *SplitVirtReg = *I;
+ LiveInterval *SplitVirtReg = &LIS->getInterval(*I);
assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index ccaabba..c17a8d9 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -38,7 +38,7 @@
#define LLVM_CODEGEN_REGALLOCBASE
#include "llvm/ADT/OwningPtr.h"
-#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
namespace llvm {
@@ -57,6 +57,7 @@ class Spiller;
/// live range splitting. They must also override enqueue/dequeue to provide an
/// assignment order.
class RegAllocBase {
+ virtual void anchor();
protected:
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
@@ -90,7 +91,7 @@ protected:
// or new set of split live virtual registers. It is up to the splitter to
// converge quickly toward fully spilled live ranges.
virtual unsigned selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<LiveInterval*> &splitLVRs) = 0;
+ SmallVectorImpl<unsigned> &splitLVRs) = 0;
// Use this group name for NamedRegionTimer.
static const char TimerGroupName[];
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index d6a7d6f..6768e45 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -102,7 +102,7 @@ public:
}
virtual unsigned selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<LiveInterval*> &SplitVRegs);
+ SmallVectorImpl<unsigned> &SplitVRegs);
/// Perform register allocation.
virtual bool runOnMachineFunction(MachineFunction &mf);
@@ -111,7 +111,7 @@ public:
// that interfere with the most recently queried lvr. Return true if spilling
// was successful, and append any new spilled/split intervals to splitLVRs.
bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<LiveInterval*> &SplitVRegs);
+ SmallVectorImpl<unsigned> &SplitVRegs);
static char ID;
};
@@ -126,7 +126,6 @@ RABasic::RABasic(): MachineFunctionPass(ID) {
initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
- initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
initializeLiveStacksPass(*PassRegistry::getPassRegistry());
initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -143,7 +142,6 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
AU.addPreserved<LiveDebugVariables>();
- AU.addRequired<CalculateSpillWeights>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
AU.addRequired<MachineBlockFrequencyInfo>();
@@ -168,7 +166,7 @@ void RABasic::releaseMemory() {
// that interfere with VirtReg. The newly spilled or split live intervals are
// returned by appending them to SplitVRegs.
bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+ SmallVectorImpl<unsigned> &SplitVRegs) {
// Record each interference and determine if all are spillable before mutating
// either the union or live intervals.
SmallVector<LiveInterval*, 8> Intfs;
@@ -222,7 +220,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
// minimal, there is no value in caching them outside the scope of
// selectOrSplit().
unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+ SmallVectorImpl<unsigned> &SplitVRegs) {
// Populate a list of physical register spill candidates.
SmallVector<unsigned, 8> PhysRegSpillCands;
@@ -279,6 +277,11 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
RegAllocBase::init(getAnalysis<VirtRegMap>(),
getAnalysis<LiveIntervals>(),
getAnalysis<LiveRegMatrix>());
+
+ calculateSpillWeightsAndHints(*LIS, *MF,
+ getAnalysis<MachineLoopInfo>(),
+ getAnalysis<MachineBlockFrequencyInfo>());
+
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
allocatePhysRegs();
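Across the basic, greedy, and PBQP allocators this merge drops the CalculateSpillWeights pass requirement and instead computes spill weights directly once the analyses are available. The call pattern, taken from the hunk above (the declaration is assumed to live in llvm/CodeGen/CalcSpillWeights.h):

    calculateSpillWeightsAndHints(*LIS, *MF,
                                  getAnalysis<MachineLoopInfo>(),
                                  getAnalysis<MachineBlockFrequencyInfo>());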
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 6617e50..e92dbd2 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -144,7 +144,7 @@ namespace {
// not be erased.
bool isBulkSpilling;
- enum {
+ enum LLVM_ENUM_INT_TYPE(unsigned) {
spillClean = 1,
spillDirty = 100,
spillImpossible = ~0u
@@ -298,7 +298,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
MachineInstr *DBG = LRIDbgValues[li];
const MDNode *MDPtr = DBG->getOperand(2).getMetadata();
- bool IsIndirect = DBG->getOperand(1).isImm(); // Register-indirect value?
+ bool IsIndirect = DBG->isIndirectDebugValue();
uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0;
DebugLoc DL;
if (MI == MBB->end()) {
@@ -569,7 +569,10 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
}
// Nothing we can do. Report an error and keep going with a bad allocation.
- MI->emitError("ran out of registers during register allocation");
+ if (MI->isInlineAsm())
+ MI->emitError("inline assembly requires more registers than available");
+ else
+ MI->emitError("ran out of registers during register allocation");
definePhysReg(MI, *AO.begin(), regFree);
return assignVirtToPhysReg(VirtReg, *AO.begin());
}
@@ -856,7 +859,7 @@ void RAFast::AllocateBasicBlock() {
}
else {
// Modify DBG_VALUE now that the value is in a spill slot.
- bool IsIndirect = MI->getOperand(1).isImm();
+ bool IsIndirect = MI->isIndirectDebugValue();
uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
const MDNode *MDPtr =
MI->getOperand(MI->getNumOperands()-1).getMetadata();
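LLVM_ENUM_INT_TYPE(unsigned) pins the enum to an unsigned underlying type on compilers that support fixed enum types, which keeps initializers such as spillImpossible = ~0u well defined. A hedged sketch of what the macro in llvm/Support/Compiler.h roughly expands to; the exact feature guards may differ:

    #if __has_feature(cxx_strong_enums) || (defined(_MSC_VER) && _MSC_VER >= 1600)
    # define LLVM_ENUM_INT_TYPE(intty) : intty   // C++11 fixed underlying type
    #else
    # define LLVM_ENUM_INT_TYPE(intty)           // older compilers: plain enum
    #endif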
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index f9e363b..c08d955 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -120,7 +120,9 @@ class RAGreedy : public MachineFunctionPass,
RS_Done
};
+#ifndef NDEBUG
static const char *const StageName[];
+#endif
// RegInfo - Keep additional information about each live range.
struct RegInfo {
@@ -147,7 +149,7 @@ class RAGreedy : public MachineFunctionPass,
void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
ExtraRegInfo.resize(MRI->getNumVirtRegs());
for (;Begin != End; ++Begin) {
- unsigned Reg = (*Begin)->reg;
+ unsigned Reg = *Begin;
if (ExtraRegInfo[Reg].Stage == RS_New)
ExtraRegInfo[Reg].Stage = NewStage;
}
@@ -220,7 +222,7 @@ class RAGreedy : public MachineFunctionPass,
/// class.
SmallVector<GlobalSplitCandidate, 32> GlobalCand;
- enum { NoCand = ~0u };
+ enum LLVM_ENUM_INT_TYPE(unsigned) { NoCand = ~0u };
/// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
/// NoCand which indicates the stack interval.
@@ -241,7 +243,7 @@ public:
virtual void enqueue(LiveInterval *LI);
virtual LiveInterval *dequeue();
virtual unsigned selectOrSplit(LiveInterval&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
/// Perform register allocation.
virtual bool runOnMachineFunction(MachineFunction &mf);
@@ -265,22 +267,22 @@ private:
bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
void evictInterference(LiveInterval&, unsigned,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
unsigned tryAssign(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
unsigned tryEvict(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&, unsigned = ~0u);
+ SmallVectorImpl<unsigned>&, unsigned = ~0u);
unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
unsigned trySplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<LiveInterval*>&);
+ SmallVectorImpl<unsigned>&);
};
} // end anonymous namespace
@@ -313,7 +315,6 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
- initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
initializeLiveStacksPass(*PassRegistry::getPassRegistry());
initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -337,7 +338,6 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveDebugVariables>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
- AU.addRequired<CalculateSpillWeights>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -455,7 +455,7 @@ LiveInterval *RAGreedy::dequeue() {
/// tryAssign - Try to assign VirtReg to an available register.
unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
Order.rewind();
unsigned PhysReg;
while ((PhysReg = Order.next()))
@@ -638,7 +638,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
/// from being assigned to Physreg. This assumes that canEvictInterference
/// returned true.
void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
// Make sure that VirtReg has a cascade number, and assign that cascade
// number to every evicted register. These live ranges will then only be
// evicted by a newer cascade, preventing infinite loops.
@@ -670,7 +670,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
"Cannot decrease cascade number, illegal eviction");
ExtraRegInfo[Intf->reg].Cascade = Cascade;
++NumEvicted;
- NewVRegs.push_back(Intf);
+ NewVRegs.push_back(Intf->reg);
}
}
@@ -680,7 +680,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
/// @return Physreg to assign VirtReg, or 0.
unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*> &NewVRegs,
+ SmallVectorImpl<unsigned> &NewVRegs,
unsigned CostPerUseLimit) {
NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
@@ -1125,7 +1125,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
- DebugVars->splitRegister(Reg, LREdit.regs());
+ DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
ExtraRegInfo.resize(MRI->getNumVirtRegs());
unsigned OrigBlocks = SA->getNumLiveBlocks();
@@ -1136,7 +1136,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
// - Block-local splits are candidates for local splitting.
// - DCE leftovers should go back on the queue.
for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
- LiveInterval &Reg = *LREdit.get(i);
+ LiveInterval &Reg = LIS->getInterval(LREdit.get(i));
// Ignore old intervals from DCE.
if (getStage(Reg) != RS_New)
@@ -1170,7 +1170,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
}
unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
unsigned NumCands = 0;
unsigned BestCand = NoCand;
BlockFrequency BestCost;
@@ -1305,7 +1305,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
/// creates a lot of local live ranges, that will be split by tryLocalSplit if
/// they don't allocate.
unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
unsigned Reg = VirtReg.reg;
bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
@@ -1326,14 +1326,14 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
SE->finish(&IntvMap);
// Tell LiveDebugVariables about the new ranges.
- DebugVars->splitRegister(Reg, LREdit.regs());
+ DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
ExtraRegInfo.resize(MRI->getNumVirtRegs());
// Sort out the new intervals created by splitting. The remainder interval
// goes straight to spilling, the new local ranges get to stay RS_New.
for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
- LiveInterval &LI = *LREdit.get(i);
+ LiveInterval &LI = LIS->getInterval(LREdit.get(i));
if (getStage(LI) == RS_New && IntvMap[i] == 0)
setStage(LI, RS_Spill);
}
@@ -1357,7 +1357,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
/// This is similar to spilling to a larger register class.
unsigned
RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
// There is no point to this if there are no larger sub-classes.
if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg)))
return 0;
@@ -1393,7 +1393,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
- DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+ DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
ExtraRegInfo.resize(MRI->getNumVirtRegs());
// Assign all new registers to RS_Spill. This was the last chance.
@@ -1464,9 +1464,9 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
// Add fixed interference.
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
- const LiveInterval &LI = LIS->getRegUnit(*Units);
- LiveInterval::const_iterator I = LI.find(StartIdx);
- LiveInterval::const_iterator E = LI.end();
+ const LiveRange &LR = LIS->getRegUnit(*Units);
+ LiveRange::const_iterator I = LR.find(StartIdx);
+ LiveRange::const_iterator E = LR.end();
// Same loop as above. Mark any overlapped gaps as HUGE_VALF.
for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
@@ -1477,7 +1477,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
break;
for (; Gap != NumGaps; ++Gap) {
- GapWeight[Gap] = HUGE_VALF;
+ GapWeight[Gap] = llvm::huge_valf;
if (Uses[Gap+1].getBaseIndex() >= I->end)
break;
}
@@ -1491,7 +1491,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
/// basic block.
///
unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
@@ -1583,7 +1583,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Remove any gaps with regmask clobbers.
if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
- GapWeight[RegMaskGaps[i]] = HUGE_VALF;
+ GapWeight[RegMaskGaps[i]] = llvm::huge_valf;
// Try to find the best sequence of gaps to close.
// The new spill weight must be larger than any gap interference.
@@ -1618,7 +1618,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Legally, without causing looping?
bool Legal = !ProgressRequired || NewGaps < NumGaps;
- if (Legal && MaxGap < HUGE_VALF) {
+ if (Legal && MaxGap < llvm::huge_valf) {
// Estimate the new spill weight. Each instruction reads or writes the
// register. Conservatively assume there are no read-modify-write
// instructions.
@@ -1685,7 +1685,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
SE->useIntv(SegStart, SegStop);
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
- DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+ DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
// If the new range has the same number of instructions as before, mark it as
// RS_Split2 so the next split will be forced to make progress. Otherwise,
@@ -1698,8 +1698,8 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
assert(!ProgressRequired && "Didn't make progress when it was required.");
for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
if (IntvMap[i] == 1) {
- setStage(*LREdit.get(i), RS_Split2);
- DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
+ setStage(LIS->getInterval(LREdit.get(i)), RS_Split2);
+ DEBUG(dbgs() << PrintReg(LREdit.get(i)));
}
DEBUG(dbgs() << '\n');
}
@@ -1716,7 +1716,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
/// assignable.
/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<LiveInterval*>&NewVRegs) {
+ SmallVectorImpl<unsigned>&NewVRegs) {
// Ranges must be Split2 or less.
if (getStage(VirtReg) >= RS_Spill)
return 0;
@@ -1765,7 +1765,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
//===----------------------------------------------------------------------===//
unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ SmallVectorImpl<unsigned> &NewVRegs) {
// First try assigning a free register.
AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
@@ -1790,7 +1790,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
if (Stage < RS_Split) {
setStage(VirtReg, RS_Split);
DEBUG(dbgs() << "wait for second round\n");
- NewVRegs.push_back(&VirtReg);
+ NewVRegs.push_back(VirtReg.reg);
return 0;
}
@@ -1838,6 +1838,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
SpillPlacer = &getAnalysis<SpillPlacement>();
DebugVars = &getAnalysis<LiveDebugVariables>();
+ calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI);
+
DEBUG(LIS->dump());
SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
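The HUGE_VALF uses become llvm::huge_valf, a library-provided float "infinity" used as an unbeatable gap cost, so the code no longer depends on the host's math.h macro. A hedged sketch of the idea only; the real definition lives in Support/MathExtras and may differ in detail:

    // MathExtras.h
    extern const float huge_valf;   // "infinite" spill weight / gap interference

    // MathExtras.cpp (conceptually)
    const float huge_valf = std::numeric_limits<float>::infinity();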
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 81ecca1..88c8201 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -95,7 +95,6 @@ public:
: MachineFunctionPass(ID), builder(b.take()), customPassID(cPassID) {
initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
- initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
initializeLiveStacksPass(*PassRegistry::getPassRegistry());
initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
}
@@ -158,13 +157,13 @@ char RegAllocPBQP::ID = 0;
} // End anonymous namespace.
-unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::ConstNodeItr node) const {
+unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::NodeId node) const {
Node2VReg::const_iterator vregItr = node2VReg.find(node);
assert(vregItr != node2VReg.end() && "No vreg for node.");
return vregItr->second;
}
-PBQP::Graph::NodeItr PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
+PBQP::Graph::NodeId PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
VReg2Node::const_iterator nodeItr = vreg2Node.find(vreg);
assert(nodeItr != vreg2Node.end() && "No node for vreg.");
return nodeItr->second;
@@ -247,7 +246,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
}
// Construct the node.
- PBQP::Graph::NodeItr node =
+ PBQP::Graph::NodeId node =
g.addNode(PBQP::Vector(vrAllowed.size() + 1, 0));
// Record the mapping and allowed set in the problem.
@@ -273,7 +272,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
assert(!l2.empty() && "Empty interval in vreg set?");
if (l1.overlaps(l2)) {
- PBQP::Graph::EdgeItr edge =
+ PBQP::Graph::EdgeId edge =
g.addEdge(p->getNodeForVReg(vr1), p->getNodeForVReg(vr2),
PBQP::Matrix(vr1Allowed.size()+1, vr2Allowed.size()+1, 0));
@@ -364,16 +363,16 @@ PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
}
if (pregOpt < allowed.size()) {
++pregOpt; // +1 to account for spill option.
- PBQP::Graph::NodeItr node = p->getNodeForVReg(src);
+ PBQP::Graph::NodeId node = p->getNodeForVReg(src);
addPhysRegCoalesce(g.getNodeCosts(node), pregOpt, cBenefit);
}
} else {
const PBQPRAProblem::AllowedSet *allowed1 = &p->getAllowedSet(dst);
const PBQPRAProblem::AllowedSet *allowed2 = &p->getAllowedSet(src);
- PBQP::Graph::NodeItr node1 = p->getNodeForVReg(dst);
- PBQP::Graph::NodeItr node2 = p->getNodeForVReg(src);
- PBQP::Graph::EdgeItr edge = g.findEdge(node1, node2);
- if (edge == g.edgesEnd()) {
+ PBQP::Graph::NodeId node1 = p->getNodeForVReg(dst);
+ PBQP::Graph::NodeId node2 = p->getNodeForVReg(src);
+ PBQP::Graph::EdgeId edge = g.findEdge(node1, node2);
+ if (edge == g.invalidEdgeId()) {
edge = g.addEdge(node1, node2, PBQP::Matrix(allowed1->size() + 1,
allowed2->size() + 1,
0));
@@ -432,7 +431,6 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
//au.addRequiredID(SplitCriticalEdgesID);
if (customPassID)
au.addRequiredID(*customPassID);
- au.addRequired<CalculateSpillWeights>();
au.addRequired<LiveStacks>();
au.addPreserved<LiveStacks>();
au.addRequired<MachineBlockFrequencyInfo>();
@@ -477,11 +475,11 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
const PBQP::Graph &g = problem.getGraph();
// Iterate over the nodes mapping the PBQP solution to a register
// assignment.
- for (PBQP::Graph::ConstNodeItr node = g.nodesBegin(),
- nodeEnd = g.nodesEnd();
- node != nodeEnd; ++node) {
- unsigned vreg = problem.getVRegForNode(node);
- unsigned alloc = solution.getSelection(node);
+ for (PBQP::Graph::NodeItr nodeItr = g.nodesBegin(),
+ nodeEnd = g.nodesEnd();
+ nodeItr != nodeEnd; ++nodeItr) {
+ unsigned vreg = problem.getVRegForNode(*nodeItr);
+ unsigned alloc = solution.getSelection(*nodeItr);
if (problem.isPRegOption(vreg, alloc)) {
unsigned preg = problem.getPRegForOption(vreg, alloc);
@@ -491,7 +489,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
vrm->assignVirt2Phys(vreg, preg);
} else if (problem.isSpillOption(vreg, alloc)) {
vregsToAlloc.erase(vreg);
- SmallVector<LiveInterval*, 8> newSpills;
+ SmallVector<unsigned, 8> newSpills;
LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
spiller->spill(LRE);
@@ -502,9 +500,10 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
// allocate.
for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end();
itr != end; ++itr) {
- assert(!(*itr)->empty() && "Empty spill range.");
- DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " ");
- vregsToAlloc.insert((*itr)->reg);
+ LiveInterval &li = lis->getInterval(*itr);
+ assert(!li.empty() && "Empty spill range.");
+ DEBUG(dbgs() << PrintReg(li.reg, tri) << " ");
+ vregsToAlloc.insert(li.reg);
}
DEBUG(dbgs() << ")\n");
@@ -550,6 +549,9 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
lss = &getAnalysis<LiveStacks>();
mbfi = &getAnalysis<MachineBlockFrequencyInfo>();
+ calculateSpillWeightsAndHints(*lis, MF, getAnalysis<MachineLoopInfo>(),
+ *mbfi);
+
vrm = &getAnalysis<VirtRegMap>();
spiller.reset(createInlineSpiller(*this, MF, *vrm));
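The PBQP graph API now hands out opaque NodeId/EdgeId handles instead of iterator types, and findEdge() signals a missing edge with invalidEdgeId() rather than edgesEnd(). The node walk above dereferences the iterator to obtain the id; a condensed sketch of that pattern with the same names as the hunk:

    for (PBQP::Graph::NodeItr nodeItr = g.nodesBegin(), nodeEnd = g.nodesEnd();
         nodeItr != nodeEnd; ++nodeItr) {
      PBQP::Graph::NodeId node = *nodeItr;            // iterators now yield ids
      unsigned vreg = problem.getVRegForNode(node);
      unsigned alloc = solution.getSelection(node);   // preg option or spill option
    }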
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index f99f1a3..dd86c1f 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -398,7 +398,7 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
}
void RegisterCoalescer::eliminateDeadDefs() {
- SmallVector<LiveInterval*, 8> NewRegs;
+ SmallVector<unsigned, 8> NewRegs;
LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);
}
@@ -434,11 +434,11 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
- // BValNo is a value number in B that is defined by a copy from A. 'B3' in
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
// the example above.
- LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
- if (BLR == IntB.end()) return false;
- VNInfo *BValNo = BLR->valno;
+ LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
+ if (BS == IntB.end()) return false;
+ VNInfo *BValNo = BS->valno;
// Get the location that B is defined at. Two options: either this value has
// an unknown definition point or it is defined at CopyIdx. If unknown, we
@@ -447,10 +447,10 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
// AValNo is the value number in A that defines the copy, A3 in the example.
SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
- LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx);
- // The live range might not exist after fun with physreg coalescing.
- if (ALR == IntA.end()) return false;
- VNInfo *AValNo = ALR->valno;
+ LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
+ // The live segment might not exist after fun with physreg coalescing.
+ if (AS == IntA.end()) return false;
+ VNInfo *AValNo = AS->valno;
// If AValNo is defined as a copy from IntB, we can potentially process this.
// Get the instruction that defines this value number.
@@ -459,54 +459,54 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
return false;
- // Get the LiveRange in IntB that this value number starts with.
- LiveInterval::iterator ValLR =
- IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot());
- if (ValLR == IntB.end())
+ // Get the Segment in IntB that this value number starts with.
+ LiveInterval::iterator ValS =
+ IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
+ if (ValS == IntB.end())
return false;
- // Make sure that the end of the live range is inside the same block as
+ // Make sure that the end of the live segment is inside the same block as
// CopyMI.
- MachineInstr *ValLREndInst =
- LIS->getInstructionFromIndex(ValLR->end.getPrevSlot());
- if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent())
+ MachineInstr *ValSEndInst =
+ LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
+ if (!ValSEndInst || ValSEndInst->getParent() != CopyMI->getParent())
return false;
- // Okay, we now know that ValLR ends in the same block that the CopyMI
- // live-range starts. If there are no intervening live ranges between them in
- // IntB, we can merge them.
- if (ValLR+1 != BLR) return false;
+ // Okay, we now know that ValS ends in the same block that the CopyMI
+ // live-range starts. If there are no intervening live segments between them
+ // in IntB, we can merge them.
+ if (ValS+1 != BS) return false;
DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
- SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
+ SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
// We are about to delete CopyMI, so need to remove it as the 'instruction
// that defines this value #'. Update the valnum with the new defining
// instruction #.
BValNo->def = FillerStart;
// Okay, we can merge them. We need to insert a new liverange:
- // [ValLR.end, BLR.begin) of either value number, then we merge the
+ // [ValS.end, BS.begin) of either value number, then we merge the
// two value numbers.
- IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
+ IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));
// Okay, merge "B1" into the same value number as "B0".
- if (BValNo != ValLR->valno)
- IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+ if (BValNo != ValS->valno)
+ IntB.MergeValueNumberInto(BValNo, ValS->valno);
DEBUG(dbgs() << " result = " << IntB << '\n');
// If the source instruction was killing the source register before the
// merge, unset the isKill marker given the live range has been extended.
- int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true);
+ int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true);
if (UIdx != -1) {
- ValLREndInst->getOperand(UIdx).setIsKill(false);
+ ValSEndInst->getOperand(UIdx).setIsKill(false);
}
// Rewrite the copy. If the copy instruction was killing the destination
// register before the merge, find the last use and trim the live range. That
// will also add the isKill marker.
CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
- if (ALR->end == CopyIdx)
+ if (AS->end == CopyIdx)
LIS->shrinkToUses(&IntA);
++numExtends;
@@ -527,11 +527,11 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
AI != AE; ++AI) {
if (AI->valno != AValNo) continue;
- LiveInterval::Ranges::iterator BI =
- std::upper_bound(IntB.ranges.begin(), IntB.ranges.end(), AI->start);
- if (BI != IntB.ranges.begin())
+ LiveInterval::iterator BI =
+ std::upper_bound(IntB.begin(), IntB.end(), AI->start);
+ if (BI != IntB.begin())
--BI;
- for (; BI != IntB.ranges.end() && AI->end >= BI->start; ++BI) {
+ for (; BI != IntB.end() && AI->end >= BI->start; ++BI) {
if (BI->valno == BValNo)
continue;
if (BI->start <= AI->start && BI->end > AI->start)
@@ -577,14 +577,12 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
LiveInterval &IntB =
LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
- // BValNo is a value number in B that is defined by a copy from A. 'B3' in
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
// the example above.
VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
if (!BValNo || BValNo->def != CopyIdx)
return false;
- assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
-
// AValNo is the value number in A that defines the copy, A3 in the example.
VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
assert(AValNo && "COPY source not live");
@@ -614,7 +612,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
unsigned NewReg = NewDstMO.getReg();
- if (NewReg != IntB.reg || !LiveRangeQuery(IntB, AValNo->def).isKill())
+ if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill())
return false;
// Make sure there are no other definitions of IntB that would reach the
@@ -629,8 +627,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
MachineInstr *UseMI = &*UI;
SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
- LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
- if (ULR == IntA.end() || ULR->valno != AValNo)
+ LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+ if (US == IntA.end() || US->valno != AValNo)
continue;
// If this use is tied to a def, we can't rewrite the register.
if (UseMI->isRegTiedToDefOperand(UI.getOperandNo()))
@@ -681,8 +679,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
continue;
}
SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
- LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
- if (ULR == IntA.end() || ULR->valno != AValNo)
+ LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+ if (US == IntA.end() || US->valno != AValNo)
continue;
// Kill flags are no longer accurate. They are recomputed after RA.
UseMO.setIsKill(false);
@@ -712,14 +710,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
UseMI->eraseFromParent();
}
- // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
+ // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
// is updated.
VNInfo *ValNo = BValNo;
ValNo->def = AValNo->def;
for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
AI != AE; ++AI) {
if (AI->valno != AValNo) continue;
- IntB.addRange(LiveRange(AI->start, AI->end, ValNo));
+ IntB.addSegment(LiveInterval::Segment(AI->start, AI->end, ValNo));
}
DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
@@ -744,7 +742,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
LiveInterval &SrcInt = LIS->getInterval(SrcReg);
SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
- VNInfo *ValNo = LiveRangeQuery(SrcInt, CopyIdx).valueIn();
+ VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
assert(ValNo && "CopyMI input register not live");
if (ValNo->isPHIDef() || ValNo->isUnused())
return false;
@@ -876,8 +874,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
unsigned Reg = NewMIImplDefs[i];
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
- if (LiveInterval *LI = LIS->getCachedRegUnit(*Units))
- LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+ if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
+ LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
DEBUG(dbgs() << "Remat: " << *NewMI);
@@ -1048,7 +1046,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
if (CP.getSrcReg() == CP.getDstReg()) {
LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
- LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI));
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(CopyMI));
if (VNInfo *DefVNI = LRQ.valueDefined()) {
VNInfo *ReadVNI = LRQ.valueIn();
assert(ReadVNI && "No value before copy and no <undef> flag.");
@@ -1091,8 +1089,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
});
// When possible, let DstReg be the larger interval.
- if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
- LIS->getInterval(CP.getDstReg()).ranges.size())
+ if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
+ LIS->getInterval(CP.getDstReg()).size())
CP.flip();
}
@@ -1109,7 +1107,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
return true;
- // If we can eliminate the copy without merging the live ranges, do so now.
+ // If we can eliminate the copy without merging the live segments, do so
+ // now.
if (!CP.isPartial() && !CP.isPhys()) {
if (adjustCopiesBackFrom(CP, CopyMI) ||
removeCopyByCommutingDef(CP, CopyMI)) {
@@ -1157,10 +1156,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
DEBUG({
- dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI);
- if (!CP.isPhys())
+ dbgs() << "\tJoined. Result = ";
+ if (CP.isPhys())
+ dbgs() << PrintReg(CP.getDstReg(), TRI);
+ else
dbgs() << LIS->getInterval(CP.getDstReg());
- dbgs() << '\n';
+ dbgs() << '\n';
});
++numJoins;
@@ -1172,8 +1173,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
assert(CP.isPhys() && "Must be a physreg copy");
assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
- DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
- << '\n');
+ DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
assert(CP.isFlipped() && RHS.containsOneValue() &&
"Invalid join with reserved register");
@@ -1442,7 +1442,7 @@ VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
unsigned Reg = MI->getOperand(1).getReg();
if (!TargetRegisterInfo::isVirtualRegister(Reg))
break;
- LiveRangeQuery LRQ(LIS->getInterval(Reg), VNI->def);
+ LiveQueryResult LRQ = LIS->getInterval(Reg).Query(VNI->def);
if (!LRQ.valueIn())
break;
VNI = LRQ.valueIn();
@@ -1493,7 +1493,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
// The <read-undef> flag on the def operand means that old lane values are
// not important.
if (Redef) {
- V.RedefVNI = LiveRangeQuery(LI, VNI->def).valueIn();
+ V.RedefVNI = LI.Query(VNI->def).valueIn();
assert(V.RedefVNI && "Instruction is reading nonexistent value");
computeAssignment(V.RedefVNI->id, Other);
V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
@@ -1510,7 +1510,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
}
// Find the value in Other that overlaps VNI->def, if any.
- LiveRangeQuery OtherLRQ(Other.LI, VNI->def);
+ LiveQueryResult OtherLRQ = Other.LI.Query(VNI->def);
// It is possible that both values are defined by the same instruction, or
// the values are PHIs defined in the same block. When that happens, the two
@@ -1969,8 +1969,8 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
- DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
- << "\n\t\tLHS = " << PrintReg(CP.getDstReg()) << ' ' << LHS
+ DEBUG(dbgs() << "\t\tRHS = " << RHS
+ << "\n\t\tLHS = " << LHS
<< '\n');
// First compute NewVNInfo and the simple value mappings.
@@ -2001,8 +2001,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
LIS->shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
// Join RHS into LHS.
- LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo,
- MRI);
+ LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);
// Kill flags are going to be wrong if the live ranges were overlapping.
// Eventually, we should simply clear all kill flags when computing live
@@ -2017,7 +2016,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
// CR_Replace conflicts.
DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
<< " points: " << LHS << '\n');
- LIS->extendToIndices(&LHS, EndPoints);
+ LIS->extendToIndices(LHS, EndPoints);
return true;
}
@@ -2043,9 +2042,8 @@ struct MBBPriorityInfo {
// block (the unsigned), and then on the MBB number.
//
// EnableGlobalCopies assumes that the primary sort key is loop depth.
-static int compareMBBPriority(const void *L, const void *R) {
- const MBBPriorityInfo *LHS = static_cast<const MBBPriorityInfo*>(L);
- const MBBPriorityInfo *RHS = static_cast<const MBBPriorityInfo*>(R);
+static int compareMBBPriority(const MBBPriorityInfo *LHS,
+ const MBBPriorityInfo *RHS) {
// Deeper loops first
if (LHS->Depth != RHS->Depth)
return LHS->Depth > RHS->Depth ? -1 : 1;
@@ -2203,7 +2201,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
if (EnableGlobalCopies == cl::BOU_UNSET)
- JoinGlobalCopies = ST.enableMachineScheduler();
+ JoinGlobalCopies = ST.useMachineScheduler();
else
JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
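RegisterCoalescer is updated for the LiveInterval API rename: segments instead of ranges (FindSegmentContaining, addSegment, LiveInterval::Segment) and liveness queries through LiveInterval::Query() returning a LiveQueryResult instead of constructing a LiveRangeQuery. A condensed, hedged sketch of the query side as it is used in the hunks above:

    LiveInterval &LI = LIS->getInterval(Reg);
    LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(MI));
    if (LRQ.isKill())                     { /* Reg's last use is at this index  */ }
    if (VNInfo *In = LRQ.valueIn())       { /* value live into the instruction  */ }
    if (VNInfo *Def = LRQ.valueDefined()) { /* value defined by the instruction */ }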
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index b7ab138..092ecdd 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -25,53 +25,19 @@ using namespace llvm;
/// Increase pressure for each pressure set provided by TargetRegisterInfo.
static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
- std::vector<unsigned> &MaxSetPressure,
- const int *PSet, unsigned Weight) {
- for (; *PSet != -1; ++PSet) {
- CurrSetPressure[*PSet] += Weight;
- if (&CurrSetPressure != &MaxSetPressure
- && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) {
- MaxSetPressure[*PSet] = CurrSetPressure[*PSet];
- }
- }
+ PSetIterator PSetI) {
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI)
+ CurrSetPressure[*PSetI] += Weight;
}
/// Decrease pressure for each pressure set provided by TargetRegisterInfo.
static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
- const int *PSet, unsigned Weight) {
- for (; *PSet != -1; ++PSet) {
- assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow");
- CurrSetPressure[*PSet] -= Weight;
- }
-}
-
-/// Directly increase pressure only within this RegisterPressure result.
-void RegisterPressure::increase(unsigned Reg, const TargetRegisterInfo *TRI,
- const MachineRegisterInfo *MRI) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- increaseSetPressure(MaxSetPressure, MaxSetPressure,
- TRI->getRegClassPressureSets(RC),
- TRI->getRegClassWeight(RC).RegWeight);
- }
- else {
- increaseSetPressure(MaxSetPressure, MaxSetPressure,
- TRI->getRegUnitPressureSets(Reg),
- TRI->getRegUnitWeight(Reg));
- }
-}
-
-/// Directly decrease pressure only within this RegisterPressure result.
-void RegisterPressure::decrease(unsigned Reg, const TargetRegisterInfo *TRI,
- const MachineRegisterInfo *MRI) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- decreaseSetPressure(MaxSetPressure, TRI->getRegClassPressureSets(RC),
- TRI->getRegClassWeight(RC).RegWeight);
- }
- else {
- decreaseSetPressure(MaxSetPressure, TRI->getRegUnitPressureSets(Reg),
- TRI->getRegUnitWeight(Reg));
+ PSetIterator PSetI) {
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
+ CurrSetPressure[*PSetI] -= Weight;
}
}
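Both helpers now take a PSetIterator, which MachineRegisterInfo::getPressureSets() produces for either a virtual register or a register unit, folding the old vreg/physreg split (getRegClassPressureSets vs. getRegUnitPressureSets) into one walk that also carries the weight. Usage, as in the hunk above:

    PSetIterator PSetI = MRI->getPressureSets(RegUnit);
    unsigned Weight = PSetI.getWeight();
    for (; PSetI.isValid(); ++PSetI)
      CurrSetPressure[*PSetI] += Weight;   // bump every pressure set this reg maps to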
@@ -113,36 +79,23 @@ void RegPressureTracker::dump() const {
/// Increase the current pressure as impacted by these registers and bump
/// the high water mark if needed.
-void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> Regs) {
- for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
- if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
- const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
- increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
- TRI->getRegClassPressureSets(RC),
- TRI->getRegClassWeight(RC).RegWeight);
- }
- else {
- increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
- TRI->getRegUnitPressureSets(Regs[I]),
- TRI->getRegUnitWeight(Regs[I]));
+void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ CurrSetPressure[*PSetI] += Weight;
+ if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) {
+ P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI];
+ }
}
}
}
/// Simply decrease the current pressure as impacted by these registers.
-void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> Regs) {
- for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
- if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
- const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
- decreaseSetPressure(CurrSetPressure,
- TRI->getRegClassPressureSets(RC),
- TRI->getRegClassWeight(RC).RegWeight);
- }
- else {
- decreaseSetPressure(CurrSetPressure, TRI->getRegUnitPressureSets(Regs[I]),
- TRI->getRegUnitWeight(Regs[I]));
- }
- }
+void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) {
+ for (unsigned I = 0, E = RegUnits.size(); I != E; ++I)
+ decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I]));
}
/// Clear the result so it can be used for another round of pressure tracking.
@@ -194,12 +147,30 @@ void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
LiveInRegs.clear();
}
-const LiveInterval *RegPressureTracker::getInterval(unsigned Reg) const {
+const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const {
if (TargetRegisterInfo::isVirtualRegister(Reg))
return &LIS->getInterval(Reg);
return LIS->getCachedRegUnit(Reg);
}
+void RegPressureTracker::reset() {
+ MBB = 0;
+ LIS = 0;
+
+ CurrSetPressure.clear();
+ LiveThruPressure.clear();
+ P.MaxSetPressure.clear();
+
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).reset();
+ else
+ static_cast<RegionPressure&>(P).reset();
+
+ LiveRegs.PhysRegs.clear();
+ LiveRegs.VirtRegs.clear();
+ UntiedDefs.clear();
+}
+
/// Setup the RegPressureTracker.
///
/// TODO: Add support for pressure without LiveIntervals.
@@ -210,6 +181,8 @@ void RegPressureTracker::init(const MachineFunction *mf,
MachineBasicBlock::const_iterator pos,
bool ShouldTrackUntiedDefs)
{
+ reset();
+
MF = mf;
TRI = MF->getTarget().getRegisterInfo();
RCI = rci;
@@ -224,19 +197,11 @@ void RegPressureTracker::init(const MachineFunction *mf,
CurrPos = pos;
CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
- LiveThruPressure.clear();
- if (RequireIntervals)
- static_cast<IntervalPressure&>(P).reset();
- else
- static_cast<RegionPressure&>(P).reset();
P.MaxSetPressure = CurrSetPressure;
- LiveRegs.PhysRegs.clear();
LiveRegs.PhysRegs.setUniverse(TRI->getNumRegs());
- LiveRegs.VirtRegs.clear();
LiveRegs.VirtRegs.setUniverse(MRI->getNumVirtRegs());
- UntiedDefs.clear();
if (TrackUntiedDefs)
UntiedDefs.setUniverse(MRI->getNumVirtRegs());
}
@@ -328,24 +293,25 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
unsigned Reg = P.LiveOutRegs[i];
if (TargetRegisterInfo::isVirtualRegister(Reg)
&& !RPTracker.hasUntiedDef(Reg)) {
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- increaseSetPressure(LiveThruPressure, LiveThruPressure,
- TRI->getRegClassPressureSets(RC),
- TRI->getRegClassWeight(RC).RegWeight);
+ increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg));
}
}
}
/// \brief Convenient wrapper for checking membership in RegisterOperands.
-static bool containsReg(ArrayRef<unsigned> Regs, unsigned Reg) {
- return std::find(Regs.begin(), Regs.end(), Reg) != Regs.end();
+/// (std::count() doesn't have an early exit).
+static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) {
+ return std::find(RegUnits.begin(), RegUnits.end(), RegUnit) != RegUnits.end();
}
/// Collect this instruction's unique uses and defs into SmallVectors for
/// processing defs and uses in order.
+///
+/// FIXME: always ignore tied opers
class RegisterOperands {
const TargetRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
+ bool IgnoreDead;
public:
SmallVector<unsigned, 8> Uses;
@@ -353,7 +319,8 @@ public:
SmallVector<unsigned, 8> DeadDefs;
RegisterOperands(const TargetRegisterInfo *tri,
- const MachineRegisterInfo *mri): TRI(tri), MRI(mri) {}
+ const MachineRegisterInfo *mri, bool ID = false):
+ TRI(tri), MRI(mri), IgnoreDead(ID) {}
/// Push this operand's register onto the correct vector.
void collect(const MachineOperand &MO) {
@@ -362,25 +329,27 @@ public:
if (MO.readsReg())
pushRegUnits(MO.getReg(), Uses);
if (MO.isDef()) {
- if (MO.isDead())
- pushRegUnits(MO.getReg(), DeadDefs);
+ if (MO.isDead()) {
+ if (!IgnoreDead)
+ pushRegUnits(MO.getReg(), DeadDefs);
+ }
else
pushRegUnits(MO.getReg(), Defs);
}
}
protected:
- void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &Regs) {
+ void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) {
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (containsReg(Regs, Reg))
+ if (containsReg(RegUnits, Reg))
return;
- Regs.push_back(Reg);
+ RegUnits.push_back(Reg);
}
else if (MRI->isAllocatable(Reg)) {
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
- if (containsReg(Regs, *Units))
+ if (containsReg(RegUnits, *Units))
continue;
- Regs.push_back(*Units);
+ RegUnits.push_back(*Units);
}
}
}
@@ -399,6 +368,56 @@ static void collectOperands(const MachineInstr *MI,
RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
}
+/// Initialize an array of N PressureDiffs.
+void PressureDiffs::init(unsigned N) {
+ Size = N;
+ if (N <= Max) {
+ memset(PDiffArray, 0, N * sizeof(PressureDiff));
+ return;
+ }
+ Max = Size;
+ free(PDiffArray);
+ PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff)));
+}
+
+/// Add a change in pressure to the pressure diff of a given instruction.
+void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec,
+ const MachineRegisterInfo *MRI) {
+ PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+ int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ // Find an existing entry in the pressure diff for this PSet.
+ PressureDiff::iterator I = begin(), E = end();
+ for (; I != E && I->isValid(); ++I) {
+ if (I->getPSet() >= *PSetI)
+ break;
+ }
+ // If all pressure sets are more constrained, skip the remaining PSets.
+ if (I == E)
+ break;
+ // Insert this PressureChange.
+ if (!I->isValid() || I->getPSet() != *PSetI) {
+ PressureChange PTmp = PressureChange(*PSetI);
+ for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J)
+ std::swap(*J,PTmp);
+ }
+ // Update the units for this pressure set.
+ I->setUnitInc(I->getUnitInc() + Weight);
+ }
+}
+
+/// Record the pressure difference induced by the given operand list.
+static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers,
+ const MachineRegisterInfo *MRI) {
+ assert(!PDiff.begin()->isValid() && "stale PDiff");
+
+ for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i)
+ PDiff.addPressureChange(RegOpers.Defs[i], true, MRI);
+
+ for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i)
+ PDiff.addPressureChange(RegOpers.Uses[i], false, MRI);
+}
+
/// Force liveness of registers.
void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
@@ -415,7 +434,7 @@ void RegPressureTracker::discoverLiveIn(unsigned Reg) {
// At live in discovery, unconditionally increase the high water mark.
P.LiveInRegs.push_back(Reg);
- P.increase(Reg, TRI, MRI);
+ increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
}
/// Add Reg to the live out set and increase max pressure.
@@ -426,11 +445,16 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) {
// At live out discovery, unconditionally increase the high water mark.
P.LiveOutRegs.push_back(Reg);
- P.increase(Reg, TRI, MRI);
+ increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
}
-/// Recede across the previous instruction.
-bool RegPressureTracker::recede() {
+/// Recede across the previous instruction. If LiveUses is provided, record any
+/// RegUnits that are made live by the current instruction's uses. This includes
+/// registers that are both defined and used by the instruction. If a pressure
+/// difference pointer is provided, record the changes in pressure caused by this
+/// instruction independent of liveness.
+bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
+ PressureDiff *PDiff) {
// Check for the top of the analyzable region.
if (CurrPos == MBB->begin()) {
closeRegion();
@@ -463,6 +487,9 @@ bool RegPressureTracker::recede() {
RegisterOperands RegOpers(TRI, MRI);
collectOperands(CurrPos, RegOpers);
+ if (PDiff)
+ collectPDiff(*PDiff, RegOpers, MRI);
+
// Boost pressure for all dead defs together.
increaseRegPressure(RegOpers.DeadDefs);
decreaseRegPressure(RegOpers.DeadDefs);
@@ -471,10 +498,20 @@ bool RegPressureTracker::recede() {
// TODO: consider earlyclobbers?
for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
unsigned Reg = RegOpers.Defs[i];
- if (LiveRegs.erase(Reg))
- decreaseRegPressure(Reg);
- else
- discoverLiveOut(Reg);
+ bool DeadDef = false;
+ if (RequireIntervals) {
+ const LiveRange *LR = getLiveRange(Reg);
+ if (LR) {
+ LiveQueryResult LRQ = LR->Query(SlotIdx);
+ DeadDef = LRQ.isDeadDef();
+ }
+ }
+ if (!DeadDef) {
+ if (LiveRegs.erase(Reg))
+ decreaseRegPressure(Reg);
+ else
+ discoverLiveOut(Reg);
+ }
}
// Generate liveness for uses.
@@ -483,12 +520,17 @@ bool RegPressureTracker::recede() {
if (!LiveRegs.contains(Reg)) {
// Adjust liveouts if LiveIntervals are available.
if (RequireIntervals) {
- const LiveInterval *LI = getInterval(Reg);
- if (LI && !LI->killedAt(SlotIdx))
- discoverLiveOut(Reg);
+ const LiveRange *LR = getLiveRange(Reg);
+ if (LR) {
+ LiveQueryResult LRQ = LR->Query(SlotIdx);
+ if (!LRQ.isKill() && !LRQ.valueDefined())
+ discoverLiveOut(Reg);
+ }
}
increaseRegPressure(Reg);
LiveRegs.insert(Reg);
+ if (LiveUses && !containsReg(*LiveUses, Reg))
+ LiveUses->push_back(Reg);
}
}
if (TrackUntiedDefs) {
@@ -537,8 +579,8 @@ bool RegPressureTracker::advance() {
// Kill liveness at last uses.
bool lastUse = false;
if (RequireIntervals) {
- const LiveInterval *LI = getInterval(Reg);
- lastUse = LI && LI->killedAt(SlotIdx);
+ const LiveRange *LR = getLiveRange(Reg);
+ lastUse = LR && LR->Query(SlotIdx).isKill();
}
else {
// Allocatable physregs are always single-use before register rewriting.
@@ -576,8 +618,7 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
RegPressureDelta &Delta,
const RegisterClassInfo *RCI,
ArrayRef<unsigned> LiveThruPressureVec) {
- int ExcessUnits = 0;
- unsigned PSetID = ~0U;
+ Delta.Excess = PressureChange();
for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
unsigned POld = OldPressureVec[i];
unsigned PNew = NewPressureVec[i];
@@ -599,13 +640,11 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
PDiff = Limit - POld; // Just obeyed limit.
if (PDiff) {
- ExcessUnits = PDiff;
- PSetID = i;
+ Delta.Excess = PressureChange(i);
+ Delta.Excess.setUnitInc(PDiff);
break;
}
}
- Delta.Excess.PSetID = PSetID;
- Delta.Excess.UnitIncrease = ExcessUnits;
}
/// Find the max change in max pressure that either surpasses a critical PSet
@@ -616,11 +655,11 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
/// RegPressureTracker API change to work with pressure differences.
static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
ArrayRef<unsigned> NewMaxPressureVec,
- ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<PressureChange> CriticalPSets,
ArrayRef<unsigned> MaxPressureLimit,
RegPressureDelta &Delta) {
- Delta.CriticalMax = PressureElement();
- Delta.CurrentMax = PressureElement();
+ Delta.CriticalMax = PressureChange();
+ Delta.CurrentMax = PressureChange();
unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
@@ -630,27 +669,24 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
continue;
if (!Delta.CriticalMax.isValid()) {
- while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i)
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < i)
++CritIdx;
- if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) {
- int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease;
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
+ int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
if (PDiff > 0) {
- Delta.CriticalMax.PSetID = i;
- Delta.CriticalMax.UnitIncrease = PDiff;
+ Delta.CriticalMax = PressureChange(i);
+ Delta.CriticalMax.setUnitInc(PDiff);
}
}
}
// Find the first increase above MaxPressureLimit.
// (Ignores negative MDiff).
- if (!Delta.CurrentMax.isValid()) {
- int MDiff = (int)PNew - (int)MaxPressureLimit[i];
- if (MDiff > 0) {
- Delta.CurrentMax.PSetID = i;
- Delta.CurrentMax.UnitIncrease = MDiff;
- if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
- break;
- }
+ if (!Delta.CurrentMax.isValid() && PNew > MaxPressureLimit[i]) {
+ Delta.CurrentMax = PressureChange(i);
+ Delta.CurrentMax.setUnitInc(PNew - POld);
+ if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
+ break;
}
}
}
@@ -665,7 +701,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
// Account for register pressure similar to RegPressureTracker::recede().
- RegisterOperands RegOpers(TRI, MRI);
+ RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true);
collectOperands(MI, RegOpers);
// Boost max pressure for all dead defs together.
@@ -676,8 +712,19 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
// Kill liveness at live defs.
for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
unsigned Reg = RegOpers.Defs[i];
- if (!containsReg(RegOpers.Uses, Reg))
- decreaseRegPressure(Reg);
+ bool DeadDef = false;
+ if (RequireIntervals) {
+ const LiveRange *LR = getLiveRange(Reg);
+ if (LR) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
+ LiveQueryResult LRQ = LR->Query(SlotIdx);
+ DeadDef = LRQ.isDeadDef();
+ }
+ }
+ if (!DeadDef) {
+ if (!containsReg(RegOpers.Uses, Reg))
+ decreaseRegPressure(Reg);
+ }
}
// Generate liveness for uses.
for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
@@ -699,8 +746,9 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
/// result per-SUnit with enough information to adjust for the current
/// scheduling position. But this works as a proof of concept.
void RegPressureTracker::
-getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
- ArrayRef<PressureElement> CriticalPSets,
+getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
ArrayRef<unsigned> MaxPressureLimit) {
// Snapshot Pressure.
// FIXME: The snapshot heap space should persist. But I'm planning to
@@ -714,12 +762,113 @@ getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
LiveThruPressure);
computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
MaxPressureLimit, Delta);
- assert(Delta.CriticalMax.UnitIncrease >= 0 &&
- Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+ assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+ Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
// Restore the tracker's state.
P.MaxSetPressure.swap(SavedMaxPressure);
CurrSetPressure.swap(SavedPressure);
+
+#ifndef NDEBUG
+ if (!PDiff)
+ return;
+
+ // Check if the alternate algorithm yields the same result.
+ RegPressureDelta Delta2;
+ getUpwardPressureDelta(MI, *PDiff, Delta2, CriticalPSets, MaxPressureLimit);
+ if (Delta != Delta2) {
+ dbgs() << "DELTA: " << *MI;
+ if (Delta.Excess.isValid())
+ dbgs() << "Excess1 " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
+ << " " << Delta.Excess.getUnitInc() << "\n";
+ if (Delta.CriticalMax.isValid())
+ dbgs() << "Critic1 " << TRI->getRegPressureSetName(Delta.CriticalMax.getPSet())
+ << " " << Delta.CriticalMax.getUnitInc() << "\n";
+ if (Delta.CurrentMax.isValid())
+ dbgs() << "CurrMx1 " << TRI->getRegPressureSetName(Delta.CurrentMax.getPSet())
+ << " " << Delta.CurrentMax.getUnitInc() << "\n";
+ if (Delta2.Excess.isValid())
+ dbgs() << "Excess2 " << TRI->getRegPressureSetName(Delta2.Excess.getPSet())
+ << " " << Delta2.Excess.getUnitInc() << "\n";
+ if (Delta2.CriticalMax.isValid())
+ dbgs() << "Critic2 " << TRI->getRegPressureSetName(Delta2.CriticalMax.getPSet())
+ << " " << Delta2.CriticalMax.getUnitInc() << "\n";
+ if (Delta2.CurrentMax.isValid())
+ dbgs() << "CurrMx2 " << TRI->getRegPressureSetName(Delta2.CurrentMax.getPSet())
+ << " " << Delta2.CurrentMax.getUnitInc() << "\n";
+ llvm_unreachable("RegP Delta Mismatch");
+ }
+#endif
+}
+
+/// This is a prototype of the fast version of querying register pressure that
+/// does not directly depend on current liveness. It's still slow because we
+/// recompute pressure change on-the-fly. This implementation only exists to
+/// prove correctness.
+///
+/// @param Delta captures information needed for heuristics.
+///
+/// @param CriticalPSets Are the pressure sets that are known to exceed some
+/// limit within the region, not necessarily at the current position.
+///
+/// @param MaxPressureLimit Is the max pressure within the region, not
+/// necessarily at the current position.
+void RegPressureTracker::
+getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) const {
+ unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+ for (PressureDiff::const_iterator
+ PDiffI = PDiff.begin(), PDiffE = PDiff.end();
+ PDiffI != PDiffE && PDiffI->isValid(); ++PDiffI) {
+
+ unsigned PSetID = PDiffI->getPSet();
+ unsigned Limit = RCI->getRegPressureSetLimit(PSetID);
+ if (!LiveThruPressure.empty())
+ Limit += LiveThruPressure[PSetID];
+
+ unsigned POld = CurrSetPressure[PSetID];
+ unsigned MOld = P.MaxSetPressure[PSetID];
+ unsigned MNew = MOld;
+ // Ignore DeadDefs here because they aren't captured by PressureChange.
+ unsigned PNew = POld + PDiffI->getUnitInc();
+ assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) && "PSet overflow");
+ if (PNew > MOld)
+ MNew = PNew;
+ // Check if current pressure has exceeded the limit.
+ if (!Delta.Excess.isValid()) {
+ unsigned ExcessInc = 0;
+ if (PNew > Limit)
+ ExcessInc = POld > Limit ? PNew - POld : PNew - Limit;
+ else if (POld > Limit)
+ ExcessInc = Limit - POld;
+ if (ExcessInc) {
+ Delta.Excess = PressureChange(PSetID);
+ Delta.Excess.setUnitInc(ExcessInc);
+ }
+ }
+ // Check if max pressure has exceeded a critical pressure set max.
+ if (MNew == MOld)
+ continue;
+ if (!Delta.CriticalMax.isValid()) {
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < PSetID)
+ ++CritIdx;
+
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
+ int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ if (CritInc > 0 && CritInc <= INT16_MAX) {
+ Delta.CriticalMax = PressureChange(PSetID);
+ Delta.CriticalMax.setUnitInc(CritInc);
+ }
+ }
+ }
+ // Check if max pressure has exceeded the current max.
+ if (!Delta.CurrentMax.isValid() && MNew > MaxPressureLimit[PSetID]) {
+ Delta.CurrentMax = PressureChange(PSetID);
+ Delta.CurrentMax.setUnitInc(MNew - MOld);
+ }
+ }
}
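
For context, a minimal sketch (not part of this patch) of how a scheduling
heuristic might consume the delta API above; RPTracker, TRI, CriticalPSets and
MaxPressureLimit are assumed to be set up by the surrounding scheduler code:

    // Ask how scheduling MI at the current (upward) position changes pressure.
    RegPressureDelta Delta;
    RPTracker.getMaxUpwardPressureDelta(MI, /*PDiff=*/0, Delta,
                                        CriticalPSets, MaxPressureLimit);
    if (Delta.Excess.isValid())
      dbgs() << "set " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
             << " would exceed its limit by " << Delta.Excess.getUnitInc()
             << " units\n";
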
/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
@@ -765,10 +914,12 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
// FIXME: allow the caller to pass in the list of vreg uses that remain
// to be bottom-scheduled to avoid searching uses at each query.
SlotIndex CurrIdx = getCurrSlot();
- const LiveInterval *LI = getInterval(Reg);
- if (LI && LI->killedAt(SlotIdx)
- && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
- decreaseRegPressure(Reg);
+ const LiveRange *LR = getLiveRange(Reg);
+ if (LR) {
+ LiveQueryResult LRQ = LR->Query(SlotIdx);
+ if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
+ decreaseRegPressure(Reg);
+ }
}
}
else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
@@ -793,7 +944,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
/// This assumes that the current LiveIn set is sufficient.
void RegPressureTracker::
getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
- ArrayRef<PressureElement> CriticalPSets,
+ ArrayRef<PressureChange> CriticalPSets,
ArrayRef<unsigned> MaxPressureLimit) {
// Snapshot Pressure.
std::vector<unsigned> SavedPressure = CurrSetPressure;
@@ -805,8 +956,8 @@ getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
LiveThruPressure);
computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
MaxPressureLimit, Delta);
- assert(Delta.CriticalMax.UnitIncrease >= 0 &&
- Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+ assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+ Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
// Restore the tracker's state.
P.MaxSetPressure.swap(SavedMaxPressure);
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 892903c..7f1f9c4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,6 +36,8 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
using namespace llvm;
static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -178,14 +180,11 @@ void ScheduleDAGInstrs::finishBlock() {
void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
- unsigned endcount) {
+ unsigned regioninstrs) {
assert(bb == BB && "startBlock should set BB");
RegionBegin = begin;
RegionEnd = end;
- EndIndex = endcount;
- MISUnitMap.clear();
-
- ScheduleDAG::clearDAG();
+ NumRegionInstrs = regioninstrs;
}
/// Close the current scheduling region. Don't clear any state in case the
@@ -405,9 +404,19 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
unsigned Reg = MI->getOperand(OperIdx).getReg();
+ // Record this local VReg use.
+ VReg2UseMap::iterator UI = VRegUses.find(Reg);
+ for (; UI != VRegUses.end(); ++UI) {
+ if (UI->SU == SU)
+ break;
+ }
+ if (UI == VRegUses.end())
+ VRegUses.insert(VReg2SUnit(Reg, SU));
+
// Lookup this operand's reaching definition.
assert(LIS && "vreg dependencies requires LiveIntervals");
- LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI));
+ LiveQueryResult LRQ
+ = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI));
VNInfo *VNI = LRQ.valueIn();
// VNI will be valid because MachineOperand::readsReg() is checked by caller.
@@ -635,8 +644,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
bool isNormalMemory = false) {
// If this is a false dependency,
  // do not add the edge, but remember the rejected node.
- if (!EnableAASchedMI ||
- MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+ if (!AA || MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
Dep.setLatency(TrueMemOrderLatency);
SUb->addPred(Dep);
@@ -664,7 +672,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
void ScheduleDAGInstrs::initSUnits() {
// We'll be allocating one SUnit for each real instruction in the region,
// which is contained within a basic block.
- SUnits.reserve(BB->size());
+ SUnits.reserve(NumRegionInstrs);
for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) {
MachineInstr *MI = I;
@@ -686,10 +694,22 @@ void ScheduleDAGInstrs::initSUnits() {
/// DAG builder is an efficient place to do it because it already visits
/// operands.
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
- RegPressureTracker *RPTracker) {
+ RegPressureTracker *RPTracker,
+ PressureDiffs *PDiffs) {
+ const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+ bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
+ : ST.useAA();
+ AliasAnalysis *AAForDep = UseAA ? AA : 0;
+
+ MISUnitMap.clear();
+ ScheduleDAG::clearDAG();
+
// Create an SUnit for each real instruction.
initSUnits();
+ if (PDiffs)
+ PDiffs->init(SUnits.size());
+
// We build scheduling units by walking a block's instruction list from bottom
// to top.
@@ -715,10 +735,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
Uses.setUniverse(TRI->getNumRegs());
assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
- // FIXME: Allow SparseSet to reserve space for the creation of virtual
- // registers during scheduling. Don't artificially inflate the Universe
- // because we want to assert that vregs are not created during DAG building.
+ VRegUses.clear();
VRegDefs.setUniverse(MRI.getNumVirtRegs());
+ VRegUses.setUniverse(MRI.getNumVirtRegs());
// Model data dependencies between instructions being scheduled and the
// ExitSU.
@@ -738,17 +757,18 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
DbgMI = MI;
continue;
}
+ SUnit *SU = MISUnitMap[MI];
+ assert(SU && "No SUnit mapped to this MI");
+
if (RPTracker) {
- RPTracker->recede();
+ PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : 0;
+ RPTracker->recede(/*LiveUses=*/0, PDiff);
assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
}
assert((CanHandleTerminators || (!MI->isTerminator() && !MI->isLabel())) &&
"Cannot schedule terminators or labels!");
- SUnit *SU = MISUnitMap[MI];
- assert(SU && "No SUnit mapped to this MI");
-
// Add register-based dependencies (data, anti, and output).
bool HasVRegDef = false;
for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
@@ -826,20 +846,20 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
unsigned ChainLatency = 0;
if (AliasChain->getInstr()->mayLoad())
ChainLatency = TrueMemOrderLatency;
- addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+ addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes,
ChainLatency);
}
AliasChain = SU;
for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
- addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+ addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
TrueMemOrderLatency);
for (MapVector<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
E = AliasMemDefs.end(); I != E; ++I)
- addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+ addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
for (MapVector<const Value *, std::vector<SUnit *> >::iterator I =
AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+ addChainDependency(AAForDep, MFI, SU, I->second[i], RejectMemNodes,
TrueMemOrderLatency);
}
adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -872,7 +892,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
MapVector<const Value *, SUnit *>::iterator IE =
((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
if (I != IE) {
- addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+ addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+ 0, true);
I->second = SU;
} else {
if (ThisMayAlias)
@@ -887,7 +908,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
if (J != JE) {
for (unsigned i = 0, e = J->second.size(); i != e; ++i)
- addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+ addChainDependency(AAForDep, MFI, SU, J->second[i], RejectMemNodes,
TrueMemOrderLatency, true);
J->second.clear();
}
@@ -896,11 +917,11 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// Add dependencies from all the PendingLoads, i.e. loads
// with no underlying object.
for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
- addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+ addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
TrueMemOrderLatency);
// Add dependence on alias chain, if needed.
if (AliasChain)
- addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+ addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
// But we also should check dependent instructions for the
// SU in question.
adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -930,7 +951,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// potentially aliasing stores.
for (MapVector<const Value *, SUnit *>::iterator I =
AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
- addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+ addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
PendingLoads.push_back(SU);
MayAlias = true;
@@ -952,7 +973,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
MapVector<const Value *, SUnit *>::iterator IE =
((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
if (I != IE)
- addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+ addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+ 0, true);
if (ThisMayAlias)
AliasMemUses[V].push_back(SU);
else
@@ -962,7 +984,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
// Add dependencies on alias and barrier chains, if needed.
if (MayAlias && AliasChain)
- addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+ addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
if (BarrierChain)
BarrierChain->addPred(SDep(SU, SDep::Barrier));
}
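
A rough usage sketch for the extended DAG-building entry point; the
PressureDiffs container and the RegPressureTracker are assumed to be owned by
the caller (as the machine scheduler does), so this fragment is illustrative
rather than part of the change:

    // Build the DAG, recording one PressureDiff per SUnit while receding.
    PressureDiffs SUPressureDiffs;
    buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
    // The recorded diff for a node can later seed the fast pressure query.
    PressureDiff &PDiff = SUPressureDiffs[SU->NodeNum];
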
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cb88941..43f72c5 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,8 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
@@ -43,6 +45,7 @@ STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
+STATISTIC(SlicedLoads, "Number of loads sliced");
namespace {
static cl::opt<bool>
@@ -53,6 +56,14 @@ namespace {
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
cl::desc("Include global information in alias analysis"));
+ /// Hidden option to stress test load slicing, i.e., when this option
+ /// is enabled, load slicing bypasses most of its profitability guards.
+ static cl::opt<bool>
+ StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+ cl::desc("Bypass the profitability model of load "
+ "slicing"),
+ cl::init(false));
+
//------------------------------ DAGCombiner ---------------------------------//
class DAGCombiner {
@@ -62,6 +73,7 @@ namespace {
CodeGenOpt::Level OptLevel;
bool LegalOperations;
bool LegalTypes;
+ bool ForCodeSize;
// Worklist of all of the nodes that need to be simplified.
//
@@ -144,6 +156,7 @@ namespace {
bool CombineToPreIndexedLoadStore(SDNode *N);
bool CombineToPostIndexedLoadStore(SDNode *N);
+ bool SliceUpLoad(SDNode *N);
void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
@@ -283,11 +296,11 @@ namespace {
/// isAlias - Return true if there is any possibility that the two addresses
/// overlap.
- bool isAlias(SDValue Ptr1, int64_t Size1,
+ bool isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1,
const Value *SrcValue1, int SrcValueOffset1,
unsigned SrcValueAlign1,
const MDNode *TBAAInfo1,
- SDValue Ptr2, int64_t Size2,
+ SDValue Ptr2, int64_t Size2, bool IsVolatile2,
const Value *SrcValue2, int SrcValueOffset2,
unsigned SrcValueAlign2,
const MDNode *TBAAInfo2) const;
@@ -299,7 +312,7 @@ namespace {
/// FindAliasInfo - Extracts the relevant alias information from the memory
/// node. Returns true if the operand was a load.
bool FindAliasInfo(SDNode *N,
- SDValue &Ptr, int64_t &Size,
+ SDValue &Ptr, int64_t &Size, bool &IsVolatile,
const Value *&SrcValue, int &SrcValueOffset,
unsigned &SrcValueAlignment,
const MDNode *&TBAAInfo) const;
@@ -315,8 +328,15 @@ namespace {
public:
DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
- : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
+ : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+ OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+ AttributeSet FnAttrs =
+ DAG.getMachineFunction().getFunction()->getAttributes();
+ ForCodeSize =
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ }
/// Run - runs the dag combiner on all nodes in the work list
void Run(CombineLevel AtLevel);
@@ -329,7 +349,8 @@ namespace {
assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
if (LHSTy.isVector())
return LHSTy;
- return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy) : TLI.getPointerTy();
+ return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy)
+ : TLI.getPointerTy();
}
/// isTypeLegal - This method returns true if we are running before type
@@ -744,9 +765,7 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
Replace = true;
return DAG.getExtLoad(ExtType, dl, PVT,
LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(),
- MemVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->getAlignment());
+ MemVT, LD->getMemOperand());
}
unsigned Opc = Op.getOpcode();
@@ -967,9 +986,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
: LD->getExtensionType();
SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(),
- MemVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->getAlignment());
+ MemVT, LD->getMemOperand());
SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD);
DEBUG(dbgs() << "\nPromoting ";
@@ -1017,7 +1034,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
// try and combine it.
while (!WorkListContents.empty()) {
SDNode *N;
- // The WorkListOrder holds the SDNodes in order, but it may contain duplicates.
+ // The WorkListOrder holds the SDNodes in order, but it may contain
+ // duplicates.
// In order to avoid a linear scan, we use a set (O(log N)) to hold what the
    // worklist *should* contain, and check that the node we want to visit should
// actually be visited.
@@ -1617,19 +1635,8 @@ static SDValue tryFoldToZero(SDLoc DL, const TargetLowering &TLI, EVT VT,
bool LegalOperations, bool LegalTypes) {
if (!VT.isVector())
return DAG.getConstant(0, VT);
- if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
- // Produce a vector of zeros.
- EVT ElemTy = VT.getVectorElementType();
- if (LegalTypes && TLI.getTypeAction(*DAG.getContext(), ElemTy) ==
- TargetLowering::TypePromoteInteger)
- ElemTy = TLI.getTypeToTransformTo(*DAG.getContext(), ElemTy);
- assert((!LegalTypes || TLI.isTypeLegal(ElemTy)) &&
- "Type for zero vector elements is not legal");
- SDValue El = DAG.getConstant(0, ElemTy);
- std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
- &Ops[0], Ops.size());
- }
+ if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return DAG.getConstant(0, VT);
return SDValue();
}
@@ -1771,8 +1778,8 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) {
return SDValue();
}
-/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
-/// all the same constant or undefined.
+/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose
+/// elements are all the same constant or undefined.
static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) {
BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(N);
if (!C)
@@ -1808,9 +1815,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
N1IsConst = isConstantSplatVector(N1.getNode(), ConstValue1);
} else {
N0IsConst = dyn_cast<ConstantSDNode>(N0) != 0;
- ConstValue0 = N0IsConst? (dyn_cast<ConstantSDNode>(N0))->getAPIntValue() : APInt();
+ ConstValue0 = N0IsConst ? (dyn_cast<ConstantSDNode>(N0))->getAPIntValue()
+ : APInt();
N1IsConst = dyn_cast<ConstantSDNode>(N1) != 0;
- ConstValue1 = N1IsConst? (dyn_cast<ConstantSDNode>(N1))->getAPIntValue() : APInt();
+ ConstValue1 = N1IsConst ? (dyn_cast<ConstantSDNode>(N1))->getAPIntValue()
+ : APInt();
}
// fold (mul c1, c2) -> c1*c2
@@ -1823,20 +1832,24 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// fold (mul x, 0) -> 0
if (N1IsConst && ConstValue1 == 0)
return N1;
+ // We require a splat of the entire scalar bit width for non-contiguous
+ // bit patterns.
+ bool IsFullSplat =
+ ConstValue1.getBitWidth() == VT.getScalarType().getSizeInBits();
// fold (mul x, 1) -> x
- if (N1IsConst && ConstValue1 == 1)
+ if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
return N0;
// fold (mul x, -1) -> 0-x
if (N1IsConst && ConstValue1.isAllOnesValue())
return DAG.getNode(ISD::SUB, SDLoc(N), VT,
DAG.getConstant(0, VT), N0);
// fold (mul x, (1 << c)) -> x << c
- if (N1IsConst && ConstValue1.isPowerOf2())
+ if (N1IsConst && ConstValue1.isPowerOf2() && IsFullSplat)
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
DAG.getConstant(ConstValue1.logBase2(),
getShiftAmountTy(N0.getValueType())));
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
- if (N1IsConst && (-ConstValue1).isPowerOf2()) {
+ if (N1IsConst && (-ConstValue1).isPowerOf2() && IsFullSplat) {
unsigned Log2Val = (-ConstValue1).logBase2();
// FIXME: If the input is something that is easily negated (e.g. a
// single-use add), we should put the negate there.
@@ -2675,6 +2688,19 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
}
}
+ // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
+ if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
+ Op0 == Op1 && LL.getValueType().isInteger() &&
+ Op0 == ISD::SETNE && ((cast<ConstantSDNode>(LR)->isNullValue() &&
+ cast<ConstantSDNode>(RR)->isAllOnesValue()) ||
+ (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
+ cast<ConstantSDNode>(RR)->isNullValue()))) {
+ SDValue ADDNode = DAG.getNode(ISD::ADD, SDLoc(N0), LL.getValueType(),
+ LL, DAG.getConstant(1, LL.getValueType()));
+ AddToWorkList(ADDNode.getNode());
+ return DAG.getSetCC(SDLoc(N), VT, ADDNode,
+ DAG.getConstant(2, LL.getValueType()), ISD::SETUGE);
+ }
// canonicalize equivalent to ll == rl
if (LL == RR && LR == RL) {
Op1 = ISD::getSetCCSwappedOperands(Op1);
@@ -2718,9 +2744,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), MemVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ MemVT, LN0->getMemOperand());
AddToWorkList(N);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -2739,11 +2763,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
((!LegalOperations && !LN0->isVolatile()) ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
- LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- MemVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getChain(), LN0->getBasePtr(),
+ MemVT, LN0->getMemOperand());
AddToWorkList(N);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -2773,10 +2794,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue NewLoad =
DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
- LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(),
- ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getChain(), LN0->getBasePtr(), ExtVT,
+ LN0->getMemOperand());
AddToWorkList(N);
CombineTo(LN0, NewLoad, NewLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -2812,7 +2831,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
LN0->getChain(), NewPtr,
LN0->getPointerInfo(),
ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
- Alignment);
+ Alignment, LN0->getTBAAInfo());
AddToWorkList(N);
CombineTo(LN0, Load, Load.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -2848,6 +2867,14 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
}
+ // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
+ if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
+ SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false);
+ if (BSwap.getNode())
+ return BSwap;
+ }
+
return SDValue();
}
@@ -2932,13 +2959,23 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
if (N00 != N10)
return SDValue();
- // Make sure everything beyond the low halfword is zero since the SRL 16
- // will clear the top bits.
+ // Make sure everything beyond the low halfword gets set to zero since the SRL
+ // 16 will clear the top bits.
unsigned OpSizeInBits = VT.getSizeInBits();
- if (DemandHighBits && OpSizeInBits > 16 &&
- (!LookPassAnd0 || !LookPassAnd1) &&
- !DAG.MaskedValueIsZero(N10, APInt::getHighBitsSet(OpSizeInBits, 16)))
- return SDValue();
+ if (DemandHighBits && OpSizeInBits > 16) {
+ // If the left-shift isn't masked out then the only way this is a bswap is
+ // if all bits beyond the low 8 are 0. In that case the entire pattern
+ // reduces to a left shift anyway: leave it for other parts of the combiner.
+ if (!LookPassAnd0)
+ return SDValue();
+
+ // However, if the right shift isn't masked out then it might be because
+ // it's not needed. See if we can spot that too.
+ if (!LookPassAnd1 &&
+ !DAG.MaskedValueIsZero(
+ N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
+ return SDValue();
+ }
SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
if (OpSizeInBits > 16)
@@ -3078,7 +3115,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
SDValue BSwap = DAG.getNode(ISD::BSWAP, SDLoc(N), VT,
SDValue(Parts[0],0));
- // Result of the bswap should be rotated by 16. If it's not legal, than
+ // Result of the bswap should be rotated by 16. If it's not legal, then
// do (x << 16) | (x >> 16).
SDValue ShAmt = DAG.getConstant(16, getShiftAmountTy(VT));
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
@@ -3343,29 +3380,9 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
if (LHSMask.getNode() || RHSMask.getNode())
return 0;
- // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotl x, y)
- // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotr x, (sub 32, y))
- if (RHSShiftAmt.getOpcode() == ISD::SUB &&
- LHSShiftAmt == RHSShiftAmt.getOperand(1)) {
- if (ConstantSDNode *SUBC =
- dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
- if (SUBC->getAPIntValue() == OpSizeInBits)
- return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
- HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
- }
- }
-
- // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotr x, y)
- // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotl x, (sub 32, y))
- if (LHSShiftAmt.getOpcode() == ISD::SUB &&
- RHSShiftAmt == LHSShiftAmt.getOperand(1))
- if (ConstantSDNode *SUBC =
- dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0)))
- if (SUBC->getAPIntValue() == OpSizeInBits)
- return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
- HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
-
- // Look for sign/zext/any-extended or truncate cases:
+ // If the shift amount is sign/zext/any-extended just peel it off.
+ SDValue LExtOp0 = LHSShiftAmt;
+ SDValue RExtOp0 = RHSShiftAmt;
if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
@@ -3374,33 +3391,31 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
- SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
- SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
- if (RExtOp0.getOpcode() == ISD::SUB &&
- RExtOp0.getOperand(1) == LExtOp0) {
- // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
- // (rotl x, y)
- // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
- // (rotr x, (sub 32, y))
- if (ConstantSDNode *SUBC =
+ LExtOp0 = LHSShiftAmt.getOperand(0);
+ RExtOp0 = RHSShiftAmt.getOperand(0);
+ }
+
+ if (RExtOp0.getOpcode() == ISD::SUB && RExtOp0.getOperand(1) == LExtOp0) {
+ // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+ // (rotl x, y)
+ // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+ // (rotr x, (sub 32, y))
+ if (ConstantSDNode *SUBC =
dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0)))
- if (SUBC->getAPIntValue() == OpSizeInBits)
- return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
- LHSShiftArg,
- HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
- } else if (LExtOp0.getOpcode() == ISD::SUB &&
- RExtOp0 == LExtOp0.getOperand(1)) {
- // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
- // (rotr x, y)
- // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
- // (rotl x, (sub 32, y))
- if (ConstantSDNode *SUBC =
+ if (SUBC->getAPIntValue() == OpSizeInBits)
+ return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+ HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+ } else if (LExtOp0.getOpcode() == ISD::SUB &&
+ RExtOp0 == LExtOp0.getOperand(1)) {
+ // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+ // (rotr x, y)
+ // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+ // (rotl x, (sub 32, y))
+ if (ConstantSDNode *SUBC =
dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0)))
- if (SUBC->getAPIntValue() == OpSizeInBits)
- return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT,
- LHSShiftArg,
- HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
- }
+ if (SUBC->getAPIntValue() == OpSizeInBits)
+ return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
+ HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
}
return 0;
@@ -3620,6 +3635,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
// fold (shl c1, c2) -> c1<<c2
if (N0C && N1C)
return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
@@ -3697,6 +3718,27 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
}
}
+ // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
+ // Only fold this if the inner zext has no other uses to avoid increasing
+ // the total number of instructions.
+ if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
+ uint64_t c1 =
+ cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
+ if (c1 < VT.getSizeInBits()) {
+ uint64_t c2 = N1C->getZExtValue();
+ if (c1 == c2) {
+ SDValue NewOp0 = N0.getOperand(0);
+ EVT CountVT = NewOp0.getOperand(1).getValueType();
+ SDValue NewSHL = DAG.getNode(ISD::SHL, SDLoc(N), NewOp0.getValueType(),
+ NewOp0, DAG.getConstant(c2, CountVT));
+ AddToWorkList(NewSHL.getNode());
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
+ }
+ }
+ }
+
// fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
// (and (srl x, (sub c1, c2), MASK)
// Only fold this if the inner shift has no other uses -- if it does, folding
@@ -3750,6 +3792,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
// fold (sra c1, c2) -> (sra c1, c2)
if (N0C && N1C)
return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
@@ -3895,6 +3943,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
EVT VT = N0.getValueType();
unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+ // fold vector ops
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVBinOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
// fold (srl c1, c2) -> c1 >>u c2
if (N0C && N1C)
return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
@@ -4217,6 +4271,23 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
return SDValue();
}
+static
+std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT LoVT, HiVT;
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // Split the inputs.
+ SDValue Lo, Hi, LL, LH, RL, RH;
+ llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+ llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+
+ return std::make_pair(Lo, Hi);
+}
+
SDValue DAGCombiner::visitVSELECT(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4254,6 +4325,34 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
}
}
+ // If the VSELECT result requires splitting and the mask is provided by a
+ // SETCC, then split both nodes and its operands before legalization. This
+ // prevents the type legalizer from unrolling SETCC into scalar comparisons
+ // and enables future optimizations (e.g. min/max pattern matching on X86).
+ if (N0.getOpcode() == ISD::SETCC) {
+ EVT VT = N->getValueType(0);
+
+ // Check if any splitting is required.
+ if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+ TargetLowering::TypeSplitVector)
+ return SDValue();
+
+ SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
+ llvm::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
+ llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
+ llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
+
+ // Add the new VSELECT nodes to the work list in case they need to be split
+ // again.
+ AddToWorkList(Lo.getNode());
+ AddToWorkList(Hi.getNode());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
return SDValue();
}
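
To make the new VSELECT splitting concrete, here is a hedged before/after
picture in the fold notation used elsewhere in this file, assuming a target
where the result type (say v8i32) is split into two v4i32 halves:

    // before the combine:
    //   (vselect (setcc a, b, cc), x, y)                    ; all v8i32
    // after the combine, ahead of type legalization:
    //   (concat_vectors
    //      (vselect (setcc a.lo, b.lo, cc), x.lo, y.lo),    ; v4i32 halves
    //      (vselect (setcc a.hi, b.hi, cc), x.hi, y.hi))
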
@@ -4469,10 +4568,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- N0.getValueType(),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
@@ -4493,10 +4590,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- MemVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), MemVT,
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(),
DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
@@ -4524,11 +4619,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
if (DoXform) {
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(),
LN0->getMemoryVT(),
- LN0->isVolatile(),
- LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getMemOperand());
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.sext(VT.getSizeInBits());
SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
@@ -4593,9 +4685,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(VT)))) {
return DAG.getSelect(SDLoc(N), VT,
DAG.getSetCC(SDLoc(N),
- getSetCCResultType(VT),
- N0.getOperand(0), N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+ getSetCCResultType(VT),
+ N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get()),
NegOne, DAG.getConstant(0, VT));
}
}
@@ -4762,10 +4854,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- N0.getValueType(),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
@@ -4795,11 +4885,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (DoXform) {
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(),
LN0->getMemoryVT(),
- LN0->isVolatile(),
- LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getMemOperand());
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
@@ -4826,10 +4913,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- MemVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), MemVT,
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(),
DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
@@ -4992,10 +5077,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- N0.getValueType(),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
@@ -5016,9 +5099,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
EVT MemVT = LN0->getMemoryVT();
SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(N),
VT, LN0->getChain(), LN0->getBasePtr(),
- LN0->getPointerInfo(), MemVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ MemVT, LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(),
DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
@@ -5250,12 +5331,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff),
LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), NewAlign);
+ LN0->isInvariant(), NewAlign, LN0->getTBAAInfo());
else
Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff),
ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
- NewAlign);
+ NewAlign, LN0->getTBAAInfo());
// Replace the old load's chain with the new load's chain.
WorkListRemover DeadNodes(*this);
@@ -5353,10 +5434,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- EVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), EVT,
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
AddToWorkList(ExtLoad.getNode());
@@ -5371,10 +5450,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- EVT,
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), EVT,
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -5657,7 +5734,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile() &&
- (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+ (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+ TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
unsigned Align = TLI.getDataLayout()->
getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
@@ -5667,7 +5745,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(),
LN0->getBasePtr(), LN0->getPointerInfo(),
LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), OrigAlign);
+ LN0->isInvariant(), OrigAlign,
+ LN0->getTBAAInfo());
AddToWorkList(N);
CombineTo(N0.getNode(),
DAG.getNode(ISD::BITCAST, SDLoc(N0),
@@ -6652,16 +6731,14 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
}
// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
- if (ISD::isNON_EXTLoad(N0.getNode()) && N0.hasOneUse() &&
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), LN0->getPointerInfo(),
- N0.getValueType(),
- LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->getAlignment());
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(),
DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
@@ -7451,13 +7528,16 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
LD->getValueType(0),
Chain, Ptr, LD->getPointerInfo(),
LD->getMemoryVT(),
- LD->isVolatile(), LD->isNonTemporal(), Align);
+ LD->isVolatile(), LD->isNonTemporal(), Align,
+ LD->getTBAAInfo());
return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
}
}
}
- if (CombinerAA) {
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
+ TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+ if (UseAA) {
// Walk up chain skipping non-aliasing memory nodes.
SDValue BetterChain = FindBetterChain(N, Chain);
@@ -7468,17 +7548,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
// Replace the chain to void dependency.
if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
- BetterChain, Ptr, LD->getPointerInfo(),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ BetterChain, Ptr, LD->getMemOperand());
} else {
ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
LD->getValueType(0),
- BetterChain, Ptr, LD->getPointerInfo(),
- LD->getMemoryVT(),
- LD->isVolatile(),
- LD->isNonTemporal(),
- LD->getAlignment());
+ BetterChain, Ptr, LD->getMemoryVT(),
+ LD->getMemOperand());
}
// Create token factor to keep old chain connected.
@@ -7498,9 +7573,562 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
return SDValue(N, 0);
+ // Try to slice up N to more direct loads if the slices are mapped to
+ // different register banks or pairing can take place.
+ if (SliceUpLoad(N))
+ return SDValue(N, 0);
+
return SDValue();
}
+namespace {
+/// \brief Helper structure used to slice a load into smaller loads.
+/// Basically a slice is obtained from the following sequence:
+/// Origin = load Ty1, Base
+/// Shift = srl Ty1 Origin, CstTy Amount
+/// Inst = trunc Shift to Ty2
+///
+/// Then, it will be rewritten into:
+/// Slice = load SliceTy, Base + SliceOffset
+/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
+///
+/// SliceTy is deduced from the number of bits that are actually used to
+/// build Inst.
+struct LoadedSlice {
+ /// \brief Helper structure used to compute the cost of a slice.
+ struct Cost {
+ /// Are we optimizing for code size.
+ bool ForCodeSize;
+ /// Various costs.
+ unsigned Loads;
+ unsigned Truncates;
+ unsigned CrossRegisterBanksCopies;
+ unsigned ZExts;
+ unsigned Shift;
+
+ Cost(bool ForCodeSize = false)
+ : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
+ CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
+
+ /// \brief Get the cost of one isolated slice.
+ Cost(const LoadedSlice &LS, bool ForCodeSize = false)
+ : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
+ CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
+ EVT TruncType = LS.Inst->getValueType(0);
+ EVT LoadedType = LS.getLoadedType();
+ if (TruncType != LoadedType &&
+ !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
+ ZExts = 1;
+ }
+
+ /// \brief Account for slicing gain in the current cost.
+ /// Slicing provides a few gains, such as removing a shift or a
+ /// truncate. This method allows growing the cost of the original
+ /// load with the gain from this slice.
+ void addSliceGain(const LoadedSlice &LS) {
+ // Each slice saves a truncate.
+ const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
+ if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
+ LS.Inst->getOperand(0).getValueType()))
+ ++Truncates;
+ // If there is a shift amount, this slice gets rid of it.
+ if (LS.Shift)
+ ++Shift;
+ // If this slice can merge a cross register bank copy, account for it.
+ if (LS.canMergeExpensiveCrossRegisterBankCopy())
+ ++CrossRegisterBanksCopies;
+ }
+
+ Cost &operator+=(const Cost &RHS) {
+ Loads += RHS.Loads;
+ Truncates += RHS.Truncates;
+ CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
+ ZExts += RHS.ZExts;
+ Shift += RHS.Shift;
+ return *this;
+ }
+
+ bool operator==(const Cost &RHS) const {
+ return Loads == RHS.Loads && Truncates == RHS.Truncates &&
+ CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
+ ZExts == RHS.ZExts && Shift == RHS.Shift;
+ }
+
+ bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
+
+ bool operator<(const Cost &RHS) const {
+ // Assume cross register banks copies are as expensive as loads.
+ // FIXME: Do we want some more target hooks?
+ unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
+ unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
+ // Unless we are optimizing for code size, consider the
+ // expensive operation first.
+ if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
+ return ExpensiveOpsLHS < ExpensiveOpsRHS;
+ return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
+ (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
+ }
+
+ bool operator>(const Cost &RHS) const { return RHS < *this; }
+
+ bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
+
+ bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
+ };
+ // The last instruction that represents the slice. This should be a
+ // truncate instruction.
+ SDNode *Inst;
+ // The original load instruction.
+ LoadSDNode *Origin;
+ // The right shift amount in bits from the original load.
+ unsigned Shift;
+ // The DAG from which Origin came.
+ // This is used to get some contextual information about legal types, etc.
+ SelectionDAG *DAG;
+
+ LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL,
+ unsigned Shift = 0, SelectionDAG *DAG = NULL)
+ : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
+
+ LoadedSlice(const LoadedSlice &LS)
+ : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {}
+
+ /// \brief Get the bits used in a chunk of bits \p BitWidth large.
+ /// \return Result is \p BitWidth bits wide and has used bits set to 1 and
+ /// unused bits set to 0.
+ APInt getUsedBits() const {
+ // Reproduce the trunc(lshr) sequence:
+ // - Start from the truncated value.
+ // - Zero extend to the desired bit width.
+ // - Shift left.
+ assert(Origin && "No original load to compare against.");
+ unsigned BitWidth = Origin->getValueSizeInBits(0);
+ assert(Inst && "This slice is not bound to an instruction");
+ assert(Inst->getValueSizeInBits(0) <= BitWidth &&
+ "Extracted slice is bigger than the whole type!");
+ APInt UsedBits(Inst->getValueSizeInBits(0), 0);
+ UsedBits.setAllBits();
+ UsedBits = UsedBits.zext(BitWidth);
+ UsedBits <<= Shift;
+ return UsedBits;
+ }
+
+ /// \brief Get the size of the slice to be loaded in bytes.
+ unsigned getLoadedSize() const {
+ unsigned SliceSize = getUsedBits().countPopulation();
+ assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
+ return SliceSize / 8;
+ }
+
+ /// \brief Get the type that will be loaded for this slice.
+ /// Note: This may not be the final type for the slice.
+ EVT getLoadedType() const {
+ assert(DAG && "Missing context");
+ LLVMContext &Ctxt = *DAG->getContext();
+ return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
+ }
+
+ /// \brief Get the alignment of the load used for this slice.
+ unsigned getAlignment() const {
+ unsigned Alignment = Origin->getAlignment();
+ unsigned Offset = getOffsetFromBase();
+ if (Offset != 0)
+ Alignment = MinAlign(Alignment, Alignment + Offset);
+ return Alignment;
+ }
+
+ /// \brief Check if this slice can be rewritten with legal operations.
+ bool isLegal() const {
+ // An invalid slice is not legal.
+ if (!Origin || !Inst || !DAG)
+ return false;
+
+ // Offsets are for indexed load only, we do not handle that.
+ if (Origin->getOffset().getOpcode() != ISD::UNDEF)
+ return false;
+
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+
+ // Check that the type is legal.
+ EVT SliceType = getLoadedType();
+ if (!TLI.isTypeLegal(SliceType))
+ return false;
+
+ // Check that the load is legal for this type.
+ if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
+ return false;
+
+ // Check that the offset can be computed.
+ // 1. Check its type.
+ EVT PtrType = Origin->getBasePtr().getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ return false;
+
+ // 2. Check that it fits in the immediate.
+ if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
+ return false;
+
+ // 3. Check that the computation is legal.
+ if (!TLI.isOperationLegal(ISD::ADD, PtrType))
+ return false;
+
+ // Check that the zext is legal if it needs one.
+ EVT TruncateType = Inst->getValueType(0);
+ if (TruncateType != SliceType &&
+ !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
+ return false;
+
+ return true;
+ }
+
+ /// \brief Get the offset in bytes of this slice in the original chunk of
+ /// bits.
+ /// \pre DAG != NULL.
+ uint64_t getOffsetFromBase() const {
+ assert(DAG && "Missing context.");
+ bool IsBigEndian =
+ DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
+ assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
+ uint64_t Offset = Shift / 8;
+ unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
+ assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
+ "The size of the original loaded type is not a multiple of a"
+ " byte.");
+ // If Offset is bigger than TySizeInBytes, it means we are loading all
+ // zeros. This should have been optimized before in the process.
+ assert(TySizeInBytes > Offset &&
+ "Invalid shift amount for given loaded size");
+ if (IsBigEndian)
+ Offset = TySizeInBytes - Offset - getLoadedSize();
+ return Offset;
+ }
+
+ /// \brief Generate the sequence of instructions to load the slice
+ /// represented by this object and redirect the uses of this slice to
+ /// this new sequence of instructions.
+ /// \pre this->Inst && this->Origin are valid Instructions and this
+ /// object passed the legal check: LoadedSlice::isLegal returned true.
+ /// \return The last instruction of the sequence used to load the slice.
+ SDValue loadSlice() const {
+ assert(Inst && Origin && "Unable to replace a non-existing slice.");
+ const SDValue &OldBaseAddr = Origin->getBasePtr();
+ SDValue BaseAddr = OldBaseAddr;
+ // Get the offset in that chunk of bytes w.r.t. the endianness.
+ int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
+ assert(Offset >= 0 && "Offset too big to fit in int64_t!");
+ if (Offset) {
+ // BaseAddr = BaseAddr + Offset.
+ EVT ArithType = BaseAddr.getValueType();
+ BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr,
+ DAG->getConstant(Offset, ArithType));
+ }
+
+ // Create the type of the loaded slice according to its size.
+ EVT SliceType = getLoadedType();
+
+ // Create the load for the slice.
+ SDValue LastInst = DAG->getLoad(
+ SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
+ Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
+ Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
+ // If the final type is not the same as the loaded type, this means that
+ // we have to pad with zero. Create a zero extend for that.
+ EVT FinalType = Inst->getValueType(0);
+ if (SliceType != FinalType)
+ LastInst =
+ DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
+ return LastInst;
+ }
+
+ /// \brief Check if this slice can be merged with an expensive cross register
+ /// bank copy. E.g.,
+ /// i = load i32
+ /// f = bitcast i32 i to float
+ bool canMergeExpensiveCrossRegisterBankCopy() const {
+ if (!Inst || !Inst->hasOneUse())
+ return false;
+ SDNode *Use = *Inst->use_begin();
+ if (Use->getOpcode() != ISD::BITCAST)
+ return false;
+ assert(DAG && "Missing context");
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+ EVT ResVT = Use->getValueType(0);
+ const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+ const TargetRegisterClass *ArgRC =
+ TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+ if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
+ return false;
+
+ // At this point, we know that we perform a cross-register-bank copy.
+ // Check if it is expensive.
+ const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
+    // Assume bitcasts are cheap unless the two register classes do not
+    // explicitly share a common subclass.
+ if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
+ return false;
+
+ // Check if it will be merged with the load.
+ // 1. Check the alignment constraint.
+ unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
+ ResVT.getTypeForEVT(*DAG->getContext()));
+
+ if (RequiredAlignment > getAlignment())
+ return false;
+
+ // 2. Check that the load is a legal operation for that type.
+ if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
+ return false;
+
+ // 3. Check that we do not have a zext in the way.
+ if (Inst->getValueType(0) != getLoadedType())
+ return false;
+
+ return true;
+ }
+};
+}
+
+/// \brief Sorts LoadedSlice according to their offset.
+struct LoadedSliceSorter {
+ bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) {
+ assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
+ return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
+ }
+};
+
+/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
+/// \p UsedBits looks like 0..0 1..1 0..0.
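+/// For example, 0x0000ff00 is dense (bits 8..15 are set), whereas 0x00ff00ff
+/// is not because of the hole between the two groups of set bits.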
+static bool areUsedBitsDense(const APInt &UsedBits) {
+ // If all the bits are one, this is dense!
+ if (UsedBits.isAllOnesValue())
+ return true;
+
+ // Get rid of the unused bits on the right.
+ APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
+ // Get rid of the unused bits on the left.
+ if (NarrowedUsedBits.countLeadingZeros())
+ NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
+ // Check that the chunk of bits is completely used.
+ return NarrowedUsedBits.isAllOnesValue();
+}
+
+/// \brief Check whether or not \p First and \p Second are next to each other
+/// in memory. This means that there is no hole between the bits loaded
+/// by \p First and the bits loaded by \p Second.
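+/// For example, an 8-bit slice covering bits [0, 8) and a 16-bit slice
+/// covering bits [8, 24) are next to each other: together they form a single
+/// dense region of used bits.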
+static bool areSlicesNextToEachOther(const LoadedSlice &First,
+ const LoadedSlice &Second) {
+ assert(First.Origin == Second.Origin && First.Origin &&
+ "Unable to match different memory origins.");
+ APInt UsedBits = First.getUsedBits();
+ assert((UsedBits & Second.getUsedBits()) == 0 &&
+ "Slices are not supposed to overlap.");
+ UsedBits |= Second.getUsedBits();
+ return areUsedBitsDense(UsedBits);
+}
+
+/// \brief Adjust the \p GlobalLSCost according to the target
+/// pairing capabilities and the layout of the slices.
+/// \pre \p GlobalLSCost should account for at least as many loads as
+/// there are slices in \p LoadedSlices.
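+/// For example, when the target reports paired-load support for the slice
+/// type via hasPairedLoad(), two adjacent slices of that type are accounted
+/// for as a single load in \p GlobalLSCost.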
+static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ LoadedSlice::Cost &GlobalLSCost) {
+ unsigned NumberOfSlices = LoadedSlices.size();
+  // If there are fewer than two elements, no pairing is possible.
+ if (NumberOfSlices < 2)
+ return;
+
+ // Sort the slices so that elements that are likely to be next to each
+ // other in memory are next to each other in the list.
+ std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter());
+ const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
+  // First (resp. Second) is the first (resp. second) potential candidate
+  // to be placed in a paired load.
+ const LoadedSlice *First = NULL;
+ const LoadedSlice *Second = NULL;
+ for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
+ // Set the beginning of the pair.
+ First = Second) {
+
+ Second = &LoadedSlices[CurrSlice];
+
+ // If First is NULL, it means we start a new pair.
+ // Get to the next slice.
+ if (!First)
+ continue;
+
+ EVT LoadedType = First->getLoadedType();
+
+ // If the types of the slices are different, we cannot pair them.
+ if (LoadedType != Second->getLoadedType())
+ continue;
+
+ // Check if the target supplies paired loads for this type.
+ unsigned RequiredAlignment = 0;
+ if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
+      // Move to the next pair; this type is hopeless.
+ Second = NULL;
+ continue;
+ }
+ // Check if we meet the alignment requirement.
+ if (RequiredAlignment > First->getAlignment())
+ continue;
+
+ // Check that both loads are next to each other in memory.
+ if (!areSlicesNextToEachOther(*First, *Second))
+ continue;
+
+ assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
+ --GlobalLSCost.Loads;
+ // Move to the next pair.
+ Second = NULL;
+ }
+}
+
+/// \brief Check the profitability of all involved LoadedSlices.
+/// Currently, slicing is considered profitable if there are exactly two
+/// involved slices (1) which are (2) next to each other in memory, and
+/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
+///
+/// Note: The order of the elements in \p LoadedSlices may be modified, but not
+/// the elements themselves.
+///
+/// FIXME: When the cost model will be mature enough, we can relax
+/// constraints (1) and (2).
+static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ const APInt &UsedBits, bool ForCodeSize) {
+ unsigned NumberOfSlices = LoadedSlices.size();
+ if (StressLoadSlicing)
+ return NumberOfSlices > 1;
+
+ // Check (1).
+ if (NumberOfSlices != 2)
+ return false;
+
+ // Check (2).
+ if (!areUsedBitsDense(UsedBits))
+ return false;
+
+ // Check (3).
+ LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
+ // The original code has one big load.
+ OrigCost.Loads = 1;
+ for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
+ const LoadedSlice &LS = LoadedSlices[CurrSlice];
+ // Accumulate the cost of all the slices.
+ LoadedSlice::Cost SliceCost(LS, ForCodeSize);
+ GlobalSlicingCost += SliceCost;
+
+ // Account as cost in the original configuration the gain obtained
+ // with the current slices.
+ OrigCost.addSliceGain(LS);
+ }
+
+ // If the target supports paired load, adjust the cost accordingly.
+ adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
+ return OrigCost > GlobalSlicingCost;
+}
+
+/// \brief If the given load, \p N, is used only by trunc or trunc(lshr)
+/// operations, split it into the various pieces being extracted.
+///
+/// This sort of thing is introduced by SROA.
+/// This slicing takes care not to insert overlapping loads.
+/// \pre \p N is a simple load (i.e., not an atomic or volatile load).
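+///
+/// Illustrative (hypothetical) pattern on a little-endian target:
+///   b  = load i32 %ptr
+///   t0 = trunc i32 b to i16                  ; slice at byte offset 0
+///   t1 = trunc i32 (lshr i32 b, 16) to i16   ; slice at byte offset 2
+/// becomes two independent i16 loads from %ptr and %ptr + 2, provided both
+/// i16 loads are legal and the slicing is deemed profitable.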
+bool DAGCombiner::SliceUpLoad(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
+ !LD->getValueType(0).isInteger())
+ return false;
+
+ // Keep track of already used bits to detect overlapping values.
+ // In that case, we will just abort the transformation.
+ APInt UsedBits(LD->getValueSizeInBits(0), 0);
+
+ SmallVector<LoadedSlice, 4> LoadedSlices;
+
+ // Check if this load is used as several smaller chunks of bits.
+ // Basically, look for uses in trunc or trunc(lshr) and record a new chain
+ // of computation for each trunc.
+ for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+ UI != UIEnd; ++UI) {
+ // Skip the uses of the chain.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ SDNode *User = *UI;
+ unsigned Shift = 0;
+
+ // Check if this is a trunc(lshr).
+ if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
+ isa<ConstantSDNode>(User->getOperand(1))) {
+ Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
+ User = *User->use_begin();
+ }
+
+    // At this point, User is a truncate iff we encountered trunc or
+    // trunc(lshr).
+ if (User->getOpcode() != ISD::TRUNCATE)
+ return false;
+
+    // The width of the type must be a power of 2 and at least 8 bits.
+    // Otherwise the load cannot be represented in LLVM IR.
+    // Moreover, if we shifted by an amount that is not a multiple of 8 bits,
+    // the slice would not start on a byte boundary. We do not support that.
+    unsigned Width = User->getValueSizeInBits(0);
+    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
+      return false;
+
+ // Build the slice for this chain of computations.
+ LoadedSlice LS(User, LD, Shift, &DAG);
+ APInt CurrentUsedBits = LS.getUsedBits();
+
+ // Check if this slice overlaps with another.
+ if ((CurrentUsedBits & UsedBits) != 0)
+ return false;
+ // Update the bits used globally.
+ UsedBits |= CurrentUsedBits;
+
+ // Check if the new slice would be legal.
+ if (!LS.isLegal())
+ return false;
+
+ // Record the slice.
+ LoadedSlices.push_back(LS);
+ }
+
+ // Abort slicing if it does not seem to be profitable.
+ if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
+ return false;
+
+ ++SlicedLoads;
+
+ // Rewrite each chain to use an independent load.
+ // By construction, each chain can be represented by a unique load.
+
+ // Prepare the argument for the new token factor for all the slices.
+ SmallVector<SDValue, 8> ArgChains;
+ for (SmallVectorImpl<LoadedSlice>::const_iterator
+ LSIt = LoadedSlices.begin(),
+ LSItEnd = LoadedSlices.end();
+ LSIt != LSItEnd; ++LSIt) {
+ SDValue SliceInst = LSIt->loadSlice();
+ CombineTo(LSIt->Inst, SliceInst, true);
+ if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
+ SliceInst = SliceInst.getOperand(0);
+ assert(SliceInst->getOpcode() == ISD::LOAD &&
+ "It takes more than a zext to get to the loaded slice!!");
+ ArgChains.push_back(SliceInst.getValue(1));
+ }
+
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
+ &ArgChains[0], ArgChains.size());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+ return true;
+}
+
/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
/// load is having specific bytes cleared out. If so, return the byte size
/// being masked out and the shift amount.
@@ -7735,7 +8363,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
LD->getChain(), NewPtr,
LD->getPointerInfo().getWithOffset(PtrOff),
LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), NewAlign);
+ LD->isInvariant(), NewAlign,
+ LD->getTBAAInfo());
SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
DAG.getConstant(NewImm, NewVT));
SDValue NewST = DAG.getStore(Chain, SDLoc(N),
@@ -7846,17 +8475,28 @@ struct BaseIndexOffset {
static BaseIndexOffset match(SDValue Ptr) {
bool IsIndexSignExt = false;
- // Just Base or possibly anything else.
+    // We can only pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD
+    // node, then it could be just the BASE or anything else we do not know
+    // how to handle. Just use Ptr as BASE and give up.
if (Ptr->getOpcode() != ISD::ADD)
return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
- // Base + offset.
+ // We know that we have at least an ADD instruction. Try to pattern match
+ // the simple case of BASE + OFFSET.
if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
return BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
IsIndexSignExt);
}
+ // Inside a loop the current BASE pointer is calculated using an ADD and a
+ // MUL instruction. In this case Ptr is the actual BASE pointer.
+ // (i64 add (i64 %array_ptr)
+ // (i64 mul (i64 %induction_var)
+ // (i64 %element_size)))
+ if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
+ return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
// Look at Base + Index + Offset cases.
SDValue Base = Ptr->getOperand(0);
SDValue IndexOffset = Ptr->getOperand(1);
@@ -8007,6 +8647,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
Index = STn;
break;
} else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
+ if (Ldn->isVolatile()) {
+ Index = NULL;
+ break;
+ }
+
// Save the load node for later. Continue the scan.
AliasLoadNodes.push_back(Ldn);
NextInChain = Ldn->getChain().getNode();
@@ -8384,7 +9029,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
Ptr, ST->getPointerInfo(), ST->isVolatile(),
- ST->isNonTemporal(), OrigAlign);
+ ST->isNonTemporal(), OrigAlign,
+ ST->getTBAAInfo());
}
// Turn 'store undef, Ptr' -> nothing.
@@ -8399,7 +9045,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// transform should not be done in this case.
if (Value.getOpcode() != ISD::TargetConstantFP) {
SDValue Tmp;
- switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (CFP->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unknown FP type");
case MVT::f16: // We don't do this for these yet.
case MVT::f80:
@@ -8412,8 +9058,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
bitcastToAPInt().getZExtValue(), MVT::i32);
return DAG.getStore(Chain, SDLoc(N), Tmp,
- Ptr, ST->getPointerInfo(), ST->isVolatile(),
- ST->isNonTemporal(), ST->getAlignment());
+ Ptr, ST->getMemOperand());
}
break;
case MVT::f64:
@@ -8423,8 +9068,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
getZExtValue(), MVT::i64);
return DAG.getStore(Chain, SDLoc(N), Tmp,
- Ptr, ST->getPointerInfo(), ST->isVolatile(),
- ST->isNonTemporal(), ST->getAlignment());
+ Ptr, ST->getMemOperand());
}
if (!ST->isVolatile() &&
@@ -8440,18 +9084,19 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
unsigned Alignment = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
+ const MDNode *TBAAInfo = ST->getTBAAInfo();
SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo,
Ptr, ST->getPointerInfo(),
isVolatile, isNonTemporal,
- ST->getAlignment());
+ ST->getAlignment(), TBAAInfo);
Ptr = DAG.getNode(ISD::ADD, SDLoc(N), Ptr.getValueType(), Ptr,
DAG.getConstant(4, Ptr.getValueType()));
Alignment = MinAlign(Alignment, 4U);
SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi,
Ptr, ST->getPointerInfo().getWithOffset(4),
isVolatile, isNonTemporal,
- Alignment);
+ Alignment, TBAAInfo);
return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other,
St0, St1);
}
@@ -8467,7 +9112,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (Align > ST->getAlignment())
return DAG.getTruncStore(Chain, SDLoc(N), Value,
Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
- ST->isVolatile(), ST->isNonTemporal(), Align);
+ ST->isVolatile(), ST->isNonTemporal(), Align,
+ ST->getTBAAInfo());
}
}
@@ -8477,7 +9123,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (NewST.getNode())
return NewST;
- if (CombinerAA) {
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
+ TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+ if (UseAA) {
// Walk up chain skipping non-aliasing memory nodes.
SDValue BetterChain = FindBetterChain(N, Chain);
@@ -8488,14 +9136,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// Replace the chain to avoid dependency.
if (ST->isTruncatingStore()) {
ReplStore = DAG.getTruncStore(BetterChain, SDLoc(N), Value, Ptr,
- ST->getPointerInfo(),
- ST->getMemoryVT(), ST->isVolatile(),
- ST->isNonTemporal(), ST->getAlignment());
+ ST->getMemoryVT(), ST->getMemOperand());
} else {
ReplStore = DAG.getStore(BetterChain, SDLoc(N), Value, Ptr,
- ST->getPointerInfo(),
- ST->isVolatile(), ST->isNonTemporal(),
- ST->getAlignment());
+ ST->getMemOperand());
}
// Create token to keep both nodes around.
@@ -8528,9 +9172,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
AddToWorkList(Value.getNode());
if (Shorter.getNode())
return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
- Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
- ST->isVolatile(), ST->isNonTemporal(),
- ST->getAlignment());
+ Ptr, ST->getMemoryVT(), ST->getMemOperand());
// Otherwise, see if we can simplify the operation with
// SimplifyDemandedBits, which only works if the value has a single use.
@@ -8561,9 +9203,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
ST->getMemoryVT())) {
return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
- Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
- ST->isVolatile(), ST->isNonTemporal(),
- ST->getAlignment());
+ Ptr, ST->getMemoryVT(), ST->getMemOperand());
}
// Only perform this optimization before the types are legal, because we
@@ -8821,13 +9461,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
? ISD::ZEXTLOAD : ISD::EXTLOAD;
Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(),
NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff),
- LVT, LN0->isVolatile(), LN0->isNonTemporal(),Align);
+ LVT, LN0->isVolatile(), LN0->isNonTemporal(),
+ Align, LN0->getTBAAInfo());
Chain = Load.getValue(1);
} else {
Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff),
LN0->isVolatile(), LN0->isNonTemporal(),
- LN0->isInvariant(), Align);
+ LN0->isInvariant(), Align, LN0->getTBAAInfo());
Chain = Load.getValue(1);
if (NVT.bitsLT(LVT))
Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load);
@@ -9165,8 +9806,35 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return N->getOperand(0);
// Check if all of the operands are undefs.
+ EVT VT = N->getValueType(0);
if (ISD::allOperandsUndef(N))
- return DAG.getUNDEF(N->getValueType(0));
+ return DAG.getUNDEF(VT);
+
+ // Optimize concat_vectors where one of the vectors is undef.
+ if (N->getNumOperands() == 2 &&
+ N->getOperand(1)->getOpcode() == ISD::UNDEF) {
+ SDValue In = N->getOperand(0);
+ assert(In.getValueType().isVector() && "Must concat vectors");
+
+ // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
+ if (In->getOpcode() == ISD::BITCAST &&
+ !In->getOperand(0)->getValueType(0).isVector()) {
+ SDValue Scalar = In->getOperand(0);
+ EVT SclTy = Scalar->getValueType(0);
+
+ if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
+ return SDValue();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
+ VT.getSizeInBits() / SclTy.getSizeInBits());
+ if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
+ return SDValue();
+
+ SDLoc dl = SDLoc(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NVT, Scalar);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Res);
+ }
+ }
// Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
// nodes often generate nop CONCAT_VECTOR nodes.
@@ -9225,7 +9893,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
// (extract_subvec (concat V1, V2, ...), i)
// Into:
// Vi if possible
- // Only operand 0 is checked as 'concat' assumes all inputs of the same type.
+ // Only operand 0 is checked as 'concat' assumes all inputs of the same
+ // type.
if (V->getOperand(0).getValueType() != NVT)
return SDValue();
unsigned Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -9358,10 +10027,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
for (unsigned i = 0; i != NumElts; ++i) {
int Idx = SVN->getMaskElt(i);
if (Idx >= 0) {
- if (Idx < (int)NumElts)
- Idx += NumElts;
- else
+ if (Idx >= (int)NumElts)
Idx -= NumElts;
+ else
+ Idx = -1; // remove reference to lhs
}
NewMask.push_back(Idx);
}
@@ -9738,7 +10407,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
Load = DAG.getLoad(TheSelect->getValueType(0),
SDLoc(TheSelect),
- // FIXME: Discards pointer info.
+ // FIXME: Discards pointer and TBAA info.
LLD->getChain(), Addr, MachinePointerInfo(),
LLD->isVolatile(), LLD->isNonTemporal(),
LLD->isInvariant(), LLD->getAlignment());
@@ -9747,7 +10416,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
RLD->getExtensionType() : LLD->getExtensionType(),
SDLoc(TheSelect),
TheSelect->getValueType(0),
- // FIXME: Discards pointer info.
+ // FIXME: Discards pointer and TBAA info.
LLD->getChain(), Addr, MachinePointerInfo(),
LLD->getMemoryVT(), LLD->isVolatile(),
LLD->isNonTemporal(), LLD->getAlignment());
@@ -9852,7 +10521,7 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
Cond, One, Zero);
AddToWorkList(CstOffset.getNode());
- CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+ CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
CstOffset);
AddToWorkList(CPIdx.getNode());
return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
@@ -9974,9 +10643,10 @@ SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
return Temp;
// shl setcc result by log2 n2c
- return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
- DAG.getConstant(N2C->getAPIntValue().logBase2(),
- getShiftAmountTy(Temp.getValueType())));
+ return DAG.getNode(
+ ISD::SHL, DL, N2.getValueType(), Temp,
+ DAG.getConstant(N2C->getAPIntValue().logBase2(),
+ getShiftAmountTy(Temp.getValueType())));
}
}
@@ -10132,17 +10802,20 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
/// isAlias - Return true if there is any possibility that the two addresses
/// overlap.
-bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
+bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1,
const Value *SrcValue1, int SrcValueOffset1,
unsigned SrcValueAlign1,
const MDNode *TBAAInfo1,
- SDValue Ptr2, int64_t Size2,
+ SDValue Ptr2, int64_t Size2, bool IsVolatile2,
const Value *SrcValue2, int SrcValueOffset2,
unsigned SrcValueAlign2,
const MDNode *TBAAInfo2) const {
// If they are the same then they must be aliases.
if (Ptr1 == Ptr2) return true;
+ // If they are both volatile then they cannot be reordered.
+ if (IsVolatile1 && IsVolatile2) return true;
+
// Gather base node and offset information.
SDValue Base1, Base2;
int64_t Offset1, Offset2;
@@ -10187,7 +10860,9 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
return false;
}
- if (CombinerGlobalAA) {
+ bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA :
+ TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+ if (UseAA && SrcValue1 && SrcValue2) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2);
int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset;
@@ -10206,24 +10881,25 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) {
SDValue Ptr0, Ptr1;
int64_t Size0, Size1;
+ bool IsVolatile0, IsVolatile1;
const Value *SrcValue0, *SrcValue1;
int SrcValueOffset0, SrcValueOffset1;
unsigned SrcValueAlign0, SrcValueAlign1;
const MDNode *SrcTBAAInfo0, *SrcTBAAInfo1;
- FindAliasInfo(Op0, Ptr0, Size0, SrcValue0, SrcValueOffset0,
+ FindAliasInfo(Op0, Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0,
SrcValueAlign0, SrcTBAAInfo0);
- FindAliasInfo(Op1, Ptr1, Size1, SrcValue1, SrcValueOffset1,
+ FindAliasInfo(Op1, Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1,
SrcValueAlign1, SrcTBAAInfo1);
- return isAlias(Ptr0, Size0, SrcValue0, SrcValueOffset0,
+ return isAlias(Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0,
SrcValueAlign0, SrcTBAAInfo0,
- Ptr1, Size1, SrcValue1, SrcValueOffset1,
+ Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1,
SrcValueAlign1, SrcTBAAInfo1);
}
/// FindAliasInfo - Extracts the relevant alias information from the memory
-/// node. Returns true if the operand was a load.
+/// node. Returns true if the operand was a nonvolatile load.
bool DAGCombiner::FindAliasInfo(SDNode *N,
- SDValue &Ptr, int64_t &Size,
+ SDValue &Ptr, int64_t &Size, bool &IsVolatile,
const Value *&SrcValue,
int &SrcValueOffset,
unsigned &SrcValueAlign,
@@ -10232,11 +10908,12 @@ bool DAGCombiner::FindAliasInfo(SDNode *N,
Ptr = LS->getBasePtr();
Size = LS->getMemoryVT().getSizeInBits() >> 3;
+ IsVolatile = LS->isVolatile();
SrcValue = LS->getSrcValue();
SrcValueOffset = LS->getSrcValueOffset();
SrcValueAlign = LS->getOriginalAlignment();
TBAAInfo = LS->getTBAAInfo();
- return isa<LoadSDNode>(LS);
+ return isa<LoadSDNode>(LS) && !IsVolatile;
}
/// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
@@ -10249,12 +10926,13 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
// Get alias information for node.
SDValue Ptr;
int64_t Size;
+ bool IsVolatile;
const Value *SrcValue;
int SrcValueOffset;
unsigned SrcValueAlign;
const MDNode *SrcTBAAInfo;
- bool IsLoad = FindAliasInfo(N, Ptr, Size, SrcValue, SrcValueOffset,
- SrcValueAlign, SrcTBAAInfo);
+ bool IsLoad = FindAliasInfo(N, Ptr, Size, IsVolatile, SrcValue,
+ SrcValueOffset, SrcValueAlign, SrcTBAAInfo);
// Starting off.
Chains.push_back(OriginalChain);
@@ -10295,20 +10973,21 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
// Get alias information for Chain.
SDValue OpPtr;
int64_t OpSize;
+ bool OpIsVolatile;
const Value *OpSrcValue;
int OpSrcValueOffset;
unsigned OpSrcValueAlign;
const MDNode *OpSrcTBAAInfo;
bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize,
- OpSrcValue, OpSrcValueOffset,
+ OpIsVolatile, OpSrcValue, OpSrcValueOffset,
OpSrcValueAlign,
OpSrcTBAAInfo);
// If chain is alias then stop here.
if (!(IsLoad && IsOpLoad) &&
- isAlias(Ptr, Size, SrcValue, SrcValueOffset, SrcValueAlign,
- SrcTBAAInfo,
- OpPtr, OpSize, OpSrcValue, OpSrcValueOffset,
+ isAlias(Ptr, Size, IsVolatile, SrcValue, SrcValueOffset,
+ SrcValueAlign, SrcTBAAInfo,
+ OpPtr, OpSize, OpIsVolatile, OpSrcValue, OpSrcValueOffset,
OpSrcValueAlign, OpSrcTBAAInfo)) {
Aliases.push_back(Chain);
} else {
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index b4ac948f..a6f7461 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -638,29 +638,25 @@ bool FastISel::SelectCall(const User *I) {
(!isa<AllocaInst>(Address) ||
!FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address))))
Op = MachineOperand::CreateReg(FuncInfo.InitializeRegForValue(Address),
- false);
+ false);
- if (Op)
+ if (Op) {
if (Op->isReg()) {
- // Set the indirect flag if the type and the DIVariable's
- // indirect field are in disagreement: Indirectly-addressed
- // variables that are nonpointer types should be marked as
- // indirect, and VLAs should be marked as indirect eventhough
- // they are a pointer type.
- bool IsIndirect = DI->getAddress()->getType()->isPointerTy()
- ^ DIVar.isIndirect();
Op->setIsDebug(true);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(TargetOpcode::DBG_VALUE),
- IsIndirect, Op->getReg(), Offset, DI->getVariable());
+ TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0,
+ DI->getVariable());
} else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(TargetOpcode::DBG_VALUE)).addOperand(*Op).addImm(0)
- .addMetadata(DI->getVariable());
- else
+ TII.get(TargetOpcode::DBG_VALUE))
+ .addOperand(*Op)
+ .addImm(0)
+ .addMetadata(DI->getVariable());
+ } else {
// We can't yet handle anything else here because it would require
// generating code, thus altering codegen because of debug info.
DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ }
return true;
}
case Intrinsic::dbg_value: {
@@ -688,6 +684,7 @@ bool FastISel::SelectCall(const User *I) {
.addFPImm(CF).addImm(DI->getOffset())
.addMetadata(DI->getVariable());
} else if (unsigned Reg = lookUpRegForValue(V)) {
+ // FIXME: This does not handle register-indirect values at offset 0.
bool IsIndirect = DI->getOffset() != 0;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, IsIndirect,
Reg, DI->getOffset(), DI->getVariable());
@@ -1574,4 +1571,19 @@ bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
return tryToFoldLoadIntoMI(User, RI.getOperandNo(), LI);
}
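+/// canFoldAddIntoGEP - Check whether \p Add, an add of a constant, can be
+/// folded into the addressing computation of \p GEP: the two types must have
+/// the same size and, if \p Add is an instruction, it must live in the
+/// current basic block.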
+bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) {
+ // Must be an add.
+ if (!isa<AddOperator>(Add))
+ return false;
+ // Type size needs to match.
+ if (TD.getTypeSizeInBits(GEP->getType()) !=
+ TD.getTypeSizeInBits(Add->getType()))
+ return false;
+ // Must be in the same basic block.
+ if (isa<Instruction>(Add) &&
+ FuncInfo.MBBMap[cast<Instruction>(Add)->getParent()] != FuncInfo.MBB)
+ return false;
+ // Must have a constant operand.
+ return isa<ConstantInt>(cast<AddOperator>(Add)->getOperand(1));
+}
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index e107276..3a8fb85 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -211,6 +212,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
"IMPLICIT_DEF should have been handled as a special case elsewhere!");
+ unsigned NumResults = CountResults(Node);
for (unsigned i = 0; i < II.getNumDefs(); ++i) {
// If the specific node value is only used by a CopyToReg and the dest reg
// is a vreg in the same register class, use the CopyToReg'd destination
@@ -218,6 +220,10 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
unsigned VRBase = 0;
const TargetRegisterClass *RC =
TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
+ // If the register class is unknown for the given definition, then try to
+ // infer one from the value type.
+ if (!RC && i < NumResults)
+ RC = TLI->getRegClassFor(Node->getSimpleValueType(i));
if (II.OpInfo[i].isOptionalDef()) {
// Optional def must be a physical register.
unsigned NumResults = CountResults(Node);
@@ -722,10 +728,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
const MCInstrDesc &II = TII->get(Opc);
unsigned NumResults = CountResults(Node);
+ unsigned NumDefs = II.getNumDefs();
+ const uint16_t *ScratchRegs = NULL;
+
+ // Handle PATCHPOINT specially and then use the generic code.
+ if (Opc == TargetOpcode::PATCHPOINT) {
+ unsigned CC = Node->getConstantOperandVal(PatchPointOpers::CCPos);
+ NumDefs = NumResults;
+ ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+ }
+
unsigned NumImpUses = 0;
unsigned NodeOperands =
- countOperands(Node, II.getNumOperands() - II.getNumDefs(), NumImpUses);
- bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
+ countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
+ bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=0;
#ifndef NDEBUG
unsigned NumMIOperands = NodeOperands + NumResults;
if (II.isVariadic())
@@ -748,14 +764,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
// Emit all of the actual operands of this instruction, adding them to the
// instruction as appropriate.
- bool HasOptPRefs = II.getNumDefs() > NumResults;
+ bool HasOptPRefs = NumDefs > NumResults;
assert((!HasOptPRefs || !HasPhysRegOuts) &&
"Unable to cope with optional defs and phys regs defs!");
- unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0;
+ unsigned NumSkip = HasOptPRefs ? NumDefs - NumResults : 0;
for (unsigned i = NumSkip; i != NodeOperands; ++i)
- AddOperand(MIB, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II,
+ AddOperand(MIB, Node->getOperand(i), i-NumSkip+NumDefs, &II,
VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
+ // Add scratch registers as implicit def and early clobber
+ if (ScratchRegs)
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ MIB.addReg(ScratchRegs[i], RegState::ImplicitDefine |
+ RegState::EarlyClobber);
+
// Transfer all of the memory reference descriptions of this instruction.
MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
cast<MachineSDNode>(Node)->memoperands_end());
@@ -784,8 +806,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
// Additional results must be physical register defs.
if (HasPhysRegOuts) {
- for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
- unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
+ for (unsigned i = NumDefs; i < NumResults; ++i) {
+ unsigned Reg = II.getImplicitDefs()[i - NumDefs];
if (!Node->hasAnyUseOfValue(i))
continue;
// This implicitly defined physreg has a use.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bd844e5..9061ae9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -95,8 +95,8 @@ private:
SDValue N1, SDValue N2,
ArrayRef<int> Mask) const;
- void LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
- SDLoc dl);
+ bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+ bool &NeedInvert, SDLoc dl);
SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
@@ -311,6 +311,8 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
int Alignment = ST->getAlignment();
+ unsigned AS = ST->getAddressSpace();
+
SDLoc dl(ST);
if (ST->getMemoryVT().isFloatingPoint() ||
ST->getMemoryVT().isVector()) {
@@ -343,7 +345,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
SDValue Store = DAG.getTruncStore(Chain, dl,
Val, StackPtr, MachinePointerInfo(),
StoredVT, false, false, 0);
- SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+ SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy(AS));
SmallVector<SDValue, 8> Stores;
unsigned Offset = 0;
@@ -381,7 +383,8 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
.getWithOffset(Offset),
MemVT, ST->isVolatile(),
ST->isNonTemporal(),
- MinAlign(ST->getAlignment(), Offset)));
+ MinAlign(ST->getAlignment(), Offset),
+ ST->getTBAAInfo()));
// The order of the stores doesn't matter - say it with a TokenFactor.
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
@@ -408,13 +411,14 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
ST->getPointerInfo(), NewStoredVT,
ST->isVolatile(), ST->isNonTemporal(), Alignment);
+
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+ DAG.getConstant(IncrementSize, TLI.getPointerTy(AS)));
Alignment = MinAlign(Alignment, IncrementSize);
Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
ST->getPointerInfo().getWithOffset(IncrementSize),
NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
- Alignment);
+ Alignment, ST->getTBAAInfo());
SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
@@ -438,10 +442,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) {
// Expand to a (misaligned) integer load of the same size,
// then bitconvert to floating point or vector.
- SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getPointerInfo(),
- LD->isVolatile(),
- LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
+ LD->getMemOperand());
SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
if (LoadedVT != VT)
Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
@@ -474,7 +476,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
LD->getPointerInfo().getWithOffset(Offset),
LD->isVolatile(), LD->isNonTemporal(),
LD->isInvariant(),
- MinAlign(LD->getAlignment(), Offset));
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getTBAAInfo());
// Follow the load with a store to the stack slot. Remember the store.
Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
MachinePointerInfo(), false, false, 0));
@@ -492,7 +495,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
LD->getPointerInfo().getWithOffset(Offset),
MemVT, LD->isVolatile(),
LD->isNonTemporal(),
- MinAlign(LD->getAlignment(), Offset));
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getTBAAInfo());
// Follow the load with a store to the stack slot. Remember the store.
// On big-endian machines this requires a truncating store to ensure
// that the bits end up in the right place.
@@ -536,23 +540,25 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
if (TLI.isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), Alignment);
+ LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+ LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
+ LD->getTBAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), Alignment);
+ LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
NewLoadedVT, LD->isVolatile(),
- LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+ LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
+ LD->getTBAAInfo());
}
// aggregate the two parts
@@ -655,6 +661,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
unsigned Alignment = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
+ const MDNode *TBAAInfo = ST->getTBAAInfo();
SDLoc dl(ST);
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
if (CFP->getValueType(0) == MVT::f32 &&
@@ -663,7 +670,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
bitcastToAPInt().zextOrTrunc(32),
MVT::i32);
return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
}
if (CFP->getValueType(0) == MVT::f64) {
@@ -672,7 +679,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
zextOrTrunc(64), MVT::i64);
return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
}
if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
@@ -685,12 +692,13 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
if (TLI.isBigEndian()) std::swap(Lo, Hi);
Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
- isNonTemporal, Alignment);
+ isNonTemporal, Alignment, TBAAInfo);
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(4));
+ DAG.getConstant(4, Ptr.getValueType()));
Hi = DAG.getStore(Chain, dl, Hi, Ptr,
ST->getPointerInfo().getWithOffset(4),
- isVolatile, isNonTemporal, MinAlign(Alignment, 4U));
+ isVolatile, isNonTemporal, MinAlign(Alignment, 4U),
+ TBAAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -708,6 +716,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
unsigned Alignment = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
+ const MDNode *TBAAInfo = ST->getTBAAInfo();
if (!ST->isTruncatingStore()) {
if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
@@ -745,7 +754,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
SDValue Result =
DAG.getStore(Chain, dl, Value, Ptr,
ST->getPointerInfo(), isVolatile,
- isNonTemporal, Alignment);
+ isNonTemporal, Alignment, TBAAInfo);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -767,7 +776,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
Value = DAG.getZeroExtendInReg(Value, dl, StVT);
SDValue Result =
DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- NVT, isVolatile, isNonTemporal, Alignment);
+ NVT, isVolatile, isNonTemporal, Alignment,
+ TBAAInfo);
ReplaceNode(SDValue(Node, 0), Result);
} else if (StWidth & (StWidth - 1)) {
// If not storing a power-of-2 number of bits, expand as two stores.
@@ -788,19 +798,20 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// Store the bottom RoundWidth bits.
Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
RoundVT,
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment,
+ TBAAInfo);
// Store the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
DAG.getConstant(RoundWidth,
TLI.getShiftAmountTy(Value.getValueType())));
Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
ST->getPointerInfo().getWithOffset(IncrementSize),
ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
} else {
// Big endian - avoid unaligned stores.
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
@@ -809,16 +820,17 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
DAG.getConstant(ExtraWidth,
TLI.getShiftAmountTy(Value.getValueType())));
Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
- RoundVT, isVolatile, isNonTemporal, Alignment);
+ RoundVT, isVolatile, isNonTemporal, Alignment,
+ TBAAInfo);
// Store the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
ST->getPointerInfo().getWithOffset(IncrementSize),
ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
}
// The order of the stores doesn't matter.
@@ -854,7 +866,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
SDValue Result =
DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -902,9 +914,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
"Can only promote loads to same size type");
- SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
RChain = Res.getValue(1);
break;
@@ -924,6 +934,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
unsigned Alignment = LD->getAlignment();
bool isVolatile = LD->isVolatile();
bool isNonTemporal = LD->isNonTemporal();
+ const MDNode *TBAAInfo = LD->getTBAAInfo();
if (SrcWidth != SrcVT.getStoreSizeInBits() &&
// Some targets pretend to have an i1 loading operation, and actually
@@ -950,7 +961,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
SDValue Result =
DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
Chain, Ptr, LD->getPointerInfo(),
- NVT, isVolatile, isNonTemporal, Alignment);
+ NVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
Ch = Result.getValue(1); // The chain.
@@ -987,16 +998,16 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
Chain, Ptr,
LD->getPointerInfo(), RoundVT, isVolatile,
- isNonTemporal, Alignment);
+ isNonTemporal, Alignment, TBAAInfo);
// Load the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
// Build a factor node to remember that this load is independent of
// the other one.
@@ -1016,17 +1027,17 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
// Load the top RoundWidth bits.
Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo(), RoundVT, isVolatile,
- isNonTemporal, Alignment);
+ isNonTemporal, Alignment, TBAAInfo);
// Load the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
ExtraVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
// Build a factor node to remember that this load is independent of
// the other one.
@@ -1079,9 +1090,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
case TargetLowering::Expand:
if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) {
SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
- LD->getPointerInfo(),
- LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ LD->getMemOperand());
unsigned ExtendOp;
switch (ExtType) {
case ISD::EXTLOAD:
@@ -1109,9 +1118,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
// Turn the unsupported load into an EXTLOAD followed by an explicit
// zero/sign extend inreg.
SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
- Chain, Ptr, LD->getPointerInfo(), SrcVT,
- LD->isVolatile(), LD->isNonTemporal(),
- LD->getAlignment());
+ Chain, Ptr, SrcVT,
+ LD->getMemOperand());
SDValue ValRes;
if (ExtType == ISD::SEXTLOAD)
ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
@@ -1386,11 +1394,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
DAG.getConstant(EltSize, Idx.getValueType()));
- if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
- Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
- else
- Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
-
+ Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
if (Op.getValueType().isVector())
@@ -1428,11 +1432,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
DAG.getConstant(EltSize, Idx.getValueType()));
-
- if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
- Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
- else
- Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+ Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
StackPtr);
@@ -1531,7 +1531,8 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits();
unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8;
LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(),
- LoadPtr, DAG.getIntPtrConstant(ByteOffset));
+ LoadPtr,
+ DAG.getConstant(ByteOffset, LoadPtr.getValueType()));
// Load a legal integer containing the sign bit.
SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, MachinePointerInfo(),
false, false, false, 0);
@@ -1580,10 +1581,10 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
Chain = SP.getValue(1);
unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
- if (Align > StackAlign)
- SP = DAG.getNode(ISD::AND, dl, VT, SP,
- DAG.getConstant(-(uint64_t)Align, VT));
Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Align, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
@@ -1595,22 +1596,44 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
}
/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
-/// condition code CC on the current target. This routine expands SETCC with
-/// illegal condition code into AND / OR of multiple SETCC values.
-void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
+/// condition code CC on the current target.
+///
+/// If the SETCC has been legalized using AND / OR, then the legalized node
+/// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
+/// will be set to false.
+///
+/// If the SETCC has been legalized by using getSetCCSwappedOperands(),
+/// then the values of LHS and RHS will be swapped, CC will be set to the
+/// new condition, and NeedInvert will be set to false.
+///
+/// If the SETCC has been legalized using the inverse condcode, then LHS and
+/// RHS will be unchanged, CC will be set to the inverted condcode, and NeedInvert
+/// will be set to true. The caller must invert the result of the SETCC with
+/// SelectionDAG::getNOT() or take equivalent action to swap the effect of a
+/// true/false result.
+///
+/// \returns true if the SetCC has been legalized, false if it hasn't.
+bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
SDValue &LHS, SDValue &RHS,
SDValue &CC,
+ bool &NeedInvert,
SDLoc dl) {
MVT OpVT = LHS.getSimpleValueType();
ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+ NeedInvert = false;
switch (TLI.getCondCodeAction(CCCode, OpVT)) {
default: llvm_unreachable("Unknown condition code action!");
case TargetLowering::Legal:
// Nothing to do.
break;
case TargetLowering::Expand: {
+ ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
+ if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(InvCC);
+ return true;
+ }
ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
- ISD::CondCode InvCC = ISD::SETCC_INVALID;
unsigned Opc = 0;
switch (CCCode) {
default: llvm_unreachable("Don't know how to expand this condition!");
@@ -1650,18 +1673,21 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
case ISD::SETGT:
case ISD::SETGE:
case ISD::SETLT:
+      // We only support using the inverted operation, which is computed above;
+      // we do not support expanding these cases in any other way.
+ llvm_unreachable("Don't know how to expand this condition!");
case ISD::SETNE:
case ISD::SETEQ:
- InvCC = ISD::getSetCCSwappedOperands(CCCode);
- if (TLI.getCondCodeAction(InvCC, OpVT) == TargetLowering::Expand) {
- // We only support using the inverted operation and not a
- // different manner of supporting expanding these cases.
- llvm_unreachable("Don't know how to expand this condition!");
+ // Try inverting the result of the inverse condition.
+ InvCC = CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
+ if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+ CC = DAG.getCondCode(InvCC);
+ NeedInvert = true;
+ return true;
}
- LHS = DAG.getSetCC(dl, VT, RHS, LHS, InvCC);
- RHS = SDValue();
- CC = SDValue();
- return;
+ // If inverting the condition didn't work then we have no means to expand
+ // the condition.
+ llvm_unreachable("Don't know how to expand this condition!");
}
SDValue SetCC1, SetCC2;
@@ -1678,9 +1704,10 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
RHS = SDValue();
CC = SDValue();
- break;
+ return true;
}
}
+ return false;
}
/// EmitStackConvert - Emit a store/load combination to the stack. This stores
@@ -1969,7 +1996,7 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
RTLIB::Libcall Call_F128,
RTLIB::Libcall Call_PPCF128) {
RTLIB::Libcall LC;
- switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::f32: LC = Call_F32; break;
case MVT::f64: LC = Call_F64; break;
@@ -1987,7 +2014,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
RTLIB::Libcall Call_I64,
RTLIB::Libcall Call_I128) {
RTLIB::Libcall LC;
- switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::i8: LC = Call_I8; break;
case MVT::i16: LC = Call_I16; break;
@@ -2002,7 +2029,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
const TargetLowering &TLI) {
RTLIB::Libcall LC;
- switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2049,7 +2076,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
bool isSigned = Opcode == ISD::SDIVREM;
RTLIB::Libcall LC;
- switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2106,7 +2133,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
/// isSinCosLibcallAvailable - Return true if sincos libcall is available.
static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
RTLIB::Libcall LC;
- switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::f32: LC = RTLIB::SINCOS_F32; break;
case MVT::f64: LC = RTLIB::SINCOS_F64; break;
@@ -2156,7 +2183,7 @@ void
SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
RTLIB::Libcall LC;
- switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+ switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
case MVT::f32: LC = RTLIB::SINCOS_F32; break;
case MVT::f64: LC = RTLIB::SINCOS_F64; break;
@@ -2232,11 +2259,11 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
// word offset constant for Hi/Lo address computation
- SDValue WordOff = DAG.getConstant(sizeof(int), TLI.getPointerTy());
+ SDValue WordOff = DAG.getConstant(sizeof(int), StackSlot.getValueType());
// set up Hi and Lo (into buffer) address based on endian
SDValue Hi = StackSlot;
- SDValue Lo = DAG.getNode(ISD::ADD, dl,
- TLI.getPointerTy(), StackSlot, WordOff);
+ SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
+ StackSlot, WordOff);
if (TLI.isLittleEndian())
std::swap(Hi, Lo);
@@ -2382,7 +2409,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
// as a negative number. To counteract this, the dynamic code adds an
// offset depending on the data type.
uint64_t FF;
- switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+ switch (Op0.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unsupported integer type!");
case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float)
case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float)
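The constants in this table are easier to audit once you notice that each one is simply 2^N stored as an IEEE-754 single: a zero mantissa under a biased exponent of 127 + N. A minimal standalone check (plain C++, not part of the patch):

// Illustration only: reproduce the fudge-factor bit patterns used above.
// 2^N as a single-precision float is (127 + N) << 23 (sign 0, mantissa 0).
#include <cstdint>
#include <cstdio>

static uint32_t powerOfTwoAsFloatBits(unsigned N) {
  return (127u + N) << 23;
}

int main() {
  std::printf("2^8  -> 0x%08X\n", (unsigned)powerOfTwoAsFloatBits(8));  // 0x43800000
  std::printf("2^16 -> 0x%08X\n", (unsigned)powerOfTwoAsFloatBits(16)); // 0x47800000
}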
@@ -2395,7 +2422,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
- CPIdx = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), CPIdx, CstOffset);
+ CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
Alignment = std::min(Alignment, 4u);
SDValue FudgeInReg;
if (DestVT == MVT::f32)
@@ -2656,6 +2683,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break;
}
break;
case ISD::ATOMIC_CMP_SWAP:
@@ -2665,6 +2693,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break;
}
break;
case ISD::ATOMIC_LOAD_ADD:
@@ -2674,6 +2703,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break;
}
break;
case ISD::ATOMIC_LOAD_SUB:
@@ -2683,6 +2713,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break;
}
break;
case ISD::ATOMIC_LOAD_AND:
@@ -2692,6 +2723,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break;
}
break;
case ISD::ATOMIC_LOAD_OR:
@@ -2701,6 +2733,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break;
}
break;
case ISD::ATOMIC_LOAD_XOR:
@@ -2710,6 +2743,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break;
}
break;
case ISD::ATOMIC_LOAD_NAND:
@@ -2719,6 +2753,47 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break;
+ }
+ break;
+ case ISD::ATOMIC_LOAD_MAX:
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type for atomic!");
+ case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_MAX_1; break;
+ case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_MAX_2; break;
+ case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_MAX_4; break;
+ case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_MAX_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_MAX_16;break;
+ }
+ break;
+ case ISD::ATOMIC_LOAD_UMAX:
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type for atomic!");
+ case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_UMAX_1; break;
+ case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_UMAX_2; break;
+ case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_UMAX_4; break;
+ case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_UMAX_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_UMAX_16;break;
+ }
+ break;
+ case ISD::ATOMIC_LOAD_MIN:
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type for atomic!");
+ case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_MIN_1; break;
+ case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_MIN_2; break;
+ case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_MIN_4; break;
+ case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_MIN_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_MIN_16;break;
+ }
+ break;
+ case ISD::ATOMIC_LOAD_UMIN:
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type for atomic!");
+ case MVT::i8: LC = RTLIB::SYNC_FETCH_AND_UMIN_1; break;
+ case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_UMIN_2; break;
+ case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_UMIN_4; break;
+ case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_UMIN_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_UMIN_16;break;
}
break;
}
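The added i128 cases extend an existing pattern rather than introducing a new one: the atomic expansion selects a __sync_* runtime routine purely by operand width, and the _16 suffix is the 16-byte sibling of the 1/2/4/8 variants already handled. A rough standalone sketch of that width-to-name mapping (the helper and naming here are illustrative, not LLVM's RTLIB machinery):

// Sketch only: map an operation name and bit width to a __sync_* symbol,
// mirroring how the switches above index libcalls by operand size.
#include <cstdio>
#include <string>

static std::string syncLibcallName(const std::string &Op, unsigned Bits) {
  switch (Bits) {
  case 8:   return "__sync_" + Op + "_1";
  case 16:  return "__sync_" + Op + "_2";
  case 32:  return "__sync_" + Op + "_4";
  case 64:  return "__sync_" + Op + "_8";
  case 128: return "__sync_" + Op + "_16";   // the newly covered width
  default:  return "<unsupported>";
  }
}

int main() {
  std::printf("%s\n", syncLibcallName("fetch_and_add", 128).c_str());
  std::printf("%s\n", syncLibcallName("val_compare_and_swap", 128).c_str());
}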
@@ -2730,6 +2805,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SmallVector<SDValue, 8> Results;
SDLoc dl(Node);
SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ bool NeedInvert;
switch (Node->getOpcode()) {
case ISD::CTPOP:
case ISD::CTLZ:
@@ -2947,20 +3023,20 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (Align > TLI.getMinStackArgumentAlignment()) {
assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
- VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+ VAList = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
DAG.getConstant(Align - 1,
- TLI.getPointerTy()));
+ VAList.getValueType()));
- VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList,
+ VAList = DAG.getNode(ISD::AND, dl, VAList.getValueType(), VAList,
DAG.getConstant(-(int64_t)Align,
- TLI.getPointerTy()));
+ VAList.getValueType()));
}
// Increment the pointer, VAList, to the next vaarg
- Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+ Tmp3 = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
DAG.getConstant(TLI.getDataLayout()->
getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
- TLI.getPointerTy()));
+ VAList.getValueType()));
// Store the incremented VAList to the legalized pointer
Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2,
MachinePointerInfo(V), false, false, 0);
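The VAARG change keeps the usual align-up idiom but performs the arithmetic in VAList's own value type rather than the target pointer type: add Align - 1, then mask with -Align. A small self-contained sketch of that arithmetic, assuming a power-of-two alignment:

// Align-up idiom used above: (Addr + Align - 1) & -Align rounds Addr up to
// the next multiple of Align when Align is a power of two.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t alignUp(uint64_t Addr, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "Align must be a power of 2");
  return (Addr + Align - 1) & ~(Align - 1);   // same bits as & -(int64_t)Align
}

int main() {
  std::printf("%llu\n", (unsigned long long)alignUp(13, 8)); // 16
  std::printf("%llu\n", (unsigned long long)alignUp(16, 8)); // 16
}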
@@ -3231,6 +3307,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
RTLIB::NEARBYINT_F128,
RTLIB::NEARBYINT_PPCF128));
break;
+ case ISD::FROUND:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128));
+ break;
case ISD::FPOWI:
Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
RTLIB::POWI_F80, RTLIB::POWI_F128,
@@ -3565,9 +3648,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
unsigned EntrySize =
DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
- Index = DAG.getNode(ISD::MUL, dl, PTy,
- Index, DAG.getConstant(EntrySize, PTy));
- SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+ Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(),
+ Index, DAG.getConstant(EntrySize, Index.getValueType()));
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
+ Index, Table);
EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr,
@@ -3611,10 +3695,21 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp1 = Node->getOperand(0);
Tmp2 = Node->getOperand(1);
Tmp3 = Node->getOperand(2);
- LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, dl);
+ bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2,
+ Tmp3, NeedInvert, dl);
+
+ if (Legalized) {
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SETCC node.
+ if (Tmp3.getNode())
+ Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Tmp3);
+
+ // If we expanded the SETCC by inverting the condition code, then wrap
+ // the existing SETCC in a NOT to restore the intended condition.
+ if (NeedInvert)
+ Tmp1 = DAG.getNOT(dl, Tmp1, Tmp1->getValueType(0));
- // If we expanded the SETCC into an AND/OR, return the new node
- if (Tmp2.getNode() == 0) {
Results.push_back(Tmp1);
break;
}
@@ -3645,14 +3740,52 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp4 = Node->getOperand(3); // False
SDValue CC = Node->getOperand(4);
- LegalizeSetCCCondCode(getSetCCResultType(Tmp1.getValueType()),
- Tmp1, Tmp2, CC, dl);
+ bool Legalized = false;
+ // Try to legalize by inverting the condition. This is for targets that
+ // might support an ordered version of a condition, but not the unordered
+ // version (or vice versa).
+ ISD::CondCode InvCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ Tmp1.getValueType().isInteger());
+ if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
+ // Use the new condition code and swap true and false
+ Legalized = true;
+ Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
+ } else {
+ // If the inverse is not legal, then try to swap the arguments using
+ // the inverse condition code.
+ ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
+ if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
+ // The swapped inverse condition is legal, so swap true and false,
+ // lhs and rhs.
+ Legalized = true;
+ Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
+ }
+ }
+
+ if (!Legalized) {
+ Legalized = LegalizeSetCCCondCode(
+ getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert,
+ dl);
+
+ assert(Legalized && "Can't legalize SELECT_CC with legal condition!");
+
+ // If we expanded the SETCC by inverting the condition code, then swap
+ // the True/False operands to match.
+ if (NeedInvert)
+ std::swap(Tmp3, Tmp4);
- assert(!Tmp2.getNode() && "Can't legalize SELECT_CC with legal condition!");
- Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
- CC = DAG.getCondCode(ISD::SETNE);
- Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
- Tmp3, Tmp4, CC);
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SELECT_CC node.
+ if (CC.getNode()) {
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Tmp3, Tmp4, CC);
+ } else {
+ Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
+ CC = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
+ Tmp3, Tmp4, CC);
+ }
+ }
Results.push_back(Tmp1);
break;
}
@@ -3662,14 +3795,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp3 = Node->getOperand(3); // RHS
Tmp4 = Node->getOperand(1); // CC
- LegalizeSetCCCondCode(getSetCCResultType(Tmp2.getValueType()),
- Tmp2, Tmp3, Tmp4, dl);
-
- assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
- Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
- Tmp4 = DAG.getCondCode(ISD::SETNE);
- Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
- Tmp3, Node->getOperand(4));
+ bool Legalized = LegalizeSetCCCondCode(getSetCCResultType(
+ Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl);
+ (void)Legalized;
+ assert(Legalized && "Can't legalize BR_CC with legal condition!");
+
+ // If we expanded the SETCC by inverting the condition code, then wrap
+ // the existing SETCC in a NOT to restore the intended condition.
+ if (NeedInvert)
+ Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0));
+
+ // If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
+ // node.
+ if (Tmp4.getNode()) {
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp4, Tmp2, Tmp3, Node->getOperand(4));
+ } else {
+ Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
+ Tmp4 = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
+ Tmp3, Node->getOperand(4));
+ }
Results.push_back(Tmp1);
break;
}
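Taken together, the SETCC, SELECT_CC and BR_CC changes adopt one contract: LegalizeSetCCCondCode reports success through its return value and may ask the caller, via NeedInvert, to negate the result, while a swapped or rewritten condition comes back through the CC operand. Each caller then repairs the result in the cheapest way for its node: a NOT for SETCC, swapped true/false operands for SELECT_CC, an inverted condition for BR_CC. A deliberately simplified, self-contained sketch of that contract (the enum and helpers are illustrative, not LLVM's):

// Simplified sketch of the new contract. The legalizer either rewrites the
// comparison directly or sets NeedInvert so the caller can negate the
// final i1 result in whatever way suits its node.
#include <cstdio>

enum Cond { LT, GE };                         // pretend only LT is "legal"
static bool isCondLegal(Cond C) { return C == LT; }
static Cond invert(Cond C) { return C == LT ? GE : LT; }

static bool legalizeCond(Cond &C, bool &NeedInvert) {
  NeedInvert = false;
  if (isCondLegal(C))
    return true;
  if (isCondLegal(invert(C))) {               // use the inverse condition
    C = invert(C);
    NeedInvert = true;                        // caller must negate the result
    return true;
  }
  return false;                               // caller has to expand further
}

int main() {
  Cond C = GE;
  bool NeedInvert;
  if (legalizeCond(C, NeedInvert))
    std::printf("compare with %s%s\n", C == LT ? "LT" : "GE",
                NeedInvert ? ", then invert the i1 result" : "");
}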
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index cea0b02..ecf4c5d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -88,6 +88,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break;
case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break;
+ case ISD::FROUND: R = SoftenFloatRes_FROUND(N); break;
case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break;
case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break;
case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
@@ -160,7 +161,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
RTLIB::ADD_F80,
RTLIB::ADD_F128,
RTLIB::ADD_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
@@ -172,7 +173,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
RTLIB::CEIL_F80,
RTLIB::CEIL_F128,
RTLIB::CEIL_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
@@ -226,7 +227,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
RTLIB::COS_F80,
RTLIB::COS_F128,
RTLIB::COS_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
@@ -239,7 +240,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
RTLIB::DIV_F80,
RTLIB::DIV_F128,
RTLIB::DIV_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
@@ -251,7 +252,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
RTLIB::EXP_F80,
RTLIB::EXP_F128,
RTLIB::EXP_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
@@ -263,7 +264,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
RTLIB::EXP2_F80,
RTLIB::EXP2_F128,
RTLIB::EXP2_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
@@ -275,7 +276,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
RTLIB::FLOOR_F80,
RTLIB::FLOOR_F128,
RTLIB::FLOOR_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
@@ -287,7 +288,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
RTLIB::LOG_F80,
RTLIB::LOG_F128,
RTLIB::LOG_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
@@ -299,7 +300,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
RTLIB::LOG2_F80,
RTLIB::LOG2_F128,
RTLIB::LOG2_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
@@ -311,7 +312,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
RTLIB::LOG10_F80,
RTLIB::LOG10_F128,
RTLIB::LOG10_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
@@ -325,7 +326,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
RTLIB::FMA_F80,
RTLIB::FMA_F128,
RTLIB::FMA_PPCF128),
- NVT, Ops, 3, false, SDLoc(N));
+ NVT, Ops, 3, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
@@ -338,7 +339,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
RTLIB::MUL_F80,
RTLIB::MUL_F128,
RTLIB::MUL_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
@@ -350,7 +351,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
RTLIB::NEARBYINT_F80,
RTLIB::NEARBYINT_F128,
RTLIB::NEARBYINT_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
@@ -364,7 +365,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
RTLIB::SUB_F80,
RTLIB::SUB_F128,
RTLIB::SUB_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
@@ -372,7 +373,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
SDValue Op = N->getOperand(0);
RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
- return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
}
// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
@@ -381,7 +382,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = N->getOperand(0);
return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
- SDLoc(N));
+ SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
@@ -389,7 +390,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
SDValue Op = N->getOperand(0);
RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
- return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
@@ -402,7 +403,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
RTLIB::POW_F80,
RTLIB::POW_F128,
RTLIB::POW_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
@@ -416,7 +417,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
RTLIB::POWI_F80,
RTLIB::POWI_F128,
RTLIB::POWI_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
@@ -429,7 +430,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
RTLIB::REM_F80,
RTLIB::REM_F128,
RTLIB::REM_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
@@ -441,7 +442,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
RTLIB::RINT_F80,
RTLIB::RINT_F128,
RTLIB::RINT_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128),
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
@@ -453,7 +466,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
RTLIB::SIN_F80,
RTLIB::SIN_F128,
RTLIB::SIN_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
@@ -465,7 +478,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
RTLIB::SQRT_F80,
RTLIB::SQRT_F128,
RTLIB::SQRT_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
@@ -478,7 +491,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
RTLIB::SUB_F80,
RTLIB::SUB_F128,
RTLIB::SUB_PPCF128),
- NVT, Ops, 2, false, SDLoc(N));
+ NVT, Ops, 2, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
@@ -490,7 +503,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
RTLIB::TRUNC_F80,
RTLIB::TRUNC_F128,
RTLIB::TRUNC_PPCF128),
- NVT, &Op, 1, false, SDLoc(N));
+ NVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
@@ -504,7 +517,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
L->getPointerInfo(), NVT, L->isVolatile(),
- L->isNonTemporal(), false, L->getAlignment());
+ L->isNonTemporal(), false, L->getAlignment(),
+ L->getTBAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -516,7 +530,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
L->getMemoryVT(), dl, L->getChain(),
L->getBasePtr(), L->getOffset(), L->getPointerInfo(),
L->getMemoryVT(), L->isVolatile(),
- L->isNonTemporal(), false, L->getAlignment());
+ L->isNonTemporal(), false, L->getAlignment(),
+ L->getTBAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -585,7 +600,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
NVT, N->getOperand(0));
return TLI.makeLibCall(DAG, LC,
TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
- &Op, 1, false, dl);
+ &Op, 1, false, dl).first;
}
@@ -645,7 +660,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
SDValue Op = GetSoftenedFloat(N->getOperand(0));
- return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
@@ -676,7 +691,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
SDValue Op = GetSoftenedFloat(N->getOperand(0));
- return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
@@ -684,14 +699,14 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
SDValue Op = GetSoftenedFloat(N->getOperand(0));
- return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
EVT RVT = N->getValueType(0);
RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
SDValue Op = GetSoftenedFloat(N->getOperand(0));
- return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
@@ -754,9 +769,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
Val = GetSoftenedFloat(Val);
return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
- ST->getPointerInfo(),
- ST->isVolatile(), ST->isNonTemporal(),
- ST->getAlignment());
+ ST->getMemOperand());
}
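SoftenFloatOp_STORE above is one instance of a pattern repeated throughout this patch: instead of re-listing volatile, non-temporal and alignment flags at every call site, the rebuilt load or store forwards the node's memory operand, so a newly added attribute (such as the TBAA tag threaded through elsewhere in this diff) cannot be dropped by accident. A conceptual sketch of why passing the descriptor as one unit is safer (plain C++, not the SelectionDAG API):

// Conceptual only: carrying the memory attributes as one descriptor means
// call sites stay correct when a new field (here, a TBAA tag) is added.
#include <cstdio>

struct MemOpInfo {                 // stand-in for MachineMemOperand
  bool Volatile;
  bool NonTemporal;
  unsigned Alignment;
  const void *TBAATag;             // the newly threaded-through field
};

static void buildStore(const MemOpInfo &MMO) {
  std::printf("store: vol=%d align=%u tbaa=%p\n",
              MMO.Volatile, MMO.Alignment, MMO.TBAATag);
}

int main() {
  MemOpInfo MMO = {false, false, 8, 0};
  buildStore(MMO);                 // forward the whole descriptor
}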
@@ -817,6 +830,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break;
case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break;
case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break;
+ case ISD::FROUND: ExpandFloatRes_FROUND(N, Lo, Hi); break;
case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break;
case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break;
case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break;
@@ -912,7 +926,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
RTLIB::DIV_F128,
RTLIB::DIV_PPCF128),
N->getValueType(0), Ops, 2, false,
- SDLoc(N));
+ SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -986,7 +1000,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
RTLIB::FMA_F128,
RTLIB::FMA_PPCF128),
N->getValueType(0), Ops, 3, false,
- SDLoc(N));
+ SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1000,7 +1014,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
RTLIB::MUL_F128,
RTLIB::MUL_PPCF128),
N->getValueType(0), Ops, 2, false,
- SDLoc(N));
+ SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1072,6 +1086,18 @@ void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
GetPairElements(Call, Lo, Hi);
}
+void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
SDValue &Lo, SDValue &Hi) {
SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
@@ -1102,7 +1128,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
RTLIB::SUB_F128,
RTLIB::SUB_PPCF128),
N->getValueType(0), Ops, 2, false,
- SDLoc(N));
+ SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1134,8 +1160,7 @@ void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
- LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(),
- LD->isNonTemporal(), LD->getAlignment());
+ LD->getMemoryVT(), LD->getMemOperand());
// Remember the chain.
Chain = Hi.getValue(1);
@@ -1181,7 +1206,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
}
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
- Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl);
+ Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl).first;
GetPairElements(Hi, Lo, Hi);
}
@@ -1251,6 +1276,7 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break;
+ case ISD::FCOPYSIGN: Res = ExpandFloatOp_FCOPYSIGN(N); break;
case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break;
case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
@@ -1325,6 +1351,17 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
N->getOperand(4)), 0);
}
+SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) {
+ assert(N->getOperand(1).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Lo, Hi;
+ GetExpandedFloat(N->getOperand(1), Lo, Hi);
+ // The ppcf128 value is providing only the sign; take it from the
+ // higher-order double (which must have the larger magnitude).
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N),
+ N->getValueType(0), N->getOperand(0), Hi);
+}
+
SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
"Logic only correct for ppcf128!");
@@ -1353,7 +1390,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
- return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl);
+ return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
@@ -1386,7 +1423,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1,
- false, dl);
+ false, dl).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
@@ -1445,7 +1482,5 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
GetExpandedOp(ST->getValue(), Lo, Hi);
return DAG.getTruncStore(Chain, SDLoc(N), Hi, Ptr,
- ST->getPointerInfo(),
- ST->getMemoryVT(), ST->isVolatile(),
- ST->isNonTemporal(), ST->getAlignment());
+ ST->getMemoryVT(), ST->getMemOperand());
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ff8f1f9..4255948 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -417,9 +417,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
SDLoc dl(N);
SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
- N->getPointerInfo(),
- N->getMemoryVT(), N->isVolatile(),
- N->isNonTemporal(), N->getAlignment());
+ N->getMemoryVT(), N->getMemOperand());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
@@ -919,7 +917,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
// type does not have a strange size (eg: it is not i1).
EVT VecVT = N->getValueType(0);
unsigned NumElts = VecVT.getVectorNumElements();
- assert(!(NumElts & 1) && "Legal vector of one illegal element?");
+ assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) &&
+ "Legal vector of one illegal element?");
// Promote the inserted value. The type does not need to match the
// vector element type. Check that any extra bits introduced will be
@@ -1037,17 +1036,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
- unsigned Alignment = N->getAlignment();
- bool isVolatile = N->isVolatile();
- bool isNonTemporal = N->isNonTemporal();
SDLoc dl(N);
SDValue Val = GetPromotedInteger(N->getValue()); // Get promoted value.
// Truncate the value and store the result.
- return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getPointerInfo(),
- N->getMemoryVT(),
- isVolatile, isNonTemporal, Alignment);
+ return DAG.getTruncStore(Ch, dl, Val, Ptr,
+ N->getMemoryVT(), N->getMemOperand());
}
SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
@@ -1193,6 +1188,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break;
}
break;
case ISD::ATOMIC_CMP_SWAP:
@@ -1202,6 +1198,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break;
}
break;
case ISD::ATOMIC_LOAD_ADD:
@@ -1211,6 +1208,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break;
}
break;
case ISD::ATOMIC_LOAD_SUB:
@@ -1220,6 +1218,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break;
}
break;
case ISD::ATOMIC_LOAD_AND:
@@ -1229,6 +1228,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break;
}
break;
case ISD::ATOMIC_LOAD_OR:
@@ -1238,6 +1238,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break;
}
break;
case ISD::ATOMIC_LOAD_XOR:
@@ -1247,6 +1248,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break;
}
break;
case ISD::ATOMIC_LOAD_NAND:
@@ -1256,6 +1258,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+ case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break;
}
break;
}
@@ -1770,7 +1773,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
SDValue Op = N->getOperand(0);
RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, dl),
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/,
+ dl).first,
Lo, Hi);
}
@@ -1781,7 +1785,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
SDValue Op = N->getOperand(0);
RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, dl),
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/,
+ dl).first,
Lo, Hi);
}
@@ -1803,6 +1808,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
bool isVolatile = N->isVolatile();
bool isNonTemporal = N->isNonTemporal();
bool isInvariant = N->isInvariant();
+ const MDNode *TBAAInfo = N->getTBAAInfo();
SDLoc dl(N);
assert(NVT.isByteSized() && "Expanded type not byte sized!");
@@ -1811,7 +1817,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
EVT MemVT = N->getMemoryVT();
Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
- MemVT, isVolatile, isNonTemporal, Alignment);
+ MemVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
// Remember the chain.
Ch = Lo.getValue(1);
@@ -1833,7 +1839,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
} else if (TLI.isLittleEndian()) {
// Little-endian - low bits are at low addresses.
Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
- isVolatile, isNonTemporal, isInvariant, Alignment);
+ isVolatile, isNonTemporal, isInvariant, Alignment,
+ TBAAInfo);
unsigned ExcessBits =
N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -1842,11 +1849,11 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
// Increment the pointer to the other half.
unsigned IncrementSize = NVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1864,17 +1871,17 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
EVT::getIntegerVT(*DAG.getContext(),
MemVT.getSizeInBits() - ExcessBits),
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
// Load the rest of the low bits.
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1997,7 +2004,8 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, dl),
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/,
+ dl).first,
Lo, Hi);
}
@@ -2060,7 +2068,7 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
@@ -2155,7 +2163,8 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl), Lo, Hi);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl).first, Lo,
+ Hi);
return;
}
@@ -2238,7 +2247,7 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
@@ -2378,7 +2387,7 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
@@ -2398,7 +2407,7 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
@@ -2685,7 +2694,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Don't know how to expand this SINT_TO_FP!");
- return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N));
+ return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -2702,6 +2711,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
unsigned Alignment = N->getAlignment();
bool isVolatile = N->isVolatile();
bool isNonTemporal = N->isNonTemporal();
+ const MDNode *TBAAInfo = N->getTBAAInfo();
SDLoc dl(N);
SDValue Lo, Hi;
@@ -2711,7 +2721,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
GetExpandedInteger(N->getValue(), Lo, Hi);
return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
N->getMemoryVT(), isVolatile, isNonTemporal,
- Alignment);
+ Alignment, TBAAInfo);
}
if (TLI.isLittleEndian()) {
@@ -2719,7 +2729,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
GetExpandedInteger(N->getValue(), Lo, Hi);
Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
unsigned ExcessBits =
N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -2728,11 +2738,11 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
// Increment the pointer to the other half.
unsigned IncrementSize = NVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
NEVT, isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -2760,17 +2770,17 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
// Store both the high bits and maybe some of the low bits.
Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(),
- HiVT, isVolatile, isNonTemporal, Alignment);
+ HiVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
// Store the lowest ExcessBits bits in the second half.
Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -2835,7 +2845,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet,
Zero, Four);
unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
- FudgePtr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), FudgePtr, Offset);
+ FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(),
+ FudgePtr, Offset);
Alignment = std::min(Alignment, 4u);
// Load the value out, extending it from f32 to the destination float type.
@@ -2852,7 +2863,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Don't know how to expand this UINT_TO_FP!");
- return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl);
+ return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first;
}
SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index fd770d1..eb13230 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -958,20 +958,6 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
return SDValue(N->getOperand(ResNo));
}
-/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
-/// which is split into two not necessarily identical pieces.
-void DAGTypeLegalizer::GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT) {
- // Currently all types are split in half.
- if (!InVT.isVector()) {
- LoVT = HiVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
- } else {
- unsigned NumElements = InVT.getVectorNumElements();
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
- LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(), NumElements/2);
- }
-}
-
/// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
/// high parts of the given value.
void DAGTypeLegalizer::GetPairElements(SDValue Pair,
@@ -988,10 +974,7 @@ SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
SDValue Index) {
SDLoc dl(Index);
// Make sure the index type is big enough to compute in.
- if (Index.getValueType().bitsGT(TLI.getPointerTy()))
- Index = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Index);
- else
- Index = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Index);
+ Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy());
// Calculate the element offset and add it to the pointer.
unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
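getZExtOrTrunc collapses the old truncate-or-zero-extend branch into one call: truncate when the index is wider than the pointer type, zero-extend otherwise. A behavioural sketch in plain integer arithmetic:

// Behavioural sketch of zext-or-trunc (not the SelectionDAG API): narrow
// by masking when the source is wider, otherwise the value is unchanged.
#include <cstdint>
#include <cstdio>

static uint64_t zextOrTrunc(uint64_t Value, unsigned FromBits, unsigned ToBits) {
  if (FromBits > ToBits)                            // "TRUNCATE"
    return Value & ((1ull << ToBits) - 1);
  return Value;                                     // "ZERO_EXTEND"
}

int main() {
  std::printf("0x%llx\n", (unsigned long long)zextOrTrunc(0x1ABCD, 32, 16)); // 0xabcd
  std::printf("0x%llx\n", (unsigned long long)zextOrTrunc(0xABCD, 16, 32));  // 0xabcd
}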
@@ -1024,20 +1007,23 @@ SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
unsigned NumOps = N->getNumOperands();
SDLoc dl(N);
if (NumOps == 0) {
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned, dl);
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned,
+ dl).first;
} else if (NumOps == 1) {
SDValue Op = N->getOperand(0);
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, dl);
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned,
+ dl).first;
} else if (NumOps == 2) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, dl);
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned,
+ dl).first;
}
SmallVector<SDValue, 8> Ops(NumOps);
for (unsigned i = 0; i < NumOps; ++i)
Ops[i] = N->getOperand(i);
return TLI.makeLibCall(DAG, LC, N->getValueType(0),
- &Ops[0], NumOps, isSigned, dl);
+ &Ops[0], NumOps, isSigned, dl).first;
}
// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 63e9af3..13bb08f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -410,6 +410,7 @@ private:
SDValue SoftenFloatRes_FPOWI(SDNode *N);
SDValue SoftenFloatRes_FREM(SDNode *N);
SDValue SoftenFloatRes_FRINT(SDNode *N);
+ SDValue SoftenFloatRes_FROUND(SDNode *N);
SDValue SoftenFloatRes_FSIN(SDNode *N);
SDValue SoftenFloatRes_FSQRT(SDNode *N);
SDValue SoftenFloatRes_FSUB(SDNode *N);
@@ -470,6 +471,7 @@ private:
void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FROUND (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -480,6 +482,7 @@ private:
// Float Operand Expansion.
bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
SDValue ExpandFloatOp_BR_CC(SDNode *N);
+ SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N);
SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
@@ -534,7 +537,7 @@ private:
// Vector Operand Scalarization: <1 x ty> -> ty.
bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
SDValue ScalarizeVecOp_BITCAST(SDNode *N);
- SDValue ScalarizeVecOp_EXTEND(SDNode *N);
+ SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
@@ -558,6 +561,7 @@ private:
void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -628,6 +632,7 @@ private:
SDValue WidenVecRes_Ternary(SDNode *N);
SDValue WidenVecRes_Binary(SDNode *N);
+ SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
SDValue WidenVecRes_Convert(SDNode *N);
SDValue WidenVecRes_POWI(SDNode *N);
SDValue WidenVecRes_Shift(SDNode *N);
@@ -699,10 +704,6 @@ private:
GetExpandedFloat(Op, Lo, Hi);
}
- /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
- /// which is split (or expanded) into two not necessarily identical pieces.
- void GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT);
-
/// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
/// high parts of the given value.
void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
@@ -730,6 +731,12 @@ private:
GetExpandedFloat(Op, Lo, Hi);
}
+
+ /// This function will split the integer \p Op into \p NumElements
+ /// operations of type \p EltVT and store them in \p Ops.
+ void IntegerToVector(SDValue Op, unsigned NumElements,
+ SmallVectorImpl<SDValue> &Ops, EVT EltVT);
+
// Generic Result Expansion.
void ExpandRes_MERGE_VALUES (SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 96f6143..c749fde 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -77,13 +77,9 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
case TargetLowering::TypeWidenVector: {
assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
InOp = GetWidenedVector(InOp);
- EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
- InVT.getVectorNumElements()/2);
- Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
- DAG.getConstant(0, TLI.getVectorIdxTy()));
- Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
- DAG.getConstant(InNVT.getVectorNumElements(),
- TLI.getVectorIdxTy()));
+ EVT LoVT, HiVT;
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
+ llvm::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
if (TLI.isBigEndian())
std::swap(Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
@@ -169,7 +165,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Increment the pointer to the other half.
unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize,
+ StackPtr.getValueType()));
// Load the second half from the stack slot.
Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
@@ -253,20 +250,22 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
bool isVolatile = LD->isVolatile();
bool isNonTemporal = LD->isNonTemporal();
bool isInvariant = LD->isInvariant();
+ const MDNode *TBAAInfo = LD->getTBAAInfo();
assert(NVT.isByteSized() && "Expanded type not byte sized!");
Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
- isVolatile, isNonTemporal, isInvariant, Alignment);
+ isVolatile, isNonTemporal, isInvariant, Alignment,
+ TBAAInfo);
// Increment the pointer to the other half.
unsigned IncrementSize = NVT.getSizeInBits() / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
isVolatile, isNonTemporal, isInvariant,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -307,6 +306,25 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Generic Operand Expansion.
//===--------------------------------------------------------------------===//
+void DAGTypeLegalizer::IntegerToVector(SDValue Op, unsigned NumElements,
+ SmallVectorImpl<SDValue> &Ops,
+ EVT EltVT) {
+ assert(Op.getValueType().isInteger());
+ SDLoc DL(Op);
+ SDValue Parts[2];
+
+ if (NumElements > 1) {
+ NumElements >>= 1;
+ SplitInteger(Op, Parts[0], Parts[1]);
+ if (TLI.isBigEndian())
+ std::swap(Parts[0], Parts[1]);
+ IntegerToVector(Parts[0], NumElements, Ops, EltVT);
+ IntegerToVector(Parts[1], NumElements, Ops, EltVT);
+ } else {
+ Ops.push_back(DAG.getNode(ISD::BITCAST, DL, EltVT, Op));
+ }
+}
+
SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
SDLoc dl(N);
if (N->getValueType(0).isVector()) {
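IntegerToVector recursively halves both the integer and the requested element count with SplitInteger, pushing the low half first (the two halves are swapped up front on big-endian targets) and bitcasting each leaf to the element type. A self-contained sketch of the same recursion on ordinary integers:

// Sketch of the recursive split: halve until the element width is reached,
// collecting pieces low-half first (little-endian order).
#include <cstdint>
#include <cstdio>
#include <vector>

static void integerToPieces(uint64_t Value, unsigned Bits, unsigned EltBits,
                            std::vector<uint64_t> &Pieces) {
  if (Bits > EltBits) {
    unsigned Half = Bits / 2;
    uint64_t LoMask = (1ull << Half) - 1;
    integerToPieces(Value & LoMask, Half, EltBits, Pieces);  // low half first
    integerToPieces(Value >> Half, Half, EltBits, Pieces);   // then high half
  } else {
    Pieces.push_back(Value);
  }
}

int main() {
  std::vector<uint64_t> Pieces;
  integerToPieces(0x1122334455667788ull, 64, 16, Pieces);
  for (unsigned i = 0; i < Pieces.size(); ++i)
    std::printf("0x%04llx ", (unsigned long long)Pieces[i]);  // 7788 5566 3344 1122
  std::printf("\n");
}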
@@ -315,21 +333,27 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
// instead, but only if the new vector type is legal (otherwise there
// is no point, and it might create expansion loops). For example, on
// x86 this turns v1i64 = BITCAST i64 into v1i64 = BITCAST v2i32.
+ //
+ // FIXME: I'm not sure why we are first trying to split the input into
+ // a 2 element vector, so I'm leaving it here to maintain the current
+ // behavior.
+ unsigned NumElts = 2;
EVT OVT = N->getOperand(0).getValueType();
EVT NVT = EVT::getVectorVT(*DAG.getContext(),
TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
- 2);
-
- if (isTypeLegal(NVT)) {
- SDValue Parts[2];
- GetExpandedOp(N->getOperand(0), Parts[0], Parts[1]);
+ NumElts);
+ if (!isTypeLegal(NVT)) {
+ // If we can't find a legal type by splitting the integer in half,
+ // then we can use the node's value type.
+ NumElts = N->getValueType(0).getVectorNumElements();
+ NVT = N->getValueType(0);
+ }
- if (TLI.isBigEndian())
- std::swap(Parts[0], Parts[1]);
+ SmallVector<SDValue, 8> Ops;
+ IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType());
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Parts, 2);
- return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
- }
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], NumElts);
+ return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
}
// Otherwise, store to a temporary and load out again as the new type.
@@ -439,6 +463,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
unsigned Alignment = St->getAlignment();
bool isVolatile = St->isVolatile();
bool isNonTemporal = St->isNonTemporal();
+ const MDNode *TBAAInfo = St->getTBAAInfo();
assert(NVT.isByteSized() && "Expanded type not byte sized!");
unsigned IncrementSize = NVT.getSizeInBits() / 8;
@@ -450,15 +475,14 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
std::swap(Lo, Hi);
Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
- assert(isTypeLegal(Ptr.getValueType()) && "Pointers must be legal!");
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getStore(Chain, dl, Hi, Ptr,
St->getPointerInfo().getWithOffset(IncrementSize),
isVolatile, isNonTemporal,
- MinAlign(Alignment, IncrementSize));
+ MinAlign(Alignment, IncrementSize), TBAAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -489,14 +513,12 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
SDValue Cond = N->getOperand(0);
CL = CH = Cond;
if (Cond.getValueType().isVector()) {
- assert(Cond.getValueType().getVectorElementType() == MVT::i1 &&
- "Condition legalized before result?");
- unsigned NumElements = Cond.getValueType().getVectorNumElements();
- EVT VCondTy = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElements / 2);
- CL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
- DAG.getConstant(0, TLI.getVectorIdxTy()));
- CH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
- DAG.getConstant(NumElements / 2, TLI.getVectorIdxTy()));
+ // Check if there are already split versions of the vector available and
+ // use those instead of splitting the mask operand again.
+ if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Cond, CL, CH);
+ else
+ llvm::tie(CL, CH) = DAG.SplitVector(Cond, dl);
}
Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
@@ -518,7 +540,7 @@ void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getUNDEF(LoVT);
Hi = DAG.getUNDEF(HiVT);
}
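For reference, the IntegerToVector helper introduced above recursively halves the expanded integer (via SplitInteger) until single elements remain, swapping the halves on big-endian targets. A minimal standalone sketch of the same halving pattern, operating on a plain uint64_t rather than SDValues (function name, element widths, and the <= 64-bit assumption are purely illustrative, not the LLVM API):

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Toy model of the halving pattern: split the value in two until each piece
// is one element wide, swapping the halves on big-endian targets so lane 0
// comes out first. Assumes NumElements * EltBits <= 64.
static void integerToLanes(uint64_t Val, unsigned NumElements, unsigned EltBits,
                           bool BigEndian, std::vector<uint64_t> &Ops) {
  if (NumElements > 1) {
    NumElements >>= 1;
    unsigned HalfBits = NumElements * EltBits;
    uint64_t Lo = Val & ((1ULL << HalfBits) - 1);
    uint64_t Hi = Val >> HalfBits;
    if (BigEndian)
      std::swap(Lo, Hi);
    integerToLanes(Lo, NumElements, EltBits, BigEndian, Ops);
    integerToLanes(Hi, NumElements, EltBits, BigEndian, Ops);
  } else {
    Ops.push_back(Val); // single element: just "bitcast" it
  }
}

int main() {
  std::vector<uint64_t> Lanes;
  integerToLanes(0x0102030405060708ULL, 4, 16, /*BigEndian=*/false, Lanes);
  for (unsigned i = 0; i < Lanes.size(); ++i)
    std::printf("0x%04llx ", (unsigned long long)Lanes[i]);
  std::printf("\n"); // 0x0708 0x0506 0x0304 0x0102
  return 0;
}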
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index bbe11b8..2c3cdcc 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -171,7 +171,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
return TranslateLegalizeResults(Op, Result);
case TargetLowering::Custom:
Changed = true;
- return LegalizeOp(TLI.LowerOperation(Result, DAG));
+ return TranslateLegalizeResults(Op, TLI.LowerOperation(Result, DAG));
case TargetLowering::Expand:
Changed = true;
return LegalizeOp(ExpandStore(Op));
@@ -227,6 +227,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FP_TO_UINT:
case ISD::FNEG:
case ISD::FABS:
+ case ISD::FCOPYSIGN:
case ISD::FSQRT:
case ISD::FSIN:
case ISD::FCOS:
@@ -241,6 +242,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT:
+ case ISD::FROUND:
case ISD::FFLOOR:
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
@@ -416,7 +418,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
LD->getPointerInfo().getWithOffset(Offset),
LD->isVolatile(), LD->isNonTemporal(),
- LD->isInvariant(), LD->getAlignment());
+ LD->isInvariant(), LD->getAlignment(),
+ LD->getTBAAInfo());
} else {
EVT LoadVT = WideVT;
while (RemainingBytes < LoadBytes) {
@@ -426,13 +429,14 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
LD->getPointerInfo().getWithOffset(Offset),
LoadVT, LD->isVolatile(),
- LD->isNonTemporal(), LD->getAlignment());
+ LD->isNonTemporal(), LD->getAlignment(),
+ LD->getTBAAInfo());
}
RemainingBytes -= LoadBytes;
Offset += LoadBytes;
BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
- DAG.getIntPtrConstant(LoadBytes));
+ DAG.getConstant(LoadBytes, BasePTR.getValueType()));
LoadVals.push_back(ScalarLoad.getValue(0));
LoadChains.push_back(ScalarLoad.getValue(1));
@@ -497,10 +501,10 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
SrcVT.getScalarType(),
LD->isVolatile(), LD->isNonTemporal(),
- LD->getAlignment());
+ LD->getAlignment(), LD->getTBAAInfo());
BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
- DAG.getIntPtrConstant(Stride));
+ DAG.getConstant(Stride, BasePTR.getValueType()));
Vals.push_back(ScalarLoad.getValue(0));
LoadChains.push_back(ScalarLoad.getValue(1));
@@ -529,6 +533,7 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
unsigned Alignment = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
+ const MDNode *TBAAInfo = ST->getTBAAInfo();
unsigned NumElem = StVT.getVectorNumElements();
// The type of the data we want to save
@@ -556,10 +561,10 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
// This scalar TruncStore may be illegal, but we legalize it later.
SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
- isVolatile, isNonTemporal, Alignment);
+ isVolatile, isNonTemporal, Alignment, TBAAInfo);
BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
- DAG.getIntPtrConstant(Stride));
+ DAG.getConstant(Stride, BasePTR.getValueType()));
Stores.push_back(Store);
}
@@ -597,10 +602,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
return DAG.UnrollVectorOp(Op.getNode());
// Generate a mask operand.
- EVT MaskTy = TLI.getSetCCResultType(*DAG.getContext(), VT);
- assert(MaskTy.isVector() && "Invalid CC type");
- assert(MaskTy.getSizeInBits() == Op1.getValueType().getSizeInBits()
- && "Invalid mask size");
+ EVT MaskTy = VT.changeVectorElementTypeToInteger();
// What is the size of each element in the vector mask.
EVT BitTy = MaskTy.getScalarType();
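The ExpandSELECT hunk above only changes how the mask type is chosen; the surrounding expansion (not shown in full here) blends the two operands with bitwise operations, using a per-lane mask that is either all ones or all zeros. A scalar, single-lane sketch of that blend, assuming 32-bit elements (plain C++, not LLVM code):

#include <cstdint>
#include <cstdio>

// One lane of the bitwise select expansion: widen the i1 condition to an
// all-ones/all-zeros mask of the element's integer type, then blend with
// AND/OR.
static uint32_t selectLane(bool Cond, uint32_t A, uint32_t B) {
  uint32_t Mask = Cond ? 0xFFFFFFFFu : 0u; // sign-extended condition
  return (A & Mask) | (B & ~Mask);
}

int main() {
  std::printf("%u %u\n", selectLane(true, 7, 9), selectLane(false, 7, 9)); // 7 9
  return 0;
}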
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 54380ec..f7a3e3d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -83,6 +83,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
+ case ISD::FROUND:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
@@ -97,6 +98,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::ADD:
case ISD::AND:
case ISD::FADD:
+ case ISD::FCOPYSIGN:
case ISD::FDIV:
case ISD::FMUL:
case ISD::FPOW:
@@ -215,7 +217,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
N->getPointerInfo(),
N->getMemoryVT().getVectorElementType(),
N->isVolatile(), N->isNonTemporal(),
- N->isInvariant(), N->getOriginalAlignment());
+ N->isInvariant(), N->getOriginalAlignment(),
+ N->getTBAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
@@ -369,7 +372,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
- Res = ScalarizeVecOp_EXTEND(N);
+ case ISD::TRUNCATE:
+ Res = ScalarizeVecOp_UnaryOp(N);
break;
case ISD::CONCAT_VECTORS:
Res = ScalarizeVecOp_CONCAT_VECTORS(N);
@@ -408,7 +412,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
/// ScalarizeVecOp_EXTEND - If the value to extend is a vector that needs
/// to be scalarized, it must be <1 x ty>. Extend the element instead.
-SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTEND(SDNode *N) {
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
assert(N->getValueType(0).getVectorNumElements() == 1 &&
"Unexected vector type!");
SDValue Elt = GetScalarizedVector(N->getOperand(0));
@@ -455,12 +459,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
N->getBasePtr(), N->getPointerInfo(),
N->getMemoryVT().getVectorElementType(),
N->isVolatile(), N->isNonTemporal(),
- N->getAlignment());
+ N->getAlignment(), N->getTBAAInfo());
return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
N->isVolatile(), N->isNonTemporal(),
- N->getOriginalAlignment());
+ N->getOriginalAlignment(), N->getTBAAInfo());
}
@@ -517,7 +521,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
break;
- case ISD::ANY_EXTEND:
case ISD::CONVERT_RNDSAT:
case ISD::CTLZ:
case ISD::CTTZ:
@@ -540,21 +543,27 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
+ case ISD::FROUND:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
- case ISD::SIGN_EXTEND:
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
- case ISD::ZERO_EXTEND:
SplitVecRes_UnaryOp(N, Lo, Hi);
break;
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ SplitVecRes_ExtendOp(N, Lo, Hi);
+ break;
+
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::FADD:
+ case ISD::FCOPYSIGN:
case ISD::FSUB:
case ISD::FMUL:
case ISD::SDIV:
@@ -615,7 +624,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
// We know the result is a vector. The input may be either a vector or a
// scalar value.
EVT LoVT, HiVT;
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SDLoc dl(N);
SDValue InOp = N->getOperand(0);
@@ -670,7 +679,7 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(N);
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned LoNumElts = LoVT.getVectorNumElements();
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size());
@@ -691,7 +700,7 @@ void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
}
EVT LoVT, HiVT;
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size());
@@ -707,7 +716,7 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
SDLoc dl(N);
EVT LoVT, HiVT;
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
@@ -731,7 +740,8 @@ void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
SDLoc dl(N);
EVT LoVT, HiVT;
- GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT(), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) =
+ DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
DAG.getValueType(LoVT));
@@ -783,7 +793,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, StackPtr.getValueType()));
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
@@ -794,7 +804,7 @@ void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc dl(N);
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
Hi = DAG.getUNDEF(HiVT);
}
@@ -804,7 +814,7 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
EVT LoVT, HiVT;
SDLoc dl(LD);
- GetSplitDestVTs(LD->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
ISD::LoadExtType ExtType = LD->getExtensionType();
SDValue Ch = LD->getChain();
@@ -815,20 +825,22 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
bool isVolatile = LD->isVolatile();
bool isNonTemporal = LD->isNonTemporal();
bool isInvariant = LD->isInvariant();
+ const MDNode *TBAAInfo = LD->getTBAAInfo();
EVT LoMemVT, HiMemVT;
- GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+ llvm::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal,
- isInvariant, Alignment);
+ isInvariant, Alignment, TBAAInfo);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
LD->getPointerInfo().getWithOffset(IncrementSize),
- HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment);
+ HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment,
+ TBAAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -847,24 +859,12 @@ void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT LoVT, HiVT;
SDLoc DL(N);
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// Split the input.
- EVT InVT = N->getOperand(0).getValueType();
SDValue LL, LH, RL, RH;
- EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
- LoVT.getVectorNumElements());
- LL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
- DAG.getConstant(0, TLI.getVectorIdxTy()));
- LH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
- DAG.getConstant(InNVT.getVectorNumElements(),
- TLI.getVectorIdxTy()));
-
- RL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
- DAG.getConstant(0, TLI.getVectorIdxTy()));
- RH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
- DAG.getConstant(InNVT.getVectorNumElements(),
- TLI.getVectorIdxTy()));
+ llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+ llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
@@ -875,22 +875,15 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
// Get the dest types - they may not match the input types, e.g. int_to_fp.
EVT LoVT, HiVT;
SDLoc dl(N);
- GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
// If the input also splits, handle it directly for a compile time speedup.
// Otherwise split it by hand.
EVT InVT = N->getOperand(0).getValueType();
- if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(0), Lo, Hi);
- } else {
- EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
- LoVT.getVectorNumElements());
- Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
- DAG.getConstant(0, TLI.getVectorIdxTy()));
- Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
- DAG.getConstant(InNVT.getVectorNumElements(),
- TLI.getVectorIdxTy()));
- }
+ else
+ llvm::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
if (N->getOpcode() == ISD::FP_ROUND) {
Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
@@ -913,6 +906,58 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
}
}
+void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ EVT DestVT = N->getValueType(0);
+ EVT LoVT, HiVT;
+ llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
+
+ // We can do better than a generic split operation if the extend is doing
+ // more than just doubling the width of the elements and the following are
+ // true:
+ // - The number of vector elements is even,
+ // - the source type is legal,
+ // - the type of a split source is illegal,
+ // - the type of an extended (by doubling element size) source is legal, and
+ // - the type of that extended source when split is legal.
+ //
+ // This won't necessarily completely legalize the operation, but it will
+ // more effectively move in the right direction and prevent falling down
+ // to scalarization in many cases due to the input vector being split too
+ // far.
+ unsigned NumElements = SrcVT.getVectorNumElements();
+ if ((NumElements & 1) == 0 &&
+ SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT NewSrcVT = EVT::getVectorVT(
+ Ctx, EVT::getIntegerVT(
+ Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2),
+ NumElements);
+ EVT SplitSrcVT =
+ EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
+ EVT SplitLoVT, SplitHiVT;
+ llvm::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
+ if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
+ TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
+ DEBUG(dbgs() << "Split vector extend via incremental extend:";
+ N->dump(&DAG); dbgs() << "\n");
+ // Extend the source vector by one step.
+ SDValue NewSrc =
+ DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
+ // Get the low and high halves of the new, one-step-extended vector.
+ llvm::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
+ // Extend those vector halves the rest of the way.
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+ return;
+ }
+ }
+ // Fall back to the generic unary operator splitting otherwise.
+ SplitVecRes_UnaryOp(N, Lo, Hi);
+}
+
void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
SDValue &Lo, SDValue &Hi) {
// The low and high parts of the original input give four input vectors.
@@ -1105,41 +1150,23 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
SDValue Mask = N->getOperand(0);
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
+ EVT Src0VT = Src0.getValueType();
SDLoc DL(N);
- EVT MaskVT = Mask.getValueType();
- assert(MaskVT.isVector() && "VSELECT without a vector mask?");
+ assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
SDValue Lo, Hi;
GetSplitVector(N->getOperand(0), Lo, Hi);
assert(Lo.getValueType() == Hi.getValueType() &&
"Lo and Hi have differing types");
- unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
- unsigned HiNumElts = Hi.getValueType().getVectorNumElements();
- assert(LoNumElts == HiNumElts && "Asymmetric vector split?");
-
- LLVMContext &Ctx = *DAG.getContext();
- SDValue Zero = DAG.getConstant(0, TLI.getVectorIdxTy());
- SDValue LoElts = DAG.getConstant(LoNumElts, TLI.getVectorIdxTy());
- EVT Src0VT = Src0.getValueType();
- EVT Src0EltTy = Src0VT.getVectorElementType();
- EVT MaskEltTy = MaskVT.getVectorElementType();
-
- EVT LoOpVT = EVT::getVectorVT(Ctx, Src0EltTy, LoNumElts);
- EVT LoMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, LoNumElts);
- EVT HiOpVT = EVT::getVectorVT(Ctx, Src0EltTy, HiNumElts);
- EVT HiMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, HiNumElts);
+ EVT LoOpVT, HiOpVT;
+ llvm::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
+ assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
- SDValue LoOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src0, Zero);
- SDValue LoOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src1, Zero);
-
- SDValue HiOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src0, LoElts);
- SDValue HiOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src1, LoElts);
-
- SDValue LoMask =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoMaskVT, Mask, Zero);
- SDValue HiMask =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiMaskVT, Mask, LoElts);
+ SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
+ llvm::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
+ llvm::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
+ llvm::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
SDValue LoSelect =
DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
@@ -1249,33 +1276,34 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
unsigned Alignment = N->getOriginalAlignment();
bool isVol = N->isVolatile();
bool isNT = N->isNonTemporal();
+ const MDNode *TBAAInfo = N->getTBAAInfo();
SDValue Lo, Hi;
GetSplitVector(N->getOperand(1), Lo, Hi);
EVT LoMemVT, HiMemVT;
- GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+ llvm::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
if (isTruncating)
Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
- LoMemVT, isVol, isNT, Alignment);
+ LoMemVT, isVol, isNT, Alignment, TBAAInfo);
else
Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
- isVol, isNT, Alignment);
+ isVol, isNT, Alignment, TBAAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
- DAG.getIntPtrConstant(IncrementSize));
+ DAG.getConstant(IncrementSize, Ptr.getValueType()));
if (isTruncating)
Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
- HiMemVT, isVol, isNT, Alignment);
+ HiMemVT, isVol, isNT, Alignment, TBAAInfo);
else
Hi = DAG.getStore(Ch, DL, Hi, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
- isVol, isNT, Alignment);
+ isVol, isNT, Alignment, TBAAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
@@ -1341,13 +1369,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
SDLoc DL(N);
// Extract the halves of the input via extract_subvector.
- EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(), NumElements/2);
- SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
- DAG.getConstant(0, TLI.getVectorIdxTy()));
- SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
- DAG.getConstant(NumElements/2,
- TLI.getVectorIdxTy()));
+ SDValue InLoVec, InHiVec;
+ llvm::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL);
// Truncate them to 1/2 the element size.
EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
@@ -1446,27 +1469,31 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VECTOR_SHUFFLE:
Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
break;
+
case ISD::ADD:
case ISD::AND:
case ISD::BSWAP:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::OR:
+ case ISD::SUB:
+ case ISD::XOR:
+ Res = WidenVecRes_Binary(N);
+ break;
+
case ISD::FADD:
case ISD::FCOPYSIGN:
- case ISD::FDIV:
case ISD::FMUL:
case ISD::FPOW:
- case ISD::FREM:
case ISD::FSUB:
- case ISD::MUL:
- case ISD::MULHS:
- case ISD::MULHU:
- case ISD::OR:
+ case ISD::FDIV:
+ case ISD::FREM:
case ISD::SDIV:
- case ISD::SREM:
case ISD::UDIV:
+ case ISD::SREM:
case ISD::UREM:
- case ISD::SUB:
- case ISD::XOR:
- Res = WidenVecRes_Binary(N);
+ Res = WidenVecRes_BinaryCanTrap(N);
break;
case ISD::FPOWI:
@@ -1507,6 +1534,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FNEARBYINT:
case ISD::FNEG:
case ISD::FRINT:
+ case ISD::FROUND:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
@@ -1534,6 +1562,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
+ // Binary op widening for operations that can trap.
unsigned Opcode = N->getOpcode();
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
@@ -2532,6 +2569,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
bool isVolatile = LD->isVolatile();
bool isNonTemporal = LD->isNonTemporal();
bool isInvariant = LD->isInvariant();
+ const MDNode *TBAAInfo = LD->getTBAAInfo();
int LdWidth = LdVT.getSizeInBits();
int WidthDiff = WidenWidth - LdWidth; // Difference
@@ -2541,7 +2579,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
int NewVTWidth = NewVT.getSizeInBits();
SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
- isVolatile, isNonTemporal, isInvariant, Align);
+ isVolatile, isNonTemporal, isInvariant, Align,
+ TBAAInfo);
LdChain.push_back(LdOp.getValue(1));
// Check if we can load the element with one instruction
@@ -2577,7 +2616,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
unsigned Increment = NewVTWidth / 8;
Offset += Increment;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
- DAG.getIntPtrConstant(Increment));
+ DAG.getConstant(Increment, BasePtr.getValueType()));
SDValue L;
if (LdWidth < NewVTWidth) {
@@ -2586,7 +2625,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
NewVTWidth = NewVT.getSizeInBits();
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset), isVolatile,
- isNonTemporal, isInvariant, MinAlign(Align, Increment));
+ isNonTemporal, isInvariant, MinAlign(Align, Increment),
+ TBAAInfo);
LdChain.push_back(L.getValue(1));
if (L->getValueType(0).isVector()) {
SmallVector<SDValue, 16> Loads;
@@ -2602,7 +2642,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
} else {
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset), isVolatile,
- isNonTemporal, isInvariant, MinAlign(Align, Increment));
+ isNonTemporal, isInvariant, MinAlign(Align, Increment),
+ TBAAInfo);
LdChain.push_back(L.getValue(1));
}
@@ -2682,6 +2723,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
unsigned Align = LD->getAlignment();
bool isVolatile = LD->isVolatile();
bool isNonTemporal = LD->isNonTemporal();
+ const MDNode *TBAAInfo = LD->getTBAAInfo();
EVT EltVT = WidenVT.getVectorElementType();
EVT LdEltVT = LdVT.getVectorElementType();
@@ -2693,15 +2735,17 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
unsigned Increment = LdEltVT.getSizeInBits() / 8;
Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr,
LD->getPointerInfo(),
- LdEltVT, isVolatile, isNonTemporal, Align);
+ LdEltVT, isVolatile, isNonTemporal, Align, TBAAInfo);
LdChain.push_back(Ops[0].getValue(1));
unsigned i = 0, Offset = Increment;
for (i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
- BasePtr, DAG.getIntPtrConstant(Offset));
+ BasePtr,
+ DAG.getConstant(Offset,
+ BasePtr.getValueType()));
Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
- isVolatile, isNonTemporal, Align);
+ isVolatile, isNonTemporal, Align, TBAAInfo);
LdChain.push_back(Ops[i].getValue(1));
}
@@ -2724,6 +2768,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
unsigned Align = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
+ const MDNode *TBAAInfo = ST->getTBAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
@@ -2750,12 +2795,12 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo().getWithOffset(Offset),
isVolatile, isNonTemporal,
- MinAlign(Align, Offset)));
+ MinAlign(Align, Offset), TBAAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
Idx += NumVTElts;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
- DAG.getIntPtrConstant(Increment));
+ DAG.getConstant(Increment, BasePtr.getValueType()));
} while (StWidth != 0 && StWidth >= NewVTWidth);
} else {
// Cast the vector to the scalar type we can store
@@ -2770,11 +2815,11 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo().getWithOffset(Offset),
isVolatile, isNonTemporal,
- MinAlign(Align, Offset)));
+ MinAlign(Align, Offset), TBAAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
- DAG.getIntPtrConstant(Increment));
+ DAG.getConstant(Increment, BasePtr.getValueType()));
} while (StWidth != 0 && StWidth >= NewVTWidth);
// Restore index back to be relative to the original widen element type
Idx = Idx * NewVTWidth / ValEltWidth;
@@ -2792,6 +2837,7 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
unsigned Align = ST->getAlignment();
bool isVolatile = ST->isVolatile();
bool isNonTemporal = ST->isNonTemporal();
+ const MDNode *TBAAInfo = ST->getTBAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
SDLoc dl(ST);
@@ -2814,17 +2860,19 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
DAG.getConstant(0, TLI.getVectorIdxTy()));
StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
ST->getPointerInfo(), StEltVT,
- isVolatile, isNonTemporal, Align));
+ isVolatile, isNonTemporal, Align,
+ TBAAInfo));
unsigned Offset = Increment;
for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
- BasePtr, DAG.getIntPtrConstant(Offset));
+ BasePtr, DAG.getConstant(Offset,
+ BasePtr.getValueType()));
SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
DAG.getConstant(0, TLI.getVectorIdxTy()));
StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
ST->getPointerInfo().getWithOffset(Offset),
StEltVT, isVolatile, isNonTemporal,
- MinAlign(Align, Offset)));
+ MinAlign(Align, Offset), TBAAInfo));
}
}
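The incremental-extend strategy added in SplitVecRes_ExtendOp above can be summarized numerically: extending v8i8 to v8i32, for example, first extends one step to v8i16, splits that into two v4i16 halves, and then extends each half to v4i32. A small sketch that prints the plan for such cases; it models only the arithmetic preconditions from the code and omits the four target-dependent legality checks (the helper name and examples are illustrative):

#include <cstdio>

// Print the type chain the incremental-extend path would use, or note the
// generic fallback. Only the arithmetic preconditions are modeled here.
static void planExtendSplit(unsigned NumElts, unsigned SrcBits,
                            unsigned DstBits) {
  if ((NumElts & 1) == 0 && SrcBits * 2 < DstBits) {
    std::printf("extend v%ui%u -> v%ui%u, split into 2 x v%ui%u, "
                "extend halves to v%ui%u\n",
                NumElts, SrcBits, NumElts, SrcBits * 2,
                NumElts / 2, SrcBits * 2, NumElts / 2, DstBits);
  } else {
    std::printf("fall back to the generic unary-op split\n");
  }
}

int main() {
  planExtendSplit(8, 8, 32);  // v8i8 -> v8i32: uses the incremental path
  planExtendSplit(8, 16, 32); // v8i16 -> v8i32: only doubles, generic split
  return 0;
}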
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index d684164..1dd2128 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -389,10 +389,9 @@ signed ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
// Constants used to denote relative importance of
// heuristic components for cost computation.
static const unsigned PriorityOne = 200;
-static const unsigned PriorityTwo = 100;
-static const unsigned PriorityThree = 50;
-static const unsigned PriorityFour = 15;
-static const unsigned PriorityFive = 5;
+static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 15;
+static const unsigned PriorityFour = 5;
static const unsigned ScaleOne = 20;
static const unsigned ScaleTwo = 10;
static const unsigned ScaleThree = 5;
@@ -449,7 +448,7 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
if (N->isMachineOpcode()) {
const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
if (TID.isCall())
- ResCount += (PriorityThree + (ScaleThree*N->getNumValues()));
+ ResCount += (PriorityTwo + (ScaleThree*N->getNumValues()));
}
else
switch (N->getOpcode()) {
@@ -457,11 +456,11 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
case ISD::TokenFactor:
case ISD::CopyFromReg:
case ISD::CopyToReg:
- ResCount += PriorityFive;
+ ResCount += PriorityFour;
break;
case ISD::INLINEASM:
- ResCount += PriorityFour;
+ ResCount += PriorityThree;
break;
}
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index f5fe168..1a562d7 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -718,7 +718,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
// indicate the scheduled cycle.
SU->setHeightToAtLeast(CurCycle);
- // Reserve resources for the scheduled intruction.
+ // Reserve resources for the scheduled instruction.
EmitNode(SU);
Sequence.push_back(SU);
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 982dcc9..054e3dd 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -690,15 +690,6 @@ void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
}
#endif // NDEBUG
-namespace {
- struct OrderSorter {
- bool operator()(const std::pair<unsigned, MachineInstr*> &A,
- const std::pair<unsigned, MachineInstr*> &B) {
- return A.first < B.first;
- }
- };
-}
-
/// ProcessSDDbgValues - Process SDDbgValues associated with this node.
static void
ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
@@ -744,7 +735,10 @@ ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
}
MachineBasicBlock *BB = Emitter.getBlock();
- if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) {
+ if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI() ||
+ // Fast-isel may have inserted some instructions, in which case the
+ // BB->back().isPHI() test will not fire when we want it to.
+ prior(Emitter.getInsertPos())->isPHI()) {
// Did not insert any instruction.
Orders.push_back(std::make_pair(Order, (MachineInstr*)0));
return;
@@ -857,7 +851,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
// Sort the source order instructions and use the order to insert debug
// values.
- std::sort(Orders.begin(), Orders.end(), OrderSorter());
+ std::sort(Orders.begin(), Orders.end(), less_first());
SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
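The OrderSorter comparator removed above is replaced by llvm::less_first(), which simply orders pairs by their first member. A trivial standalone equivalent using plain std types and dummy data (the names here are made up for illustration):

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

typedef std::pair<unsigned, int> OrderEntry; // (IR order, dummy "instruction")

// Stand-in for llvm::less_first(): compare pairs by their first member only.
static bool lessFirst(const OrderEntry &A, const OrderEntry &B) {
  return A.first < B.first;
}

int main() {
  std::vector<OrderEntry> Orders;
  Orders.push_back(OrderEntry(3, 30));
  Orders.push_back(OrderEntry(1, 10));
  Orders.push_back(OrderEntry(2, 20));
  std::sort(Orders.begin(), Orders.end(), lessFirst);
  for (unsigned i = 0; i < Orders.size(); ++i)
    std::printf("%u ", Orders[i].first); // 1 2 3
  std::printf("\n");
  return 0;
}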
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bc6063c..45d5a4f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -869,16 +869,19 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
// EntryNode could meaningfully have debug info if we can find it...
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
- : TM(tm), TSI(*tm.getSelectionDAGInfo()), TTI(0), OptLevel(OL),
+ : TM(tm), TSI(*tm.getSelectionDAGInfo()), TTI(0), TLI(0), OptLevel(OL),
EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
- Root(getEntryNode()), UpdateListeners(0) {
+ Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
+ UpdateListeners(0) {
AllNodes.push_back(&EntryNode);
DbgInfo = new SDDbgInfo();
}
-void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti) {
+void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti,
+ const TargetLowering *tli) {
MF = &mf;
TTI = tti;
+ TLI = tli;
Context = &mf.getFunction()->getContext();
}
@@ -983,6 +986,54 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
Elt = ConstantInt::get(*getContext(), NewVal);
}
+ // In other cases the element type is illegal and needs to be expanded, for
+ // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+ // the value into n parts and use a vector type with n times as many elements.
+ // Then bitcast to the type requested.
+ // Legalizing constants too early makes the DAGCombiner's job harder, so we
+ // only legalize if the DAG tells us we must produce legal types.
+ else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+ TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypeExpandInteger) {
+ APInt NewVal = Elt->getValue();
+ EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+ unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+ EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+
+ // Check that the temporary vector is the correct size. If this fails then
+ // getTypeToTransformTo() probably returned a type whose size (in bits)
+ // isn't a power-of-2 factor of the requested type size.
+ assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SmallVector<SDValue, 2> EltParts;
+ for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
+ EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
+ .trunc(ViaEltSizeInBits),
+ ViaEltVT, isT));
+ }
+
+ // EltParts is currently in little endian order. If we actually want
+ // big-endian order then reverse it now.
+ if (TLI->isBigEndian())
+ std::reverse(EltParts.begin(), EltParts.end());
+
+ // The elements must be reversed when the element order is different
+ // to the endianness of the elements (because the BITCAST is itself a
+ // vector shuffle in this situation). However, we do not need any code to
+ // perform this reversal because getConstant() is producing a vector
+ // splat.
+ // This situation occurs in MIPS MSA.
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+ Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
+
+ SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT,
+ getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT,
+ &Ops[0], Ops.size()));
+ return Result;
+ }
assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
"APInt size does not match type size!");
@@ -1077,9 +1128,10 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
unsigned char TargetFlags) {
assert((TargetFlags == 0 || isTargetGA) &&
"Cannot set target flags on target-independent globals");
+ const TargetLowering *TLI = TM.getTargetLowering();
// Truncate (with sign-extension) the offset value to the pointer size.
- unsigned BitWidth = TM.getTargetLowering()->getPointerTy().getSizeInBits();
+ unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType());
if (BitWidth < 64)
Offset = SignExtend64(Offset, BitWidth);
@@ -1298,11 +1350,8 @@ static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
SDValue N2, const int *Mask) {
- assert(N1.getValueType() == N2.getValueType() && "Invalid VECTOR_SHUFFLE");
- assert(VT.isVector() && N1.getValueType().isVector() &&
- "Vector Shuffle VTs must be a vectors");
- assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType()
- && "Vector Shuffle VTs must have same element type");
+ assert(VT == N1.getValueType() && VT == N2.getValueType() &&
+ "Invalid VECTOR_SHUFFLE");
// Canonicalize shuffle undef, undef -> undef
if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
@@ -1351,17 +1400,13 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
commuteShuffle(N1, N2, MaskVec);
}
- // If Identity shuffle, or all shuffle in to undef, return that node.
- bool AllUndef = true;
+ // If Identity shuffle return that node.
bool Identity = true;
for (unsigned i = 0; i != NElts; ++i) {
if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
- if (MaskVec[i] >= 0) AllUndef = false;
}
- if (Identity && NElts == N1.getValueType().getVectorNumElements())
+ if (Identity && NElts)
return N1;
- if (AllUndef)
- return getUNDEF(VT);
FoldingSetNodeID ID;
SDValue Ops[2] = { N1, N2 };
@@ -1380,7 +1425,9 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
ShuffleVectorSDNode *N =
- new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), N1, N2, MaskAlloc);
+ new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(),
+ dl.getDebugLoc(), N1, N2,
+ MaskAlloc);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -1403,8 +1450,9 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(), dl.getDebugLoc(), Ops, 5,
- Code);
+ CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(),
+ dl.getDebugLoc(),
+ Ops, 5, Code);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -1447,7 +1495,8 @@ SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(), dl.getDebugLoc(), Root, Label);
+ SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(),
+ dl.getDebugLoc(), Root, Label);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -1510,6 +1559,26 @@ SDValue SelectionDAG::getMDNode(const MDNode *MD) {
return SDValue(N, 0);
}
+/// getAddrSpaceCast - Return an AddrSpaceCastSDNode.
+SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
+ unsigned SrcAS, unsigned DestAS) {
+ SDValue Ops[] = {Ptr};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), &Ops[0], 1);
+ ID.AddInteger(SrcAS);
+ ID.AddInteger(DestAS);
+
+ void *IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ SDNode *N = new (NodeAllocator) AddrSpaceCastSDNode(dl.getIROrder(),
+ dl.getDebugLoc(),
+ VT, Ptr, SrcAS, DestAS);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
/// getShiftAmountOperand - Return the specified value casted to
/// the target's desired shift amount type.
@@ -1561,7 +1630,12 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
case ISD::SETFALSE:
case ISD::SETFALSE2: return getConstant(0, VT);
case ISD::SETTRUE:
- case ISD::SETTRUE2: return getConstant(1, VT);
+ case ISD::SETTRUE2: {
+ const TargetLowering *TLI = TM.getTargetLowering();
+ TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(VT.isVector());
+ return getConstant(
+ Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
+ }
case ISD::SETOEQ:
case ISD::SETOGT:
@@ -1643,7 +1717,12 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
}
} else {
// Ensure that the constant occurs on the RHS.
- return getSetCC(dl, VT, N2, N1, ISD::getSetCCSwappedOperands(Cond));
+ ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
+ MVT CompVT = N1.getValueType().getSimpleVT();
+ if (!TM.getTargetLowering()->isCondCodeLegal(SwappedCond, CompVT))
+ return SDValue();
+
+ return getSetCC(dl, VT, N2, N1, SwappedCond);
}
}
@@ -1942,7 +2021,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
case ISD::SIGN_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarType().getSizeInBits();
- APInt InSignBit = APInt::getSignBit(InBits);
APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
KnownZero = KnownZero.trunc(InBits);
@@ -2054,7 +2132,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
const APInt &RA = Rem->getAPIntValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
- APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
ComputeMaskedBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1);
// The low bits of the first operand are unchanged by the srem.
@@ -2150,7 +2227,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
}
case ISD::SIGN_EXTEND:
- Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+ Tmp =
+ VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
case ISD::SIGN_EXTEND_INREG:
@@ -2411,7 +2489,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), getVTList(VT));
+ SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), getVTList(VT));
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
@@ -2672,10 +2751,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand);
+ N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, Operand);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Operand);
+ N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, Operand);
}
AllNodes.push_back(N);
@@ -3073,9 +3154,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
if (VT.isSimple() && N1.getValueType().isSimple()) {
assert(VT.isVector() && N1.getValueType().isVector() &&
"Extract subvector VTs must be a vectors!");
- assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() &&
+ assert(VT.getVectorElementType() ==
+ N1.getValueType().getVectorElementType() &&
"Extract subvector VTs must have the same element type!");
- assert(VT.getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+ assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
"Extract subvector must be from larger vector to smaller vector!");
if (isa<ConstantSDNode>(Index.getNode())) {
@@ -3086,7 +3168,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
}
// Trivial extraction.
- if (VT.getSimpleVT() == N1.getValueType().getSimpleVT())
+ if (VT.getSimpleVT() == N1.getSimpleValueType())
return N1;
}
break;
@@ -3244,10 +3326,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+ N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, N1, N2);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2);
+ N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, N1, N2);
}
AllNodes.push_back(N);
@@ -3316,7 +3400,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
"Insert subvector VTs must be a vectors");
assert(VT == N1.getValueType() &&
"Dest and insert subvector source types must match!");
- assert(N2.getValueType().getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+ assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
"Insert subvector must be from smaller vector to larger vector!");
if (isa<ConstantSDNode>(Index.getNode())) {
assert((N2.getValueType().getVectorNumElements() +
@@ -3326,7 +3410,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
}
// Trivial insertion.
- if (VT.getSimpleVT() == N2.getValueType().getSimpleVT())
+ if (VT.getSimpleVT() == N2.getSimpleValueType())
return N2;
}
break;
@@ -3349,10 +3433,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3);
+ N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, N1, N2, N3);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, N1, N2, N3);
+ N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, N1, N2, N3);
}
AllNodes.push_back(N);
@@ -3771,7 +3857,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
- SDValue Value, Store;
+ SDValue Value;
Value = DAG.getLoad(VT, dl, Chain,
getMemBasePlusOffset(Src, SrcOff, dl, DAG),
@@ -3787,7 +3873,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
- SDValue Value, Store;
+ SDValue Store;
Store = DAG.getStore(Chain, dl, LoadValues[i],
getMemBasePlusOffset(Dst, DstOff, dl, DAG),
@@ -3800,6 +3886,24 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
&OutChains[0], OutChains.size());
}
+/// \brief Lower the call to 'memset' intrinsic function into a series of store
+/// operations.
+///
+/// \param DAG Selection DAG where lowered code is placed.
+/// \param dl Link to corresponding IR location.
+/// \param Chain Control flow dependency.
+/// \param Dst Pointer to destination memory location.
+/// \param Src Value of byte to write into the memory.
+/// \param Size Number of bytes to write.
+/// \param Align Alignment of the destination in bytes.
+/// \param isVol True if destination is volatile.
+/// \param DstPtrInfo IR information on the memory pointer.
+/// \returns New head in the control flow, if lowering was successful, empty
+/// SDValue otherwise.
+///
+/// The function tries to replace 'llvm.memset' intrinsic with several store
+/// operations and value calculation code. This is usually profitable for small
+/// memory size.
static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
SDValue Chain, SDValue Dst,
SDValue Src, uint64_t Size,
@@ -4078,6 +4182,37 @@ SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
+ SDVTList VTList, SDValue* Ops, unsigned NumOps,
+ MachineMemOperand *MMO,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(MemVT.getRawBits());
+ AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void* IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+ cast<AtomicSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ // Allocate the operands array for the node out of the BumpPtrAllocator, since
+ // SDNode doesn't have access to it. This memory will be "leaked" when
+ // the node is deallocated, but recovered when the allocator is released.
+ // If the number of operands is less than 5, we use AtomicSDNode's internal
+ // storage.
+ SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate<SDUse>(NumOps) : 0;
+
+ SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(),
+ dl.getDebugLoc(), VTList, MemVT,
+ Ops, DynOps, NumOps, MMO,
+ Ordering, SynchScope);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
SDValue Chain, SDValue Ptr, SDValue Cmp,
SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment,
@@ -4117,22 +4252,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
EVT VT = Cmp.getValueType();
SDVTList VTs = getVTList(VT, MVT::Other);
- FoldingSetNodeID ID;
- ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
- AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
- ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
- void* IP = 0;
- if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
- cast<AtomicSDNode>(E)->refineAlignment(MMO);
- return SDValue(E, 0);
- }
- SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, Chain,
- Ptr, Cmp, Swp, MMO, Ordering,
- SynchScope);
- CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
- return SDValue(N, 0);
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, 4, MMO, Ordering, SynchScope);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
@@ -4190,22 +4311,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
getVTList(VT, MVT::Other);
- FoldingSetNodeID ID;
- ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr, Val};
- AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
- ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
- void* IP = 0;
- if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
- cast<AtomicSDNode>(E)->refineAlignment(MMO);
- return SDValue(E, 0);
- }
- SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, Chain,
- Ptr, Val, MMO,
- Ordering, SynchScope);
- CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
- return SDValue(N, 0);
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, 3, MMO, Ordering, SynchScope);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
@@ -4248,21 +4355,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
SDVTList VTs = getVTList(VT, MVT::Other);
- FoldingSetNodeID ID;
- ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr};
- AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
- ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
- void* IP = 0;
- if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
- cast<AtomicSDNode>(E)->refineAlignment(MMO);
- return SDValue(E, 0);
- }
- SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTs, MemVT, Chain,
- Ptr, MMO, Ordering, SynchScope);
- CSEMap.InsertNode(N, IP);
- AllNodes.push_back(N);
- return SDValue(N, 0);
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, 2, MMO, Ordering, SynchScope);
}
/// getMergeValues - Create a MERGE_VALUES node from the given operands.
@@ -4339,12 +4433,14 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
return SDValue(E, 0);
}
- N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, NumOps,
- MemVT, MMO);
+ N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
+ dl.getDebugLoc(), VTList, Ops,
+ NumOps, MemVT, MMO);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(), dl.getDebugLoc(), VTList, Ops, NumOps,
- MemVT, MMO);
+ N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
+ dl.getDebugLoc(), VTList, Ops,
+ NumOps, MemVT, MMO);
}
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -4458,7 +4554,8 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
cast<LoadSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM, ExtType,
+ SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(),
+ dl.getDebugLoc(), VTs, AM, ExtType,
MemVT, MMO);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
@@ -4478,6 +4575,14 @@ SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
TBAAInfo, Ranges);
}
+SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
+ SDValue Chain, SDValue Ptr,
+ MachineMemOperand *MMO) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ VT, MMO);
+}
+
SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo, EVT MemVT,
@@ -4490,6 +4595,14 @@ SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
}
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
+ SDValue Chain, SDValue Ptr, EVT MemVT,
+ MachineMemOperand *MMO) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+ MemVT, MMO);
+}
+
SDValue
SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base,
SDValue Offset, ISD::MemIndexedMode AM) {
@@ -4548,8 +4661,9 @@ SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
cast<StoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED,
- false, VT, MMO);
+ SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+ dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, false, VT, MMO);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -4616,8 +4730,9 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
cast<StoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, ISD::UNINDEXED,
- true, SVT, MMO);
+ SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+ dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, true, SVT, MMO);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -4640,7 +4755,8 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+ dl.getDebugLoc(), VTs, AM,
ST->isTruncatingStore(),
ST->getMemoryVT(),
ST->getMemOperand());
@@ -4715,10 +4831,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops, NumOps);
+ N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ VTs, Ops, NumOps);
CSEMap.InsertNode(N, IP);
} else {
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs, Ops, NumOps);
+ N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ VTs, Ops, NumOps);
}
AllNodes.push_back(N);
@@ -4781,26 +4899,36 @@ SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
return SDValue(E, 0);
if (NumOps == 1) {
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]);
+ N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTList, Ops[0]);
} else if (NumOps == 2) {
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]);
+ N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTList, Ops[0],
+ Ops[1]);
} else if (NumOps == 3) {
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1],
- Ops[2]);
+ N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTList, Ops[0],
+ Ops[1], Ops[2]);
} else {
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops, NumOps);
+ N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ VTList, Ops, NumOps);
}
CSEMap.InsertNode(N, IP);
} else {
if (NumOps == 1) {
- N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0]);
+ N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTList, Ops[0]);
} else if (NumOps == 2) {
- N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1]);
+ N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTList, Ops[0],
+ Ops[1]);
} else if (NumOps == 3) {
- N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops[0], Ops[1],
- Ops[2]);
+ N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTList, Ops[0],
+ Ops[1], Ops[2]);
} else {
- N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList, Ops, NumOps);
+ N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ VTList, Ops, NumOps);
}
}
AllNodes.push_back(N);
@@ -4851,76 +4979,81 @@ SDVTList SelectionDAG::getVTList(EVT VT) {
}
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
- for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
- E = VTList.rend(); I != E; ++I)
- if (I->NumVTs == 2 && I->VTs[0] == VT1 && I->VTs[1] == VT2)
- return *I;
-
- EVT *Array = Allocator.Allocate<EVT>(2);
- Array[0] = VT1;
- Array[1] = VT2;
- SDVTList Result = makeVTList(Array, 2);
- VTList.push_back(Result);
- return Result;
+ FoldingSetNodeID ID;
+ ID.AddInteger(2U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+
+ void *IP = 0;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (Result == NULL) {
+ EVT *Array = Allocator.Allocate<EVT>(2);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
}
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
- for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
- E = VTList.rend(); I != E; ++I)
- if (I->NumVTs == 3 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
- I->VTs[2] == VT3)
- return *I;
-
- EVT *Array = Allocator.Allocate<EVT>(3);
- Array[0] = VT1;
- Array[1] = VT2;
- Array[2] = VT3;
- SDVTList Result = makeVTList(Array, 3);
- VTList.push_back(Result);
- return Result;
+ FoldingSetNodeID ID;
+ ID.AddInteger(3U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+ ID.AddInteger(VT3.getRawBits());
+
+ void *IP = 0;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (Result == NULL) {
+ EVT *Array = Allocator.Allocate<EVT>(3);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
}
SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
- for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
- E = VTList.rend(); I != E; ++I)
- if (I->NumVTs == 4 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
- I->VTs[2] == VT3 && I->VTs[3] == VT4)
- return *I;
-
- EVT *Array = Allocator.Allocate<EVT>(4);
- Array[0] = VT1;
- Array[1] = VT2;
- Array[2] = VT3;
- Array[3] = VT4;
- SDVTList Result = makeVTList(Array, 4);
- VTList.push_back(Result);
- return Result;
+ FoldingSetNodeID ID;
+ ID.AddInteger(4U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+ ID.AddInteger(VT3.getRawBits());
+ ID.AddInteger(VT4.getRawBits());
+
+ void *IP = 0;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (Result == NULL) {
+ EVT *Array = Allocator.Allocate<EVT>(4);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Array[3] = VT4;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
}
SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) {
- switch (NumVTs) {
- case 0: llvm_unreachable("Cannot have nodes without results!");
- case 1: return getVTList(VTs[0]);
- case 2: return getVTList(VTs[0], VTs[1]);
- case 3: return getVTList(VTs[0], VTs[1], VTs[2]);
- case 4: return getVTList(VTs[0], VTs[1], VTs[2], VTs[3]);
- default: break;
+ FoldingSetNodeID ID;
+ ID.AddInteger(NumVTs);
+ for (unsigned index = 0; index < NumVTs; index++) {
+ ID.AddInteger(VTs[index].getRawBits());
}
- for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
- E = VTList.rend(); I != E; ++I) {
- if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
- continue;
-
- if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2]))
- return *I;
+ void *IP = 0;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (Result == NULL) {
+ EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+ std::copy(VTs, VTs + NumVTs, Array);
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
+ VTListMap.InsertNode(Result, IP);
}
-
- EVT *Array = Allocator.Allocate<EVT>(NumVTs);
- std::copy(VTs, VTs+NumVTs, Array);
- SDVTList Result = makeVTList(Array, NumVTs);
- VTList.push_back(Result);
- return Result;
+ return Result->getSDVTList();
}
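
The rewritten getVTList overloads above stop scanning a growing std::vector<SDVTList> and instead intern each VT list in a FoldingSet keyed on the element count and raw type bits. A minimal, self-contained sketch of that uniquing pattern, assuming only llvm/ADT/FoldingSet.h; VTKeyNode and findOrCreate are illustrative names, not part of the patch:

#include "llvm/ADT/FoldingSet.h"
#include <vector>

namespace {
struct VTKeyNode : llvm::FoldingSetNode {
  std::vector<unsigned> RawVTs;                 // stand-in for EVT raw bits
  explicit VTKeyNode(const std::vector<unsigned> &V) : RawVTs(V) {}
  void Profile(llvm::FoldingSetNodeID &ID) const {
    ID.AddInteger(unsigned(RawVTs.size()));
    for (unsigned i = 0, e = RawVTs.size(); i != e; ++i)
      ID.AddInteger(RawVTs[i]);
  }
};
}

static llvm::FoldingSet<VTKeyNode> InternedVTs;

static VTKeyNode *findOrCreate(const std::vector<unsigned> &VTs) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(unsigned(VTs.size()));
  for (unsigned i = 0, e = VTs.size(); i != e; ++i)
    ID.AddInteger(VTs[i]);

  void *IP = 0;
  if (VTKeyNode *N = InternedVTs.FindNodeOrInsertPos(ID, IP))
    return N;                      // hit: no linear rescan of earlier lists
  VTKeyNode *N = new VTKeyNode(VTs);
  InternedVTs.InsertNode(N, IP);   // miss: allocate once and remember it
  return N;
}
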
@@ -5410,7 +5543,8 @@ SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
}
// Allocate a new MachineSDNode.
- N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs);
// Initialize the operands list.
if (NumOps > array_lengthof(N->LocalOperands))
@@ -5916,6 +6050,12 @@ GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
TheGlobal = GA;
}
+AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT,
+ SDValue X, unsigned SrcAS,
+ unsigned DestAS)
+ : UnarySDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT), X),
+ SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+
MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
EVT memvt, MachineMemOperand *mmo)
: SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
@@ -6162,8 +6302,8 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
case ISD::ROTL:
case ISD::ROTR:
Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
- getShiftAmountOperand(Operands[0].getValueType(),
- Operands[1])));
+ getShiftAmountOperand(Operands[0].getValueType(),
+ Operands[1])));
break;
case ISD::SIGN_EXTEND_INREG:
case ISD::FP_ROUND_INREG: {
@@ -6235,7 +6375,7 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
int64_t GVOffset = 0;
const TargetLowering *TLI = TM.getTargetLowering();
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
- unsigned PtrWidth = TLI->getPointerTy().getSizeInBits();
+ unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType());
APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), KnownZero, KnownOne,
TLI->getDataLayout());
@@ -6268,6 +6408,38 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
return 0;
}
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split (or expanded) into two not necessarily identical pieces.
+std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
+ // Currently all types are split in half.
+ EVT LoVT, HiVT;
+ if (!VT.isVector()) {
+ LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
+ } else {
+ unsigned NumElements = VT.getVectorNumElements();
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+ LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
+ NumElements/2);
+ }
+ return std::make_pair(LoVT, HiVT);
+}
+
+/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
+/// low/high part.
+std::pair<SDValue, SDValue>
+SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
+ const EVT &HiVT) {
+ assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
+ N.getValueType().getVectorNumElements() &&
+ "More vector elements requested than available!");
+ SDValue Lo, Hi;
+ Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+ getConstant(0, TLI->getVectorIdxTy()));
+ Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+ getConstant(LoVT.getVectorNumElements(), TLI->getVectorIdxTy()));
+ return std::make_pair(Lo, Hi);
+}
+
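
A hedged usage sketch of the two helpers just added, roughly how type legalization would split a wide vector value in half; splitInHalf is an illustrative wrapper, and the DAG, dl, and N arguments are assumed to come from the surrounding lowering code:

static std::pair<SDValue, SDValue> splitInHalf(SelectionDAG &DAG, SDLoc dl,
                                               SDValue N) {
  // Both halves currently get the same type, per GetSplitDestVTs above.
  std::pair<EVT, EVT> VTs = DAG.GetSplitDestVTs(N.getValueType());
  // Materialize the two EXTRACT_SUBVECTOR nodes for the low and high parts.
  return DAG.SplitVector(N, dl, VTs.first, VTs.second);
}
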
// getAddressSpace - Return the address space this GlobalAddress belongs to.
unsigned GlobalAddressSDNode::getAddressSpace() const {
return getGlobal()->getType()->getAddressSpace();
@@ -6389,7 +6561,7 @@ static void checkForCyclesHelper(const SDNode *N,
void llvm::checkForCycles(const llvm::SDNode *N) {
#ifdef XDEBUG
- assert(N && "Checking nonexistant SDNode");
+ assert(N && "Checking nonexistent SDNode");
SmallPtrSet<const SDNode*, 32> visited;
SmallPtrSet<const SDNode*, 32> checked;
checkForCyclesHelper(N, visited, checked);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b9f4381..2b2713d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/DebugInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
@@ -49,7 +50,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IntegersSubsetMapping.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -58,6 +58,7 @@
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include <algorithm>
using namespace llvm;
@@ -1063,8 +1064,10 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
- if (isa<ConstantPointerNull>(C))
- return DAG.getConstant(0, TLI->getPointerTy());
+ if (isa<ConstantPointerNull>(C)) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ return DAG.getConstant(0, TLI->getPointerTy(AS));
+ }
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return DAG.getConstantFP(*CFP, VT);
@@ -1268,7 +1271,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
for (unsigned i = 0; i < NumParts; ++i) {
Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
- /*isfixed=*/true, 0, 0));
+ VT, /*isfixed=*/true, 0, 0));
OutVals.push_back(Parts[i]);
}
}
@@ -1617,8 +1620,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
} else
Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
} else {
- assert(CB.CC == ISD::SETCC_INVALID &&
- "Condition is undefined for to-the-range belonging check.");
+ assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
@@ -1626,9 +1628,9 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
SDValue CmpOp = getValue(CB.CmpMHS);
EVT VT = CmpOp.getValueType();
- if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) {
+ if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
- ISD::SETULE);
+ ISD::SETLE);
} else {
SDValue SUB = DAG.getNode(ISD::SUB, dl,
VT, CmpOp, DAG.getConstant(Low, VT));
@@ -1741,6 +1743,77 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
DAG.setRoot(BrCond);
}
+/// Codegen a new tail for a stack protector check ParentMBB which has had its
+/// tail spliced into a stack protector check success bb.
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB) {
+
+ // First create the loads to the guard/stack slot for the comparison.
+ const TargetLowering *TLI = TM.getTargetLowering();
+ EVT PtrTy = TLI->getPointerTy();
+
+ MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
+ int FI = MFI->getStackProtectorIndex();
+
+ const Value *IRGuard = SPD.getGuard();
+ SDValue GuardPtr = getValue(IRGuard);
+ SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+
+ unsigned Align =
+ TLI->getDataLayout()->getPrefTypeAlignment(IRGuard->getType());
+ SDValue Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+ GuardPtr, MachinePointerInfo(IRGuard, 0),
+ true, false, false, Align);
+
+ SDValue StackSlot = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+ StackSlotPtr,
+ MachinePointerInfo::getFixedStack(FI),
+ true, false, false, Align);
+
+ // Perform the comparison via a subtract/getsetcc.
+ EVT VT = Guard.getValueType();
+ SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, Guard, StackSlot);
+
+ SDValue Cmp = DAG.getSetCC(getCurSDLoc(),
+ TLI->getSetCCResultType(*DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(0, VT),
+ ISD::SETNE);
+
+ // If the sub is not 0, then we know the guard/stackslot do not equal, so
+ // branch to failure MBB.
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurSDLoc(),
+ MVT::Other, StackSlot.getOperand(0),
+ Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
+ // Otherwise branch to success MBB.
+ SDValue Br = DAG.getNode(ISD::BR, getCurSDLoc(),
+ MVT::Other, BrCond,
+ DAG.getBasicBlock(SPD.getSuccessMBB()));
+
+ DAG.setRoot(Br);
+}
+
+/// Codegen the failure basic block for a stack protector check.
+///
+/// A failure stack protector machine basic block consists simply of a call to
+/// __stack_chk_fail().
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void
+SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+ const TargetLowering *TLI = TM.getTargetLowering();
+ SDValue Chain = TLI->makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL,
+ MVT::isVoid, 0, 0, false, getCurSDLoc(),
+ false, false).second;
+ DAG.setRoot(Chain);
+}
+
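
A plain C++ model, not taken from the patch, of what the two functions above emit: the parent block's new tail loads the guard and the stack slot, subtracts them, and branches to a failure block whose only job is to call __stack_chk_fail(). The helper name and the unsigned long guard word are assumptions for illustration:

extern "C" void __stack_chk_fail(void);

static void stackProtectorCheckModel(const unsigned long *GuardPtr,
                                     const unsigned long *StackSlotPtr) {
  unsigned long Guard = *GuardPtr;      // load of the guard variable
  unsigned long Slot  = *StackSlotPtr;  // load of the protector stack slot
  if (Guard - Slot != 0)                // ISD::SUB feeding SETNE + BRCOND
    __stack_chk_fail();                 // failure MBB; never returns
  // Fall through: success MBB holding the original block's spliced tail.
}
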
/// visitBitTestHeader - This function emits necessary code to produce value
/// suitable for "bit tests"
void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
@@ -2073,7 +2146,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
CC = ISD::SETEQ;
LHS = SV; RHS = I->High; MHS = NULL;
} else {
- CC = ISD::SETCC_INVALID;
+ CC = ISD::SETLE;
LHS = I->Low; MHS = SV; RHS = I->High;
}
@@ -2107,7 +2180,7 @@ static inline bool areJTsAllowed(const TargetLowering &TLI) {
static APInt ComputeRange(const APInt &First, const APInt &Last) {
uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
- APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth);
+ APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth);
return (LastExt - FirstExt + 1ULL);
}
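
A small worked example of why ComputeRange now sign-extends: with First = -2 and Last = 3 the signed range is the dense value 6, whereas zero-extension would have treated -2 as 254 and produced a wrapped, oversized range. Plain fixed-width integers stand in for APInt below:

#include <cassert>

static long computeRangeModel(signed char First, signed char Last) {
  long FirstExt = First;               // models First.sext(BitWidth)
  long LastExt = Last;                 // models Last.sext(BitWidth)
  return LastExt - FirstExt + 1;
}

static void computeRangeExample() {
  assert(computeRangeModel(-2, 3) == 6);          // signed: dense, small range
  assert((unsigned char)(signed char)-2 == 254);  // what zext would compare
}
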
@@ -2174,7 +2247,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
const APInt &Low = cast<ConstantInt>(I->Low)->getValue();
const APInt &High = cast<ConstantInt>(I->High)->getValue();
- if (Low.ule(TEI) && TEI.ule(High)) {
+ if (Low.sle(TEI) && TEI.sle(High)) {
DestBBs.push_back(I->BB);
if (TEI==High)
++I;
@@ -2348,7 +2421,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
// Create a CaseBlock record representing a conditional branch to
// the LHS node if the value being switched on SV is less than C.
// Otherwise, branch to LHS.
- CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+ CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
if (CR.CaseBB == SwitchBB)
visitSwitchCase(CB, SwitchBB);
@@ -2378,7 +2451,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
MachineFunction *CurMF = FuncInfo.MF;
// If target does not have legal shift left, do not emit bit tests at all.
- if (!TLI->isOperationLegal(ISD::SHL, TLI->getPointerTy()))
+ if (!TLI->isOperationLegal(ISD::SHL, PTy))
return false;
size_t numCmps = 0;
@@ -2421,7 +2494,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
// Optimize the case where all the case values fit in a
// word without having to subtract minValue. In this case,
// we can optimize away the subtraction.
- if (maxValue.ult(IntPtrBits)) {
+ if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
cmpRange = maxValue;
} else {
lowBound = minValue;
@@ -2496,12 +2569,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
/// Clusterify - Transform simple list of Cases into list of CaseRange's
size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
const SwitchInst& SI) {
-
- /// Use a shorter form of declaration, and also
- /// show the we want to use CRSBuilder as Clusterifier.
- typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
-
- Clusterifier TheClusterifier;
+ size_t numCmps = 0;
BranchProbabilityInfo *BPI = FuncInfo.BPI;
// Start with "simple" cases
@@ -2510,27 +2578,40 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
const BasicBlock *SuccBB = i.getCaseSuccessor();
MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
- TheClusterifier.add(i.getCaseValueEx(), SMBB,
- BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0);
- }
-
- TheClusterifier.optimize();
-
- size_t numCmps = 0;
- for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
- e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
- Clusterifier::Cluster &C = *i;
- // Update edge weight for the cluster.
- unsigned W = C.first.Weight;
-
- // FIXME: Currently work with ConstantInt based numbers.
- // Changing it to APInt based is a pretty heavy for this commit.
- Cases.push_back(Case(C.first.getLow().toConstantInt(),
- C.first.getHigh().toConstantInt(), C.second, W));
+ uint32_t ExtraWeight =
+ BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0;
+
+ Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(),
+ SMBB, ExtraWeight));
+ }
+ std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+ // Merge cases into clusters
+ if (Cases.size() >= 2)
+ // Must recompute end() each iteration because it may be
+ // invalidated by erase if we hold on to it
+ for (CaseItr I = Cases.begin(), J = llvm::next(Cases.begin());
+ J != Cases.end(); ) {
+ const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
+ const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
+ MachineBasicBlock* nextBB = J->BB;
+ MachineBasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ I->ExtraWeight += J->ExtraWeight;
+ J = Cases.erase(J);
+ } else {
+ I = J++;
+ }
+ }
- if (C.first.getLow() != C.first.getHigh())
- // A range counts double, since it requires two compares.
- ++numCmps;
+ for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+ if (I->Low != I->High)
+ // A range counts double, since it requires two compares.
+ ++numCmps;
}
return numCmps;
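
A self-contained model of the merge loop above, using plain integers in place of APInt and MachineBasicBlock: adjacent case values that differ by one and branch to the same destination collapse into a single [Low, High] range whose weights are summed. Cases are assumed already sorted by Low, as after the std::sort call:

#include <vector>

struct SimpleCase { long Low, High; int Dest; unsigned Weight; };

static void mergeAdjacentCases(std::vector<SimpleCase> &Cases) {
  if (Cases.size() < 2)
    return;
  for (size_t I = 0, J = 1; J < Cases.size(); ) {
    if (Cases[J].Low - Cases[I].High == 1 && Cases[I].Dest == Cases[J].Dest) {
      Cases[I].High = Cases[J].High;           // widen the current range
      Cases[I].Weight += Cases[J].Weight;      // keep the profile weight
      Cases.erase(Cases.begin() + J);          // J now indexes the next case
    } else {
      I = J++;                                 // start a new cluster
    }
  }
}
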
@@ -2859,6 +2940,21 @@ void SelectionDAGBuilder::visitBitCast(const User &I) {
setValue(&I, N); // noop cast.
}
+void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Value *SV = I.getOperand(0);
+ SDValue N = getValue(SV);
+ EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+
+ unsigned SrcAS = SV->getType()->getPointerAddressSpace();
+ unsigned DestAS = I.getType()->getPointerAddressSpace();
+
+ if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
+ N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);
+
+ setValue(&I, N);
+}
+
void SelectionDAGBuilder::visitInsertElement(const User &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue InVec = getValue(I.getOperand(0));
@@ -3151,10 +3247,12 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
}
void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
- SDValue N = getValue(I.getOperand(0));
+ Value *Op0 = I.getOperand(0);
// Note that the pointer operand may be a vector of pointers. Take the scalar
// element which holds a pointer.
- Type *Ty = I.getOperand(0)->getType()->getScalarType();
+ Type *Ty = Op0->getType()->getScalarType();
+ unsigned AS = Ty->getPointerAddressSpace();
+ SDValue N = getValue(Op0);
for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
OI != E; ++OI) {
@@ -3179,14 +3277,13 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
uint64_t Offs =
TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
SDValue OffsVal;
- EVT PTy = TLI->getPointerTy();
+ EVT PTy = TLI->getPointerTy(AS);
unsigned PtrBits = PTy.getSizeInBits();
if (PtrBits < 64)
- OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(),
- TLI->getPointerTy(),
+ OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy,
DAG.getConstant(Offs, MVT::i64));
else
- OffsVal = DAG.getIntPtrConstant(Offs);
+ OffsVal = DAG.getConstant(Offs, PTy);
N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N,
OffsVal);
@@ -3194,7 +3291,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
}
// N = N + Idx * ElementSize;
- APInt ElementSize = APInt(TLI->getPointerTy().getSizeInBits(),
+ APInt ElementSize = APInt(TLI->getPointerSizeInBits(AS),
TD->getTypeAllocSize(Ty));
SDValue IdxN = getValue(Idx);
@@ -3451,7 +3548,7 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
SDValue L =
DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
- getValue(I.getCompareOperand()).getValueType().getSimpleVT(),
+ getValue(I.getCompareOperand()).getSimpleValueType(),
InChain,
getValue(I.getPointerOperand()),
getValue(I.getCompareOperand()),
@@ -3499,7 +3596,7 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
SDValue L =
DAG.getAtomic(NT, dl,
- getValue(I.getValOperand()).getValueType().getSimpleVT(),
+ getValue(I.getValOperand()).getSimpleValueType(),
InChain,
getValue(I.getPointerOperand()),
getValue(I.getValOperand()),
@@ -4193,7 +4290,7 @@ static SDValue expandExp2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS,
SelectionDAG &DAG, const TargetLowering &TLI) {
bool IsExp10 = false;
- if (LHS.getValueType() == MVT::f32 && LHS.getValueType() == MVT::f32 &&
+ if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
APFloat Ten(10.0f);
@@ -4705,14 +4802,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl,
TLI->getPointerTy());
SDValue Offset = DAG.getNode(ISD::ADD, sdl,
- TLI->getPointerTy(),
+ CfaArg.getValueType(),
DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl,
- TLI->getPointerTy()),
+ CfaArg.getValueType()),
CfaArg);
SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl,
TLI->getPointerTy(),
DAG.getConstant(0, TLI->getPointerTy()));
- setValue(&I, DAG.getNode(ISD::ADD, sdl, TLI->getPointerTy(),
+ setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(),
FA, Offset));
return 0;
}
@@ -4902,7 +4999,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
- case Intrinsic::nearbyint: {
+ case Intrinsic::nearbyint:
+ case Intrinsic::round: {
unsigned Opcode;
switch (Intrinsic) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -4915,6 +5013,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
}
setValue(&I, DAG.getNode(Opcode, sdl,
@@ -4922,6 +5021,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(0))));
return 0;
}
+ case Intrinsic::copysign:
+ setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1))));
+ return 0;
case Intrinsic::fma:
setValue(&I, DAG.getNode(ISD::FMA, sdl,
getValue(I.getArgOperand(0)).getValueType(),
@@ -5207,9 +5312,30 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::invariant_end:
// Discard region information.
return 0;
+ case Intrinsic::stackprotectorcheck: {
+ // Do not actually emit anything for this basic block. Instead we initialize
+ // the stack protector descriptor and export the guard variable so we can
+ // access it in FinishBasicBlock.
+ const BasicBlock *BB = I.getParent();
+ SPDescriptor.initialize(BB, FuncInfo.MBBMap[BB], I);
+ ExportFromCurrentBlock(SPDescriptor.getGuard());
+
+ // Flush our exports since we are going to process a terminator.
+ (void)getControlRoot();
+ return 0;
+ }
case Intrinsic::donothing:
// ignore
return 0;
+ case Intrinsic::experimental_stackmap: {
+ visitStackmap(I);
+ return 0;
+ }
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64: {
+ visitPatchpoint(I);
+ return 0;
+ }
}
}
@@ -5274,15 +5400,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
SDValue ArgNode = getValue(V);
Entry.Node = ArgNode; Entry.Ty = V->getType();
- unsigned attrInd = i - CS.arg_begin() + 1;
- Entry.isSExt = CS.paramHasAttr(attrInd, Attribute::SExt);
- Entry.isZExt = CS.paramHasAttr(attrInd, Attribute::ZExt);
- Entry.isInReg = CS.paramHasAttr(attrInd, Attribute::InReg);
- Entry.isSRet = CS.paramHasAttr(attrInd, Attribute::StructRet);
- Entry.isNest = CS.paramHasAttr(attrInd, Attribute::Nest);
- Entry.isByVal = CS.paramHasAttr(attrInd, Attribute::ByVal);
- Entry.isReturned = CS.paramHasAttr(attrInd, Attribute::Returned);
- Entry.Alignment = CS.getParamAlignment(attrInd);
+ // Skip the first return-type Attribute to get to params.
+ Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
Args.push_back(Entry);
}
@@ -5364,8 +5483,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
}
if (!Result.second.getNode()) {
- // As a special case, a null chain means that a tail call has been emitted and
- // the DAG root is already updated.
+ // As a special case, a null chain means that a tail call has been emitted
+ // and the DAG root is already updated.
HasTailCall = true;
// Since there's no actual continuation from this block, nothing can be
@@ -5445,6 +5564,18 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
return LoadVal;
}
+/// processIntegerCallValue - Record the value for an instruction that
+/// produces an integer result, converting the type where necessary.
+void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
+ SDValue Value,
+ bool IsSigned) {
+ EVT VT = TM.getTargetLowering()->getValueType(I.getType(), true);
+ if (IsSigned)
+ Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
+ else
+ Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
+ setValue(&I, Value);
+}
/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
/// If so, return true and lower it, otherwise return false and it will be
@@ -5460,15 +5591,33 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
!I.getType()->isIntegerTy())
return false;
- const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+ const Value *Size = I.getArgOperand(2);
+ const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
+ if (CSize && CSize->getZExtValue() == 0) {
+ EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
+ setValue(&I, DAG.getConstant(0, CallVT));
+ return true;
+ }
+
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(LHS), getValue(RHS), getValue(Size),
+ MachinePointerInfo(LHS),
+ MachinePointerInfo(RHS));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, true);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
// memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
// memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
- if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+ if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
bool ActuallyDoIt = true;
MVT LoadVT;
Type *LoadTy;
- switch (Size->getZExtValue()) {
+ switch (CSize->getZExtValue()) {
default:
LoadVT = MVT::Other;
LoadTy = 0;
@@ -5476,20 +5625,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
break;
case 2:
LoadVT = MVT::i16;
- LoadTy = Type::getInt16Ty(Size->getContext());
+ LoadTy = Type::getInt16Ty(CSize->getContext());
break;
case 4:
LoadVT = MVT::i32;
- LoadTy = Type::getInt32Ty(Size->getContext());
+ LoadTy = Type::getInt32Ty(CSize->getContext());
break;
case 8:
LoadVT = MVT::i64;
- LoadTy = Type::getInt64Ty(Size->getContext());
+ LoadTy = Type::getInt64Ty(CSize->getContext());
break;
/*
case 16:
LoadVT = MVT::v4i32;
- LoadTy = Type::getInt32Ty(Size->getContext());
+ LoadTy = Type::getInt32Ty(CSize->getContext());
LoadTy = VectorType::get(LoadTy, 4);
break;
*/
@@ -5503,7 +5652,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
// supports unaligned loads of that type. Expanding into byte loads would
// bloat the code.
const TargetLowering *TLI = TM.getTargetLowering();
- if (ActuallyDoIt && Size->getZExtValue() > 4) {
+ if (ActuallyDoIt && CSize->getZExtValue() > 4) {
// TODO: Handle 5 byte compare as 4-byte + 1 byte.
// TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
if (!TLI->isTypeLegal(LoadVT) ||!TLI->allowsUnalignedMemoryAccesses(LoadVT))
@@ -5516,8 +5665,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
ISD::SETNE);
- EVT CallVT = TLI->getValueType(I.getType(), true);
- setValue(&I, DAG.getZExtOrTrunc(Res, getCurSDLoc(), CallVT));
+ processIntegerCallValue(I, Res, false);
return true;
}
}
@@ -5526,6 +5674,148 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
return false;
}
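
A hand-written model, not from the patch, of the constant-size fast path in visitMemCmpCall: when the length is a small known constant and the result only feeds an ==/!= 0 test, one fixed-width load per operand plus a single compare replaces the libc call. The function name is illustrative, and memcpy stands in for the possibly unaligned i32 load the DAG emits:

#include <cstring>

typedef unsigned int u32;                      // stands in for MVT::i32

static bool memcmp4IsNonZero(const void *LHS, const void *RHS) {
  u32 A, B;
  std::memcpy(&A, LHS, sizeof A);              // load *(i32*)LHS
  std::memcpy(&B, RHS, sizeof B);              // load *(i32*)RHS
  return A != B;                               // the SETNE the builder emits
}
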
+/// visitMemChrCall -- See if we can lower a memchr call into an optimized
+/// form. If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
+ // Verify that the prototype makes sense. void *memchr(void *, int, size_t)
+ if (I.getNumArgOperands() != 3)
+ return false;
+
+ const Value *Src = I.getArgOperand(0);
+ const Value *Char = I.getArgOperand(1);
+ const Value *Length = I.getArgOperand(2);
+ if (!Src->getType()->isPointerTy() ||
+ !Char->getType()->isIntegerTy() ||
+ !Length->getType()->isIntegerTy() ||
+ !I.getType()->isPointerTy())
+ return false;
+
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Src), getValue(Char), getValue(Length),
+ MachinePointerInfo(Src));
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
+/// optimized form. If so, return true and lower it, otherwise return false
+/// and it will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
+ // Verify that the prototype makes sense. char *strcpy(char *, char *)
+ if (I.getNumArgOperands() != 2)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+ if (!Arg0->getType()->isPointerTy() ||
+ !Arg1->getType()->isPointerTy() ||
+ !I.getType()->isPointerTy())
+ return false;
+
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1), isStpcpy);
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ DAG.setRoot(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
+/// If so, return true and lower it, otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
+ // Verify that the prototype makes sense. int strcmp(void*,void*)
+ if (I.getNumArgOperands() != 2)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+ if (!Arg0->getType()->isPointerTy() ||
+ !Arg1->getType()->isPointerTy() ||
+ !I.getType()->isIntegerTy())
+ return false;
+
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, true);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrLenCall -- See if we can lower a strlen call into an optimized
+/// form. If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
+ // Verify that the prototype makes sense. size_t strlen(char *)
+ if (I.getNumArgOperands() != 1)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0);
+ if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
+ return false;
+
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), MachinePointerInfo(Arg0));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, false);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
+/// form. If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
+ // Verify that the prototype makes sense. size_t strnlen(char *, size_t)
+ if (I.getNumArgOperands() != 2)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+ if (!Arg0->getType()->isPointerTy() ||
+ !Arg1->getType()->isIntegerTy() ||
+ !I.getType()->isIntegerTy())
+ return false;
+
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, false);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
/// visitUnaryFloatCall - If a call instruction is a unary floating-point
/// operation (as expected), translate it to an SDNode with the specified opcode
/// and return true.
@@ -5644,6 +5934,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
if (visitUnaryFloatCall(I, ISD::FRINT))
return;
break;
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ if (visitUnaryFloatCall(I, ISD::FROUND))
+ return;
+ break;
case LibFunc::trunc:
case LibFunc::truncf:
case LibFunc::truncl:
@@ -5666,6 +5962,30 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
if (visitMemCmpCall(I))
return;
break;
+ case LibFunc::memchr:
+ if (visitMemChrCall(I))
+ return;
+ break;
+ case LibFunc::strcpy:
+ if (visitStrCpyCall(I, false))
+ return;
+ break;
+ case LibFunc::stpcpy:
+ if (visitStrCpyCall(I, true))
+ return;
+ break;
+ case LibFunc::strcmp:
+ if (visitStrCmpCall(I))
+ return;
+ break;
+ case LibFunc::strlen:
+ if (visitStrLenCall(I))
+ return;
+ break;
+ case LibFunc::strnlen:
+ if (visitStrNLenCall(I))
+ return;
+ break;
}
}
}
@@ -6421,6 +6741,248 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
DAG.getSrcValue(I.getArgOperand(1))));
}
+/// \brief Lower an argument list according to the target calling convention.
+///
+/// \return A tuple of <return-value, token-chain>
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+std::pair<SDValue, SDValue>
+SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx,
+ unsigned NumArgs, SDValue Callee,
+ bool useVoidTy) {
+ TargetLowering::ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ // Attributes for args start at offset 1, after the return attribute.
+ ImmutableCallSite CS(&CI);
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+ ArgI != ArgE; ++ArgI) {
+ const Value *V = CI.getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = getValue(V);
+ Entry.Ty = V->getType();
+ Entry.setAttributes(&CS, AttrI);
+ Args.push_back(Entry);
+ }
+
+ Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType();
+ TargetLowering::CallLoweringInfo CLI(getRoot(), retTy, /*retSExt*/ false,
+ /*retZExt*/ false, /*isVarArg*/ false, /*isInReg*/ false, NumArgs,
+ CI.getCallingConv(), /*isTailCall*/ false, /*doesNotReturn*/ false,
+ /*isReturnValueUsed*/ CI.use_empty(), Callee, Args, DAG, getCurSDLoc());
+
+ const TargetLowering *TLI = TM.getTargetLowering();
+ return TLI->LowerCallTo(CLI);
+}
+
+/// \brief Lower llvm.experimental.stackmap directly to its target opcode.
+void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
+ // void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
+ // [live variables...])
+
+ assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");
+
+ SDValue Callee = getValue(CI.getCalledValue());
+
+ // Lower into a call sequence with no args and no return value.
+ std::pair<SDValue, SDValue> Result = LowerCallOperands(CI, 0, 0, Callee);
+ // Set the root to the target-lowered call chain.
+ SDValue Chain = Result.second;
+ DAG.setRoot(Chain);
+
+ /// Get a call instruction from the call sequence chain.
+ /// Tail calls are not allowed.
+ SDNode *CallEnd = Chain.getNode();
+ assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+ "Expected a callseq node.");
+ SDNode *Call = CallEnd->getOperand(0).getNode();
+ bool hasGlue = Call->getGluedNode();
+
+ // Replace the target specific call node with the stackmap intrinsic.
+ SmallVector<SDValue, 8> Ops;
+
+ // Add the <id> and <numShadowBytes> constants.
+ for (unsigned i = 0; i < 2; ++i) {
+ SDValue tmp = getValue(CI.getOperand(i));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(tmp)->getZExtValue(), MVT::i32));
+ }
+ // Push live variables for the stack map.
+ for (unsigned i = 2, e = CI.getNumArgOperands(); i != e; ++i)
+ Ops.push_back(getValue(CI.getArgOperand(i)));
+
+ // Push the chain (this is originally the first operand of the call, but
+ // now becomes the last or second-to-last operand).
+ Ops.push_back(*(Call->op_begin()));
+
+ // Push the glue flag (last operand).
+ if (hasGlue)
+ Ops.push_back(*(Call->op_end()-1));
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // Replace the target specific call node with a STACKMAP node.
+ MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::STACKMAP, getCurSDLoc(),
+ NodeTys, Ops);
+
+ // StackMap generates no value, so nothing goes in the NodeMap.
+
+ // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+ // call sequence.
+ DAG.ReplaceAllUsesWith(Call, MN);
+
+ DAG.DeleteNode(Call);
+}
+
+/// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
+void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) {
+ // void|i64 @llvm.experimental.patchpoint.void|i64(i32 <id>,
+ // i32 <numBytes>,
+ // i8* <target>,
+ // i32 <numArgs>,
+ // [Args...],
+ // [live variables...])
+
+ CallingConv::ID CC = CI.getCallingConv();
+ bool isAnyRegCC = CC == CallingConv::AnyReg;
+ bool hasDef = !CI.getType()->isVoidTy();
+ SDValue Callee = getValue(CI.getOperand(2)); // <target>
+
+ // Get the real number of arguments participating in the call <numArgs>
+ unsigned NumArgs =
+ cast<ConstantSDNode>(getValue(CI.getArgOperand(3)))->getZExtValue();
+
+ // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+ assert(CI.getNumArgOperands() >= NumArgs + 4 &&
+ "Not enough arguments provided to the patchpoint intrinsic");
+
+ // For AnyRegCC the arguments are lowered later on manually.
+ unsigned NumCallArgs = isAnyRegCC ? 0 : NumArgs;
+ std::pair<SDValue, SDValue> Result =
+ LowerCallOperands(CI, 4, NumCallArgs, Callee, isAnyRegCC);
+
+ // Set the root to the target-lowered call chain.
+ SDValue Chain = Result.second;
+ DAG.setRoot(Chain);
+
+ SDNode *CallEnd = Chain.getNode();
+ if (hasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
+ CallEnd = CallEnd->getOperand(0).getNode();
+
+ /// Get a call instruction from the call sequence chain.
+ /// Tail calls are not allowed.
+ assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+ "Expected a callseq node.");
+ SDNode *Call = CallEnd->getOperand(0).getNode();
+ bool hasGlue = Call->getGluedNode();
+
+ // Replace the target specific call node with the patchable intrinsic.
+ SmallVector<SDValue, 8> Ops;
+
+ // Add the <id> and <numNopBytes> constants.
+ for (unsigned i = 0; i < 2; ++i) {
+ SDValue tmp = getValue(CI.getOperand(i));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(tmp)->getZExtValue(), MVT::i32));
+ }
+ // Assume that the Callee is a constant address.
+ Ops.push_back(
+ DAG.getIntPtrConstant(cast<ConstantSDNode>(Callee)->getZExtValue(),
+ /*isTarget=*/true));
+
+ // Adjust <numArgs> to account for any arguments that have been passed on the
+ // stack instead.
+ // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+ unsigned NumCallRegArgs = Call->getNumOperands() - (hasGlue ? 4 : 3);
+ NumCallRegArgs = isAnyRegCC ? NumArgs : NumCallRegArgs;
+ Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32));
+
+ // Add the calling convention
+ Ops.push_back(DAG.getTargetConstant((unsigned)CC, MVT::i32));
+
+ // Add the arguments we omitted previously. The register allocator should
+ // place these in any free register.
+ if (isAnyRegCC)
+ for (unsigned i = 4, e = NumArgs + 4; i != e; ++i)
+ Ops.push_back(getValue(CI.getArgOperand(i)));
+
+ // Push the arguments from the call instruction.
+ SDNode::op_iterator e = hasGlue ? Call->op_end()-2 : Call->op_end()-1;
+ for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i)
+ Ops.push_back(*i);
+
+ // Push live variables for the stack map.
+ for (unsigned i = NumArgs + 4, e = CI.getNumArgOperands(); i != e; ++i) {
+ SDValue OpVal = getValue(CI.getArgOperand(i));
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
+ Ops.push_back(
+ DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+ Ops.push_back(
+ DAG.getTargetConstant(C->getSExtValue(), MVT::i64));
+ } else
+ Ops.push_back(OpVal);
+ }
+
+ // Push the register mask info.
+ if (hasGlue)
+ Ops.push_back(*(Call->op_end()-2));
+ else
+ Ops.push_back(*(Call->op_end()-1));
+
+ // Push the chain (this is originally the first operand of the call, but
+ // now becomes the last or second-to-last operand).
+ Ops.push_back(*(Call->op_begin()));
+
+ // Push the glue flag (last operand).
+ if (hasGlue)
+ Ops.push_back(*(Call->op_end()-1));
+
+ SDVTList NodeTys;
+ if (isAnyRegCC && hasDef) {
+ // Create the return types based on the intrinsic definition
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 3> ValueVTs;
+ ComputeValueVTs(TLI, CI.getType(), ValueVTs);
+ assert(ValueVTs.size() == 1 && "Expected only one return value type.");
+
+ // There is always a chain and a glue type at the end
+ ValueVTs.push_back(MVT::Other);
+ ValueVTs.push_back(MVT::Glue);
+ NodeTys = DAG.getVTList(ValueVTs.data(), ValueVTs.size());
+ } else
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // Replace the target specific call node with a PATCHPOINT node.
+ MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
+ getCurSDLoc(), NodeTys, Ops);
+
+ // Update the NodeMap.
+ if (hasDef) {
+ if (isAnyRegCC)
+ setValue(&CI, SDValue(MN, 0));
+ else
+ setValue(&CI, Result.first);
+ }
+
+ // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+ // call sequence. Furthermore the location of the chain and glue can change
+ // when the AnyReg calling convention is used and the intrinsic returns a
+ // value.
+ if (isAnyRegCC && hasDef) {
+ SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
+ SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ } else
+ DAG.ReplaceAllUsesWith(Call, MN);
+ DAG.DeleteNode(Call);
+}
+
/// TargetLowering::LowerCallTo - This is the default LowerCallTo
/// implementation, which just calls LowerCall.
/// FIXME: When all targets are
@@ -6438,6 +7000,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
for (unsigned i = 0; i != NumRegs; ++i) {
ISD::InputArg MyFlags;
MyFlags.VT = RegisterVT;
+ MyFlags.ArgVT = VT;
MyFlags.Used = CLI.IsReturnValueUsed;
if (CLI.RetSExt)
MyFlags.Flags.setSExt();
@@ -6527,7 +7090,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
for (unsigned j = 0; j != NumParts; ++j) {
// if it isn't first piece, alignment must be 1
- ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
+ ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
i < CLI.NumFixedArgs,
i, j*Parts[j].getValueType().getStoreSize());
if (NumParts > 1 && j == 0)
@@ -6666,7 +7229,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
ISD::ArgFlagsTy Flags;
Flags.setSRet();
MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
- ISD::InputArg RetArg(Flags, RegisterVT, true, 0, 0);
+ ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0);
Ins.push_back(RetArg);
}
@@ -6677,6 +7240,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*TLI, I->getType(), ValueVTs);
bool isArgValueUsed = !I->use_empty();
+ unsigned PartBase = 0;
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
EVT VT = ValueVTs[Value];
@@ -6714,8 +7278,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
- ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed,
- Idx-1, i*RegisterVT.getStoreSize());
+ ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
+ Idx-1, PartBase+i*RegisterVT.getStoreSize());
if (NumRegs > 1 && i == 0)
MyFlags.Flags.setSplit();
// if it isn't first piece, alignment must be 1
@@ -6723,6 +7287,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
MyFlags.Flags.setOrigAlign(1);
Ins.push_back(MyFlags);
}
+ PartBase += VT.getStoreSize();
}
}
@@ -6940,3 +7505,22 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
ConstantsOut.clear();
}
+
+/// Add a successor MBB to ParentMBB, creating a new MachineBB for BB if SuccMBB
+/// is 0.
+MachineBasicBlock *
+SelectionDAGBuilder::StackProtectorDescriptor::
+AddSuccessorMBB(const BasicBlock *BB,
+ MachineBasicBlock *ParentMBB,
+ MachineBasicBlock *SuccMBB) {
+ // If SuccBB has not been created yet, create it.
+ if (!SuccMBB) {
+ MachineFunction *MF = ParentMBB->getParent();
+ MachineFunction::iterator BBI = ParentMBB;
+ SuccMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(++BBI, SuccMBB);
+ }
+ // Add it as a successor of ParentMBB.
+ ParentMBB->addSuccessor(SuccMBB);
+ return SuccMBB;
+}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ef73c00..835f643 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -1,4 +1,4 @@
-//===-- SelectionDAGBuilder.h - Selection-DAG building --------*- c++ -*---===//
+//===-- SelectionDAGBuilder.h - Selection-DAG building --------*- C++ -*---===//
//
// The LLVM Compiler Infrastructure
//
@@ -26,6 +26,7 @@
namespace llvm {
+class AddrSpaceCastInst;
class AliasAnalysis;
class AllocaInst;
class BasicBlock;
@@ -84,7 +85,7 @@ class SelectionDAGBuilder {
const Instruction *CurInst;
DenseMap<const Value*, SDValue> NodeMap;
-
+
/// UnusedArgNodeMap - Maps argument value for unused arguments. This is used
/// to preserve debug information for incoming arguments.
DenseMap<const Value*, SDValue> UnusedArgNodeMap;
@@ -182,6 +183,17 @@ private:
typedef std::vector<CaseRec> CaseRecVector;
+ /// The comparison function for sorting the switch case values in the vector.
+ /// WARNING: Case ranges should be disjoint!
+ struct CaseCmp {
+ bool operator()(const Case &C1, const Case &C2) {
+ assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High));
+ const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+ const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+ return CI1->getValue().slt(CI2->getValue());
+ }
+ };
+
struct CaseBitsCmp {
bool operator()(const CaseBits &C1, const CaseBits &C2) {
return C1.Bits > C2.Bits;
@@ -224,7 +236,7 @@ private:
struct JumpTable {
JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
-
+
/// Reg - the virtual register containing the index of the jump table entry
/// to jump to.
unsigned Reg;
@@ -278,6 +290,201 @@ private:
BitTestInfo Cases;
};
+ /// A class which encapsulates all of the information needed to generate a
+ /// stack protector check and signals to isel via its state being initialized
+ /// that a stack protector needs to be generated.
+ ///
+ /// *NOTE* The following is a high level documentation of SelectionDAG Stack
+ /// Protector Generation. The reason that it is placed here is for a lack of
+ /// other good places to stick it.
+ ///
+ /// High Level Overview of SelectionDAG Stack Protector Generation:
+ ///
+ /// Previously, generation of stack protectors was done exclusively in the
+ /// pre-SelectionDAG Codegen LLVM IR Pass "Stack Protector". This necessitated
+ /// splitting basic blocks at the IR level to create the success/failure basic
+ /// blocks in the tail of the basic block in question. As a result of this,
+ /// calls that would have qualified for the sibling call optimization were no
+ /// longer eligible for optimization since said calls were no longer right in
+ /// the "tail position" (i.e. the immediate predecessor of a ReturnInst
+ /// instruction).
+ ///
+ /// Then it was noticed that since the sibling call optimization causes the
+ /// callee to reuse the caller's stack, if we could delay the generation of
+ /// the stack protector check until later in CodeGen after the sibling call
+ /// decision was made, we get both the tail call optimization and the stack
+ /// protector check!
+ ///
+ /// A few goals in solving this problem were:
+ ///
+ /// 1. Preserve the architecture independence of stack protector generation.
+ ///
+ /// 2. Preserve the normal IR level stack protector check for platforms like
+ /// OpenBSD for which we support platform specific stack protector
+ /// generation.
+ ///
+ /// The main problem that guided the present solution is that one can not
+ /// solve this problem in an architecture independent manner at the IR level
+ /// only. This is because:
+ ///
+ /// 1. The decision on whether or not to perform a sibling call on certain
+ /// platforms (for instance i386) requires lower level information
+ /// related to available registers that can not be known at the IR level.
+ ///
+ /// 2. Even if the previous point were not true, the decision on whether to
+ /// perform a tail call is done in LowerCallTo in SelectionDAG which
+ /// occurs after the Stack Protector Pass. As a result, one would need to
+ /// put the relevant callinst into the stack protector check success
+ /// basic block (where the return inst is placed) and then move it back
+ /// later at SelectionDAG/MI time before the stack protector check if the
+ /// tail call optimization failed. The MI level option was nixed
+ /// immediately since it would require platform specific pattern
+ /// matching. The SelectionDAG level option was nixed because
+ /// SelectionDAG only processes one IR level basic block at a time
+ /// implying one could not create a DAG Combine to move the callinst.
+ ///
+ /// To get around this problem a few things were realized:
+ ///
+ /// 1. While one can not handle multiple IR level basic blocks at the
+ /// SelectionDAG Level, one can generate multiple machine basic blocks
+ /// for one IR level basic block. This is how we handle bit tests and
+ /// switches.
+ ///
+ /// 2. At the MI level, tail calls are represented via a special return
+ /// MIInst called "tcreturn". Thus if we know the basic block in which we
+ /// wish to insert the stack protector check, we get the correct behavior
+ /// by always inserting the stack protector check right before the return
+ /// statement. This is a "magical transformation" since no matter where
+ /// the stack protector check intrinsic is, we always insert the stack
+ /// protector check code at the end of the BB.
+ ///
+ /// Given the aforementioned constraints, the following solution was devised:
+ ///
+ /// 1. On platforms that do not support SelectionDAG stack protector check
+ /// generation, allow for the normal IR level stack protector check
+ /// generation to continue.
+ ///
+ /// 2. On platforms that do support SelectionDAG stack protector check
+ /// generation:
+ ///
+ /// a. Use the IR level stack protector pass to decide if a stack
+ /// protector is required/which BB we insert the stack protector check
+ /// in by reusing the logic already therein. If we wish to generate a
+ /// stack protector check in a basic block, we place a special IR
+ /// intrinsic called llvm.stackprotectorcheck right before the BB's
+ /// returninst or if there is a callinst that could potentially be
+ /// sibling call optimized, before the call inst.
+ ///
+ /// b. Then when a BB with said intrinsic is processed, we codegen the BB
+ /// normally via SelectBasicBlock. In said process, when we visit the
+ /// stack protector check, we do not actually emit anything into the
+ /// BB. Instead, we just initialize the stack protector descriptor
+ /// class (which involves stashing information/creating the success
+ /// mbb and the failure mbb if we have not created one for this
+ /// function yet) and export the guard variable that we are going to
+ /// compare.
+ ///
+ /// c. After we finish selecting the basic block, in FinishBasicBlock if
+ /// the StackProtectorDescriptor attached to the SelectionDAGBuilder is
+ /// initialized, we first find a splice point in the parent basic block
+ /// before the terminator and then splice the terminator of said basic
+ /// block into the success basic block. Then we code-gen a new tail for
+ /// the parent basic block consisting of the two loads, the comparison,
+ /// and finally two branches to the success/failure basic blocks. We
+ /// conclude by code-gening the failure basic block if we have not
+ /// code-gened it already (all stack protector checks we generate in
+ /// the same function use the same failure basic block).
+ class StackProtectorDescriptor {
+ public:
+ StackProtectorDescriptor() : ParentMBB(0), SuccessMBB(0), FailureMBB(0),
+ Guard(0) { }
+ ~StackProtectorDescriptor() { }
+
+ /// Returns true if all fields of the stack protector descriptor are
+ /// initialized implying that we should/are ready to emit a stack protector.
+ bool shouldEmitStackProtector() const {
+ return ParentMBB && SuccessMBB && FailureMBB && Guard;
+ }
+
+ /// Initialize the stack protector descriptor structure for a new basic
+ /// block.
+ void initialize(const BasicBlock *BB,
+ MachineBasicBlock *MBB,
+ const CallInst &StackProtCheckCall) {
+ // Make sure we are not initialized yet.
+ assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
+ "already initialized!");
+ ParentMBB = MBB;
+ SuccessMBB = AddSuccessorMBB(BB, MBB);
+ FailureMBB = AddSuccessorMBB(BB, MBB, FailureMBB);
+ if (!Guard)
+ Guard = StackProtCheckCall.getArgOperand(0);
+ }
+
+ /// Reset state that changes when we handle different basic blocks.
+ ///
+ /// This currently includes:
+ ///
+ /// 1. The specific basic block we are generating a
+ /// stack protector for (ParentMBB).
+ ///
+ /// 2. The successor machine basic block that will contain the tail of
+ /// parent mbb after we create the stack protector check (SuccessMBB). This
+ /// BB is visited only on stack protector check success.
+ void resetPerBBState() {
+ ParentMBB = 0;
+ SuccessMBB = 0;
+ }
+
+ /// Reset state that only changes when we switch functions.
+ ///
+ /// This currently includes:
+ ///
+ /// 1. FailureMBB since we reuse the failure code path for all stack
+ /// protector checks created in an individual function.
+ ///
+ /// 2. The guard variable since the guard variable we are checking against is
+ /// always the same.
+ void resetPerFunctionState() {
+ FailureMBB = 0;
+ Guard = 0;
+ }
+
+ MachineBasicBlock *getParentMBB() { return ParentMBB; }
+ MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
+ MachineBasicBlock *getFailureMBB() { return FailureMBB; }
+ const Value *getGuard() { return Guard; }
+
+ private:
+ /// The basic block for which we are generating the stack protector.
+ ///
+ /// As a result of stack protector generation, we will splice the
+ /// terminators of this basic block into the successor mbb SuccessMBB and
+ /// replace it with a compare/branch to the successor mbbs
+ /// SuccessMBB/FailureMBB depending on whether or not the stack protector
+ /// was violated.
+ MachineBasicBlock *ParentMBB;
+
+ /// A basic block visited on stack protector check success that contains the
+ /// terminators of ParentMBB.
+ MachineBasicBlock *SuccessMBB;
+
+ /// A basic block visited on stack protector check failure that will
+ /// contain a call to __stack_chk_fail().
+ MachineBasicBlock *FailureMBB;
+
+ /// The guard variable which we will compare against the stored value in the
+ /// stack protector stack slot.
+ const Value *Guard;
+
+ /// Add a successor machine basic block to ParentMBB. If the successor mbb
+ /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
+ /// block will be created.
+ MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
+ MachineBasicBlock *ParentMBB,
+ MachineBasicBlock *SuccMBB = 0);
+ };
+
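
For reviewers who want the lifecycle above in a compact form, here is a minimal standalone sketch; it is not part of this patch and not LLVM code (ToySPDescriptor and the plain int pointers standing in for the MBBs and the guard value are invented for illustration). It only demonstrates the ready/per-BB/per-function state transitions documented above:

    #include <cassert>

    struct ToySPDescriptor {
      int *Parent, *Success, *Failure, *Guard;
      bool shouldEmit() const { return Parent && Success && Failure && Guard; }
      void resetPerBB()       { Parent = nullptr; Success = nullptr; }
      void resetPerFunction() { Failure = nullptr; Guard = nullptr; }
    };

    int main() {
      int P1, S1, P2, S2, F, G;
      ToySPDescriptor D = {nullptr, nullptr, nullptr, nullptr};

      // First protected block in a function: all four pieces are set up.
      D = {&P1, &S1, &F, &G};
      assert(D.shouldEmit());

      D.resetPerBB();                              // after emitting the check
      assert(!D.shouldEmit());
      assert(D.Failure == &F && D.Guard == &G);    // failure block and guard persist

      // A later protected block in the same function reuses them.
      D.Parent = &P2;
      D.Success = &S2;
      assert(D.shouldEmit() && D.Failure == &F);

      D.resetPerBB();
      D.resetPerFunction();                        // only between functions
      assert(!D.Failure && !D.Guard);
      return 0;
    }
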
private:
const TargetMachine &TM;
public:
@@ -295,6 +502,9 @@ public:
/// BitTestCases - Vector of BitTestBlock structures used to communicate
/// SwitchInst code generation information.
std::vector<BitTestBlock> BitTestCases;
+ /// A StackProtectorDescriptor structure used to communicate stack protector
+ /// information in between SelectBasicBlock and FinishBasicBlock.
+ StackProtectorDescriptor SPDescriptor;
// Emit PHI-node-operand constants only once even if used by multiple
// PHI nodes.
@@ -305,9 +515,9 @@ public:
FunctionLoweringInfo &FuncInfo;
/// OptLevel - What optimization level we're generating code for.
- ///
+ ///
CodeGenOpt::Level OptLevel;
-
+
/// GFI - Garbage collection metadata for the function.
GCFunctionInfo *GFI;
@@ -389,7 +599,7 @@ public:
assert(N.getNode() == 0 && "Already set a value for this node!");
N = NewN;
}
-
+
void setUnusedArgValue(const Value *V, SDValue NewN) {
SDValue &N = UnusedArgNodeMap[V];
assert(N.getNode() == 0 && "Already set a value for this node!");
@@ -410,6 +620,12 @@ public:
void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
MachineBasicBlock *LandingPad = NULL);
+ std::pair<SDValue, SDValue> LowerCallOperands(const CallInst &CI,
+ unsigned ArgIdx,
+ unsigned NumArgs,
+ SDValue Callee,
+ bool useVoidTy = false);
+
/// UpdateSplitBlock - When an MBB was split during scheduling, update the
/// references that need to refer to the last resulting block.
void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
@@ -451,6 +667,9 @@ private:
public:
void visitSwitchCase(CaseBlock &CB,
MachineBasicBlock *SwitchBB);
+ void visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB);
+ void visitSPDescriptorFailure(StackProtectorDescriptor &SPD);
void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
void visitBitTestCase(BitTestBlock &BB,
MachineBasicBlock* NextMBB,
@@ -461,7 +680,7 @@ public:
void visitJumpTable(JumpTable &JT);
void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
MachineBasicBlock *SwitchBB);
-
+
private:
// These all get lowered before this pass.
void visitInvoke(const InvokeInst &I);
@@ -502,6 +721,7 @@ private:
void visitPtrToInt(const User &I);
void visitIntToPtr(const User &I);
void visitBitCast(const User &I);
+ void visitAddrSpaceCast(const User &I);
void visitExtractElement(const User &I);
void visitInsertElement(const User &I);
@@ -523,6 +743,11 @@ private:
void visitPHI(const PHINode &I);
void visitCall(const CallInst &I);
bool visitMemCmpCall(const CallInst &I);
+ bool visitMemChrCall(const CallInst &I);
+ bool visitStrCpyCall(const CallInst &I, bool isStpcpy);
+ bool visitStrCmpCall(const CallInst &I);
+ bool visitStrLenCall(const CallInst &I);
+ bool visitStrNLenCall(const CallInst &I);
bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
void visitAtomicLoad(const LoadInst &I);
void visitAtomicStore(const StoreInst &I);
@@ -535,6 +760,8 @@ private:
void visitVAArg(const VAArgInst &I);
void visitVAEnd(const CallInst &I);
void visitVACopy(const CallInst &I);
+ void visitStackmap(const CallInst &I);
+ void visitPatchpoint(const CallInst &I);
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
@@ -543,10 +770,13 @@ private:
llvm_unreachable("UserOp2 should not exist at instruction selection time!");
}
+ void processIntegerCallValue(const Instruction &I,
+ SDValue Value, bool IsSigned);
+
void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
/// EmitFuncArgumentDbgValue - If V is a function argument then create
- /// corresponding DBG_VALUE machine instruction for it now. At the end of
+ /// corresponding DBG_VALUE machine instruction for it now. At the end of
/// instruction selection, they will be inserted to the entry BB.
bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
int64_t Offset, const SDValue &N);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index d8ee221..c04a08d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -142,6 +142,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FCEIL: return "fceil";
case ISD::FRINT: return "frint";
case ISD::FNEARBYINT: return "fnearbyint";
+ case ISD::FROUND: return "fround";
case ISD::FEXP: return "fexp";
case ISD::FEXP2: return "fexp2";
case ISD::FLOG: return "flog";
@@ -223,6 +224,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FP_TO_SINT: return "fp_to_sint";
case ISD::FP_TO_UINT: return "fp_to_uint";
case ISD::BITCAST: return "bitcast";
+ case ISD::ADDRSPACECAST: return "addrspacecast";
case ISD::FP16_TO_FP32: return "fp16_to_fp32";
case ISD::FP32_TO_FP16: return "fp32_to_fp16";
@@ -484,6 +486,13 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << " " << offset;
if (unsigned int TF = BA->getTargetFlags())
OS << " [TF=" << TF << ']';
+ } else if (const AddrSpaceCastSDNode *ASC =
+ dyn_cast<AddrSpaceCastSDNode>(this)) {
+ OS << '['
+ << ASC->getSrcAddressSpace()
+ << " -> "
+ << ASC->getDestAddressSpace()
+ << ']';
}
if (unsigned Order = getIROrder())
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 01da51c..3a0cfa1 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -223,6 +223,44 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
namespace llvm {
//===--------------------------------------------------------------------===//
+ /// \brief This class is used by SelectionDAGISel to temporarily override
+ /// the optimization level on a per-function basis.
+ class OptLevelChanger {
+ SelectionDAGISel &IS;
+ CodeGenOpt::Level SavedOptLevel;
+ bool SavedFastISel;
+
+ public:
+ OptLevelChanger(SelectionDAGISel &ISel,
+ CodeGenOpt::Level NewOptLevel) : IS(ISel) {
+ SavedOptLevel = IS.OptLevel;
+ if (NewOptLevel == SavedOptLevel)
+ return;
+ IS.OptLevel = NewOptLevel;
+ IS.TM.setOptLevel(NewOptLevel);
+ SavedFastISel = IS.TM.Options.EnableFastISel;
+ if (NewOptLevel == CodeGenOpt::None)
+ IS.TM.setFastISel(true);
+ DEBUG(dbgs() << "\nChanging optimization level for Function "
+ << IS.MF->getFunction()->getName() << "\n");
+ DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
+ << " ; After: -O" << NewOptLevel << "\n");
+ }
+
+ ~OptLevelChanger() {
+ if (IS.OptLevel == SavedOptLevel)
+ return;
+ DEBUG(dbgs() << "\nRestoring optimization level for Function "
+ << IS.MF->getFunction()->getName() << "\n");
+ DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel
+ << " ; After: -O" << SavedOptLevel << "\n");
+ IS.OptLevel = SavedOptLevel;
+ IS.TM.setOptLevel(SavedOptLevel);
+ IS.TM.setFastISel(SavedFastISel);
+ }
+ };
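
The class above is a scope guard: construction overrides the optimization level (and fast-isel setting), destruction restores both, so every return path out of the enclosing scope restores the saved state; the patch instantiates it as a local in runOnMachineFunction (see the hunk below). As a rough standalone illustration of the same pattern, not LLVM code (ScopedOptLevel and the global OptLevel are invented names):

    #include <cassert>

    static int OptLevel = 2;               // stands in for IS.OptLevel / TM options

    class ScopedOptLevel {
      int Saved;
    public:
      explicit ScopedOptLevel(int NewLevel) : Saved(OptLevel) { OptLevel = NewLevel; }
      ~ScopedOptLevel() { OptLevel = Saved; }
    };

    int main() {
      {
        ScopedOptLevel Guard(0);           // e.g. for an optnone function
        assert(OptLevel == 0);
      }                                    // destructor restores the previous level
      assert(OptLevel == 2);
      return 0;
    }
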
+
+ //===--------------------------------------------------------------------===//
/// createDefaultScheduler - This creates an instruction scheduler appropriate
/// for the target.
ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
@@ -230,7 +268,7 @@ namespace llvm {
const TargetLowering *TLI = IS->getTargetLowering();
const TargetSubtargetInfo &ST = IS->TM.getSubtarget<TargetSubtargetInfo>();
- if (OptLevel == CodeGenOpt::None || ST.enableMachineScheduler() ||
+ if (OptLevel == CodeGenOpt::None || ST.useMachineScheduler() ||
TLI->getSchedulingPreference() == Sched::Source)
return createSourceListDAGScheduler(IS, OptLevel);
if (TLI->getSchedulingPreference() == Sched::RegPressure)
@@ -356,6 +394,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
const Function &Fn = *mf.getFunction();
const TargetInstrInfo &TII = *TM.getInstrInfo();
const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ const TargetLowering *TLI = TM.getTargetLowering();
MF = &mf;
RegInfo = &MF->getRegInfo();
@@ -369,11 +408,17 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
ST.resetSubtargetFeatures(MF);
TM.resetTargetOptions(MF);
+ // Reset OptLevel to None for optnone functions.
+ CodeGenOpt::Level NewOptLevel = OptLevel;
+ if (Fn.hasFnAttribute(Attribute::OptimizeNone))
+ NewOptLevel = CodeGenOpt::None;
+ OptLevelChanger OLC(*this, NewOptLevel);
+
DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
- CurDAG->init(*MF, TTI);
+ CurDAG->init(*MF, TTI, TLI);
FuncInfo->set(Fn, *MF);
if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -408,9 +453,13 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
EntryMBB->insert(EntryMBB->begin(), MI);
else {
MachineInstr *Def = RegInfo->getVRegDef(Reg);
- MachineBasicBlock::iterator InsertPos = Def;
- // FIXME: VR def may not be in entry block.
- Def->getParent()->insert(llvm::next(InsertPos), MI);
+ if (Def) {
+ MachineBasicBlock::iterator InsertPos = Def;
+ // FIXME: VR def may not be in entry block.
+ Def->getParent()->insert(llvm::next(InsertPos), MI);
+ } else
+ DEBUG(dbgs() << "Dropping debug info for dead vreg"
+ << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
}
// If Reg is live-in then update debug info to track its copy in a vreg.
@@ -422,7 +471,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MachineBasicBlock::iterator InsertPos = Def;
const MDNode *Variable =
MI->getOperand(MI->getNumOperands()-1).getMetadata();
- bool IsIndirect = MI->getOperand(1).isImm();
+ bool IsIndirect = MI->isIndirectDebugValue();
unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
// Def is never a terminator here, so it is ok to increment InsertPos.
BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(),
@@ -497,6 +546,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
if (J == E) break;
To = J->second;
}
+ // Make sure the new register has a sufficiently constrained register class.
+ if (TargetRegisterInfo::isVirtualRegister(From) &&
+ TargetRegisterInfo::isVirtualRegister(To))
+ MRI.constrainRegClass(To, MRI.getRegClass(From));
// Replace it.
MRI.replaceRegWith(From, To);
}
@@ -617,6 +670,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
<< " '" << BlockName << "'\n"; CurDAG->dump());
+ CurDAG->NewNodesMustHaveLegalTypes = true;
+
if (Changed) {
if (ViewDAGCombineLT)
CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
@@ -1140,6 +1195,91 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
delete FastIS;
SDB->clearDanglingDebugInfo();
+ SDB->SPDescriptor.resetPerFunctionState();
+}
+
+/// Given that the input MI is before a partial terminator sequence TSeq, return
+/// true if MI + TSeq is also a partial terminator sequence.
+///
+/// A Terminator sequence is a sequence of MachineInstrs which at this point in
+/// lowering copy vregs into physical registers, which are then passed into
+/// terminator instructions so we can satisfy ABI constraints. A partial
+/// terminator sequence is an improper subset of a terminator sequence (i.e. it
+/// may be the whole terminator sequence).
+static bool MIIsInTerminatorSequence(const MachineInstr *MI) {
+ // If we do not have a copy or an implicit def, we return true if and only if
+ // MI is a debug value.
+ if (!MI->isCopy() && !MI->isImplicitDef())
+ // Sometimes DBG_VALUE MIs sneak in between the copies from the vregs to the
+ // physical registers if there is debug info associated with the terminator
+ // of our mbb. We want to include said debug info in our terminator
+ // sequence, so we return true in that case.
+ return MI->isDebugValue();
+
+ // We have left the terminator sequence if we are not doing one of the
+ // following:
+ //
+ // 1. Copying a vreg into a physical register.
+ // 2. Copying a vreg into a vreg.
+ // 3. Defining a register via an implicit def.
+
+ // OPI should always be a register definition...
+ MachineInstr::const_mop_iterator OPI = MI->operands_begin();
+ if (!OPI->isReg() || !OPI->isDef())
+ return false;
+
+ // Defining any register via an implicit def is always ok.
+ if (MI->isImplicitDef())
+ return true;
+
+ // Grab the copy source...
+ MachineInstr::const_mop_iterator OPI2 = OPI;
+ ++OPI2;
+ assert(OPI2 != MI->operands_end()
+ && "Should have a copy implying we should have 2 arguments.");
+
+ // Make sure that the copy dest is not a vreg when the copy source is a
+ // physical register.
+ if (!OPI2->isReg() ||
+ (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) &&
+ TargetRegisterInfo::isPhysicalRegister(OPI2->getReg())))
+ return false;
+
+ return true;
+}
+
+/// Find the split point at which to splice the end of BB into its success stack
+/// protector check machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point can not travel across basic
+/// blocks. Luckily, selectiondag always moves physical registers into vregs
+/// when they enter functions and moves them through a sequence of copies back
+/// into the physical registers right before the terminator creating a
+/// ``Terminator Sequence''. This function is searching for the beginning of the
+/// terminator sequence so that we can ensure that we splice off not just the
+/// terminator, but additionally the copies that move the vregs into the
+/// physical registers.
+static MachineBasicBlock::iterator
+FindSplitPointForStackProtector(MachineBasicBlock *BB, DebugLoc DL) {
+ MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
+ // If the first terminator is also the first instruction in the block, there
+ // is nothing before it to splice off.
+ if (SplitPoint == BB->begin())
+ return SplitPoint;
+
+ MachineBasicBlock::iterator Start = BB->begin();
+ MachineBasicBlock::iterator Previous = SplitPoint;
+ --Previous;
+
+ while (MIIsInTerminatorSequence(Previous)) {
+ SplitPoint = Previous;
+ if (Previous == Start)
+ break;
+ --Previous;
+ }
+
+ return SplitPoint;
}
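
To make the "terminator sequence" idea concrete, here is a toy standalone sketch; it is not LLVM code (MIKind, inTerminatorSequence and findSplitPoint are invented, and the membership test is deliberately simplified, ignoring the operand and physical-register checks performed above). It shows the backward walk from the first terminator that pulls the preceding copies, implicit defs and debug values into the region to be spliced:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    enum class MIKind { Ordinary, Copy, ImplicitDef, DebugValue, Return };

    // Simplified membership test; the real predicate also inspects the operands.
    static bool inTerminatorSequence(MIKind K) {
      return K == MIKind::Copy || K == MIKind::ImplicitDef || K == MIKind::DebugValue;
    }

    // Walk backwards from the first terminator, pulling the preceding sequence
    // members along; the result is the index where the splice should start.
    static std::size_t findSplitPoint(const std::vector<MIKind> &BB,
                                      std::size_t FirstTerminator) {
      std::size_t Split = FirstTerminator;
      while (Split > 0 && inTerminatorSequence(BB[Split - 1]))
        --Split;
      return Split;
    }

    int main() {
      // ..., add, COPY vreg -> physreg, DBG_VALUE, RET  =>  split before the COPY.
      std::vector<MIKind> BB = {MIKind::Ordinary, MIKind::Copy,
                                MIKind::DebugValue, MIKind::Return};
      assert(findSplitPoint(BB, 3) == 1);
      return 0;
    }
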
void
@@ -1152,11 +1292,13 @@ SelectionDAGISel::FinishBasicBlock() {
<< FuncInfo->PHINodesToUpdate[i].first
<< ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
+ const bool MustUpdatePHINodes = SDB->SwitchCases.empty() &&
+ SDB->JTCases.empty() &&
+ SDB->BitTestCases.empty();
+
// Next, now that we know what the last MBB the LLVM BB expanded is, update
// PHI nodes in successors.
- if (SDB->SwitchCases.empty() &&
- SDB->JTCases.empty() &&
- SDB->BitTestCases.empty()) {
+ if (MustUpdatePHINodes) {
for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
assert(PHI->isPHI() &&
@@ -1165,9 +1307,54 @@ SelectionDAGISel::FinishBasicBlock() {
continue;
PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
}
- return;
}
+ // Handle stack protector.
+ if (SDB->SPDescriptor.shouldEmitStackProtector()) {
+ MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+ MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB();
+
+ // Find the split point to split the parent mbb. At the same time copy all
+ // physical registers used in the tail of parent mbb into virtual registers
+ // before the split point and back into physical registers after the split
+ // point. This prevents us from needing to deal with Live-ins and many other
+ // register allocation issues caused by us splitting the parent mbb. The
+ // register allocator will clean up said virtual copies later on.
+ MachineBasicBlock::iterator SplitPoint =
+ FindSplitPointForStackProtector(ParentMBB, SDB->getCurDebugLoc());
+
+ // Splice the terminator of ParentMBB into SuccessMBB.
+ SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
+ SplitPoint,
+ ParentMBB->end());
+
+ // Emit the guard comparison and the branches to the success/failure MBBs in
+ // the parent BB.
+ FuncInfo->MBB = ParentMBB;
+ FuncInfo->InsertPt = ParentMBB->end();
+ SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // CodeGen Failure MBB if we have not codegened it yet.
+ MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB();
+ if (!FailureMBB->size()) {
+ FuncInfo->MBB = FailureMBB;
+ FuncInfo->InsertPt = FailureMBB->end();
+ SDB->visitSPDescriptorFailure(SDB->SPDescriptor);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // Clear the Per-BB State.
+ SDB->SPDescriptor.resetPerBBState();
+ }
+
+ // If we updated PHI Nodes, return early.
+ if (MustUpdatePHINodes)
+ return;
+
for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
// Lower header first, if it wasn't already lowered
if (!SDB->BitTestCases[i].Emitted) {
@@ -1741,15 +1928,15 @@ WalkChainUsers(const SDNode *ChainedNode,
SDNode *User = *UI;
+ if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
+ continue;
+
// If we see an already-selected machine node, then we've gone beyond the
// pattern that we're selecting down into the already selected chunk of the
// DAG.
- if (User->isMachineOpcode() ||
- User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
- continue;
-
unsigned UserOpcode = User->getOpcode();
- if (UserOpcode == ISD::CopyToReg ||
+ if (User->isMachineOpcode() ||
+ UserOpcode == ISD::CopyToReg ||
UserOpcode == ISD::CopyFromReg ||
UserOpcode == ISD::INLINEASM ||
UserOpcode == ISD::EH_LABEL ||
@@ -1886,7 +2073,6 @@ HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
}
}
- SDValue Res;
if (InputChains.size() == 1)
return InputChains[0];
return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
@@ -1962,6 +2148,18 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
return N == RecordedNodes[RecNo].first;
}
+/// CheckChildSame - Implements OP_CheckChildXSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
+ unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
+ RecordedNodes);
+}
+
/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
@@ -2076,6 +2274,13 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
case SelectionDAGISel::OPC_CheckSame:
Result = !::CheckSame(Table, Index, N, RecordedNodes);
return Index;
+ case SelectionDAGISel::OPC_CheckChild0Same:
+ case SelectionDAGISel::OPC_CheckChild1Same:
+ case SelectionDAGISel::OPC_CheckChild2Same:
+ case SelectionDAGISel::OPC_CheckChild3Same:
+ Result = !::CheckChildSame(Table, Index, N, RecordedNodes,
+ Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Same);
+ return Index;
case SelectionDAGISel::OPC_CheckPatternPredicate:
Result = !::CheckPatternPredicate(Table, Index, SDISel);
return Index;
@@ -2373,6 +2578,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case OPC_CheckSame:
if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
continue;
+
+ case OPC_CheckChild0Same: case OPC_CheckChild1Same:
+ case OPC_CheckChild2Same: case OPC_CheckChild3Same:
+ if (!::CheckChildSame(MatcherTable, MatcherIndex, N, RecordedNodes,
+ Opcode-OPC_CheckChild0Same))
+ break;
+ continue;
+
case OPC_CheckPatternPredicate:
if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
continue;
@@ -2432,7 +2645,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
}
case OPC_SwitchType: {
- MVT CurNodeVT = N.getValueType().getSimpleVT();
+ MVT CurNodeVT = N.getSimpleValueType();
unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
unsigned CaseSize;
while (1) {
@@ -2544,7 +2757,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case OPC_EmitConvertToTarget: {
// Convert from IMM/FPIMM to target version.
unsigned RecNo = MatcherTable[MatcherIndex++];
- assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget");
SDValue Imm = RecordedNodes[RecNo].first;
if (Imm->getOpcode() == ISD::Constant) {
@@ -2569,7 +2782,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
// Read all of the chained nodes.
unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1;
- assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
// FIXME: What if other value results of the node have uses not matched
@@ -2606,7 +2819,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
// Read all of the chained nodes.
for (unsigned i = 0; i != NumChains; ++i) {
unsigned RecNo = MatcherTable[MatcherIndex++];
- assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
// FIXME: What if other value results of the node have uses not matched
@@ -2633,7 +2846,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case OPC_EmitCopyToReg: {
unsigned RecNo = MatcherTable[MatcherIndex++];
- assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
unsigned DestPhysReg = MatcherTable[MatcherIndex++];
if (InputChain.getNode() == 0)
@@ -2650,7 +2863,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case OPC_EmitNodeXForm: {
unsigned XFormNo = MatcherTable[MatcherIndex++];
unsigned RecNo = MatcherTable[MatcherIndex++];
- assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm");
SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo);
RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, (SDNode*) 0));
continue;
@@ -2827,7 +3040,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
if (RecNo & 128)
RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
- assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ assert(RecNo < RecordedNodes.size() && "Invalid MarkGlueResults");
GlueResultNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
}
continue;
@@ -2844,7 +3057,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
if (ResSlot & 128)
ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
- assert(ResSlot < RecordedNodes.size() && "Invalid CheckSame");
+ assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch");
SDValue Res = RecordedNodes[ResSlot].first;
assert(i < NodeToMatch->getNumValues() &&
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e3c6306..82b068d 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -64,13 +64,29 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
return isUsedByReturnOnly(Node, Chain);
}
+/// \brief Set CallLoweringInfo attribute flags based on a call instruction
+/// and called function attributes.
+void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+ unsigned AttrIdx) {
+ isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+ isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+ isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+ isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+ isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+ isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+ isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+ Alignment = CS->getParamAlignment(AttrIdx);
+}
/// Generate a libcall taking the given operands as arguments and returning a
/// result of type RetVT.
-SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
- RTLIB::Libcall LC, EVT RetVT,
- const SDValue *Ops, unsigned NumOps,
- bool isSigned, SDLoc dl) const {
+std::pair<SDValue, SDValue>
+TargetLowering::makeLibCall(SelectionDAG &DAG,
+ RTLIB::Libcall LC, EVT RetVT,
+ const SDValue *Ops, unsigned NumOps,
+ bool isSigned, SDLoc dl,
+ bool doesNotReturn,
+ bool isReturnValueUsed) const {
TargetLowering::ArgListTy Args;
Args.reserve(NumOps);
@@ -89,11 +105,9 @@ SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
false, 0, getLibcallCallingConv(LC),
/*isTailCall=*/false,
- /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
- Callee, Args, DAG, dl);
- std::pair<SDValue,SDValue> CallInfo = LowerCallTo(CLI);
-
- return CallInfo.first;
+ doesNotReturn, isReturnValueUsed, Callee, Args,
+ DAG, dl);
+ return LowerCallTo(CLI);
}
@@ -183,14 +197,16 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
// Use the target specific return value for comparison lib calls.
EVT RetVT = getCmpLibcallReturnType();
SDValue Ops[2] = { NewLHS, NewRHS };
- NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+ NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/,
+ dl).first;
NewRHS = DAG.getConstant(0, RetVT);
CCCode = getCmpLibcallCC(LC1);
if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
SDValue Tmp = DAG.getNode(ISD::SETCC, dl,
getSetCCResultType(*DAG.getContext(), RetVT),
NewLHS, NewRHS, DAG.getCondCode(CCCode));
- NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+ NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/,
+ dl).first;
NewLHS = DAG.getNode(ISD::SETCC, dl,
getSetCCResultType(*DAG.getContext(), RetVT), NewLHS,
NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
@@ -632,6 +648,31 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
NarrowShl));
}
+ // Repeat the SHL optimization above in cases where an extension
+ // intervenes: (shl (anyext (shr x, c1)), c2) to
+ // (shl (anyext x), c2-c1). This requires that the bottom c1 bits
+ // aren't demanded (as above) and that the shifted upper c1 bits of
+ // x aren't demanded.
+ if (InOp.hasOneUse() &&
+ InnerOp.getOpcode() == ISD::SRL &&
+ InnerOp.hasOneUse() &&
+ isa<ConstantSDNode>(InnerOp.getOperand(1))) {
+ uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
+ ->getZExtValue();
+ if (InnerShAmt < ShAmt &&
+ InnerShAmt < InnerBits &&
+ NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 &&
+ NewMask.trunc(ShAmt) == 0) {
+ SDValue NewSA =
+ TLO.DAG.getConstant(ShAmt - InnerShAmt,
+ Op.getOperand(1).getValueType());
+ EVT VT = Op.getValueType();
+ SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+ InnerOp.getOperand(0));
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
+ NewExt, NewSA));
+ }
+ }
}
KnownZero <<= SA->getZExtValue();
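
A concrete instance of the transform added above, assuming for illustration that the any_extend is realized as a zero extend (a valid choice of its unspecified bits) and using invented variable names: with x of type i32, c1 = 8, c2 = 16 and only bits 16..39 demanded, the code's conditions hold (no demanded bits below 16, none at or above 40) and both forms agree on every demanded bit. A small self-contained check:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t x = 0xDEADBEEF;                    // arbitrary 32-bit input
      const unsigned c1 = 8, c2 = 16;
      const uint64_t Demanded = 0xFFFFFFULL << 16;      // bits 16..39 only

      // shl (anyext (srl x, c1)), c2   -- anyext modeled as zext for the demo
      uint64_t before = static_cast<uint64_t>(x >> c1) << c2;
      // shl (anyext x), c2 - c1
      uint64_t after = static_cast<uint64_t>(x) << (c2 - c1);

      assert((before & Demanded) == (after & Demanded));
      return 0;
    }
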
@@ -722,13 +763,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
- if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+ if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits)
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
Op.getOperand(0),
Op.getOperand(1)));
- } else if (KnownOne.intersects(SignBit)) { // New bits are known one.
- KnownOne |= HighBits;
+
+ int Log2 = NewMask.exactLogBase2();
+ if (Log2 >= 0) {
+ // The bit must come from the sign.
+ SDValue NewSA =
+ TLO.DAG.getConstant(BitWidth - 1 - Log2,
+ Op.getOperand(1).getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
+ Op.getOperand(0), NewSA));
}
+
+ if (KnownOne.intersects(SignBit))
+ // New bits are known one.
+ KnownOne |= HighBits;
}
break;
case ISD::SIGN_EXTEND_INREG: {
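
The new single-demanded-bit case in the SRA handling above relies on the fact that, once the earlier early-out has not fired, the one demanded bit must lie among the sign-filled high bits, so it can be read straight off the sign bit with a logical shift of BitWidth - 1 - Log2. A small standalone illustration, not LLVM code (sra32 and the chosen constants are invented): for a 32-bit value shifted arithmetically by 30 with only bit 3 demanded, srl by 31 - 3 = 28 yields the same demanded bit:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // Portable arithmetic shift right for a 32-bit value, Amt in [1, 31].
    static uint32_t sra32(uint32_t V, unsigned Amt) {
      uint32_t Shifted = V >> Amt;
      if (V & 0x80000000u)
        Shifted |= ~0u << (32 - Amt);    // replicate the sign into the high bits
      return Shifted;
    }

    int main() {
      const uint32_t Demanded = 1u << 3;       // NewMask with exactly bit 3 set
      for (uint32_t x : {0x80000001u, 0x00000001u}) {
        uint32_t before = sra32(x, 30);        // sra x, 30
        uint32_t after = x >> (32 - 1 - 3);    // srl x, 28 (BitWidth - 1 - Log2)
        assert((before & Demanded) == (after & Demanded));
      }
      return 0;
    }
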
@@ -1077,13 +1129,20 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
case ISD::SETFALSE:
case ISD::SETFALSE2: return DAG.getConstant(0, VT);
case ISD::SETTRUE:
- case ISD::SETTRUE2: return DAG.getConstant(1, VT);
+ case ISD::SETTRUE2: {
+ TargetLowering::BooleanContent Cnt = getBooleanContents(VT.isVector());
+ return DAG.getConstant(
+ Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
+ }
}
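
The SETTRUE/SETTRUE2 change above matters because targets with ZeroOrNegativeOneBooleanContent (the common convention for vector compares) represent a true lane as all-ones rather than 1, and consumers that mask a value with the compare result depend on that. A tiny standalone illustration, not LLVM code (the names and constants are invented):

    #include <cassert>
    #include <cstdint>

    int main() {
      // One i32 lane of a compare result under the two boolean conventions.
      const uint32_t TrueZeroOrOne = 1u;
      const uint32_t TrueAllOnes = static_cast<uint32_t>(-1);   // 0xFFFFFFFF

      // Masking with the compare result only selects the value when a true
      // lane is all-ones.
      const uint32_t x = 0xDEADBEEFu;
      assert((x & TrueAllOnes) == x);
      assert((x & TrueZeroOrOne) != x);
      return 0;
    }
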
// Ensure that the constant occurs on the RHS, and fold constant
// comparisons.
- if (isa<ConstantSDNode>(N0.getNode()))
- return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
+ ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
+ if (isa<ConstantSDNode>(N0.getNode()) &&
+ (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
+ return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
const APInt &C1 = N1C->getAPIntValue();
@@ -1178,6 +1237,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// the test is for equality or unsigned, and all 1 bits of the const are
// in the same partial word, see if we can shorten the load.
if (DCI.isBeforeLegalize() &&
+ !ISD::isSignedIntSetCC(Cond) &&
N0.getOpcode() == ISD::AND && C1 == 0 &&
N0.getNode()->hasOneUse() &&
isa<LoadSDNode>(N0.getOperand(0)) &&
@@ -1322,7 +1382,9 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
CC = ISD::getSetCCInverse(CC,
N0.getOperand(0).getValueType().isInteger());
- return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
}
if ((N0.getOpcode() == ISD::XOR ||
@@ -1759,16 +1821,22 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
if (ValueHasExactlyOneBitSet(N1, DAG)) {
Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
- SDValue Zero = DAG.getConstant(0, N1.getValueType());
- return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(Cond, N0.getSimpleValueType())) {
+ SDValue Zero = DAG.getConstant(0, N1.getValueType());
+ return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+ }
}
}
if (N1.getOpcode() == ISD::AND)
if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
if (ValueHasExactlyOneBitSet(N0, DAG)) {
Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
- SDValue Zero = DAG.getConstant(0, N0.getValueType());
- return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(Cond, N1.getSimpleValueType())) {
+ SDValue Zero = DAG.getConstant(0, N0.getValueType());
+ return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+ }
}
}
}
@@ -1993,7 +2061,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
getRegForInlineAsmConstraint(const std::string &Constraint,
MVT VT) const {
- if (Constraint[0] != '{')
+ if (Constraint.empty() || Constraint[0] != '{')
return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
@@ -2142,8 +2210,9 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
break;
}
} else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
- OpInfo.ConstraintVT = MVT::getIntegerVT(
- 8*getDataLayout()->getPointerSize(PT->getAddressSpace()));
+ unsigned PtrSize
+ = getDataLayout()->getPointerSizeInBits(PT->getAddressSpace());
+ OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
} else {
OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
}
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
deleted file mode 100644
index 6c826de..0000000
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ /dev/null
@@ -1,1152 +0,0 @@
-//===-- ShrinkWrapping.cpp - Reduce spills/restores of callee-saved regs --===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a shrink wrapping variant of prolog/epilog insertion:
-// - Spills and restores of callee-saved registers (CSRs) are placed in the
-// machine CFG to tightly surround their uses so that execution paths that
-// do not use CSRs do not pay the spill/restore penalty.
-//
-// - Avoiding placement of spills/restores in loops: if a CSR is used inside a
-// loop the spills are placed in the loop preheader, and restores are
-// placed in the loop exit nodes (the successors of loop _exiting_ nodes).
-//
-// - Covering paths without CSR uses:
-// If a region in a CFG uses CSRs and has multiple entry and/or exit points,
-// the use info for the CSRs inside the region is propagated outward in the
-// CFG to ensure validity of the spill/restore placements. This decreases
-// the effectiveness of shrink wrapping but does not require edge splitting
-// in the machine CFG.
-//
-// This shrink wrapping implementation uses an iterative analysis to determine
-// which basic blocks require spills and restores for CSRs.
-//
-// This pass uses MachineDominators and MachineLoopInfo. Loop information
-// is used to prevent placement of callee-saved register spills/restores
-// in the bodies of loops.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "shrink-wrap"
-
-#include "PrologEpilogInserter.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include <sstream>
-
-using namespace llvm;
-
-STATISTIC(numSRReduced, "Number of CSR spills+restores reduced.");
-
-// Shrink Wrapping:
-static cl::opt<bool>
-ShrinkWrapping("shrink-wrap",
- cl::desc("Shrink wrap callee-saved register spills/restores"));
-
-// Shrink wrap only the specified function, a debugging aid.
-static cl::opt<std::string>
-ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
- cl::desc("Shrink wrap the specified function"),
- cl::value_desc("funcname"),
- cl::init(""));
-
-// Debugging level for shrink wrapping.
-enum ShrinkWrapDebugLevel {
- Disabled, BasicInfo, Iterations, Details
-};
-
-static cl::opt<enum ShrinkWrapDebugLevel>
-ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
- cl::desc("Print shrink wrapping debugging information"),
- cl::values(
- clEnumVal(Disabled , "disable debug output"),
- clEnumVal(BasicInfo , "print basic DF sets"),
- clEnumVal(Iterations, "print SR sets for each iteration"),
- clEnumVal(Details , "print all DF sets"),
- clEnumValEnd));
-
-
-void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- if (ShrinkWrapping || ShrinkWrapFunc != "") {
- AU.addRequired<MachineLoopInfo>();
- AU.addRequired<MachineDominatorTree>();
- }
- AU.addPreserved<MachineLoopInfo>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<TargetPassConfig>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-//===----------------------------------------------------------------------===//
-// ShrinkWrapping implementation
-//===----------------------------------------------------------------------===//
-
-// Conveniences for dealing with machine loops.
-MachineBasicBlock* PEI::getTopLevelLoopPreheader(MachineLoop* LP) {
- assert(LP && "Machine loop is NULL.");
- MachineBasicBlock* PHDR = LP->getLoopPreheader();
- MachineLoop* PLP = LP->getParentLoop();
- while (PLP) {
- PHDR = PLP->getLoopPreheader();
- PLP = PLP->getParentLoop();
- }
- return PHDR;
-}
-
-MachineLoop* PEI::getTopLevelLoopParent(MachineLoop *LP) {
- if (LP == 0)
- return 0;
- MachineLoop* PLP = LP->getParentLoop();
- while (PLP) {
- LP = PLP;
- PLP = PLP->getParentLoop();
- }
- return LP;
-}
-
-bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
- return (MBB && !MBB->empty() && MBB->back().isReturn());
-}
-
-// Initialize shrink wrapping DFA sets, called before iterations.
-void PEI::clearAnticAvailSets() {
- AnticIn.clear();
- AnticOut.clear();
- AvailIn.clear();
- AvailOut.clear();
-}
-
-// Clear all sets constructed by shrink wrapping.
-void PEI::clearAllSets() {
- ReturnBlocks.clear();
- clearAnticAvailSets();
- UsedCSRegs.clear();
- CSRUsed.clear();
- TLLoops.clear();
- CSRSave.clear();
- CSRRestore.clear();
-}
-
-// Initialize all shrink wrapping data.
-void PEI::initShrinkWrappingInfo() {
- clearAllSets();
- EntryBlock = 0;
-#ifndef NDEBUG
- HasFastExitPath = false;
-#endif
- ShrinkWrapThisFunction = ShrinkWrapping;
- // DEBUG: enable or disable shrink wrapping for the current function
- // via --shrink-wrap-func=<funcname>.
-#ifndef NDEBUG
- if (ShrinkWrapFunc != "") {
- std::string MFName = MF->getName().str();
- ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
- }
-#endif
-}
-
-
-/// placeCSRSpillsAndRestores - determine which MBBs of the function
-/// need save, restore code for callee-saved registers by doing a DF analysis
-/// similar to the one used in code motion (GVNPRE). This produces maps of MBBs
-/// to sets of registers (CSRs) for saves and restores. MachineLoopInfo
-/// is used to ensure that CSR save/restore code is not placed inside loops.
-/// This function computes the maps of MBBs -> CSRs to spill and restore
-/// in CSRSave, CSRRestore.
-///
-/// If shrink wrapping is not being performed, place all spills in
-/// the entry block, all restores in return blocks. In this case,
-/// CSRSave has a single mapping, CSRRestore has mappings for each
-/// return block.
-///
-void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
-
- DEBUG(MF = &Fn);
-
- initShrinkWrappingInfo();
-
- DEBUG(if (ShrinkWrapThisFunction) {
- dbgs() << "Place CSR spills/restores for "
- << MF->getName() << "\n";
- });
-
- if (calculateSets(Fn))
- placeSpillsAndRestores(Fn);
-}
-
-/// calcAnticInOut - calculate the anticipated in/out reg sets
-/// for the given MBB by looking forward in the MCFG at MBB's
-/// successors.
-///
-bool PEI::calcAnticInOut(MachineBasicBlock* MBB) {
- bool changed = false;
-
- // AnticOut[MBB] = INTERSECT(AnticIn[S] for S in SUCCESSORS(MBB))
- SmallVector<MachineBasicBlock*, 4> successors;
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI) {
- MachineBasicBlock* SUCC = *SI;
- if (SUCC != MBB)
- successors.push_back(SUCC);
- }
-
- unsigned i = 0, e = successors.size();
- if (i != e) {
- CSRegSet prevAnticOut = AnticOut[MBB];
- MachineBasicBlock* SUCC = successors[i];
-
- AnticOut[MBB] = AnticIn[SUCC];
- for (++i; i != e; ++i) {
- SUCC = successors[i];
- AnticOut[MBB] &= AnticIn[SUCC];
- }
- if (prevAnticOut != AnticOut[MBB])
- changed = true;
- }
-
- // AnticIn[MBB] = UNION(CSRUsed[MBB], AnticOut[MBB]);
- CSRegSet prevAnticIn = AnticIn[MBB];
- AnticIn[MBB] = CSRUsed[MBB] | AnticOut[MBB];
- if (prevAnticIn != AnticIn[MBB])
- changed = true;
- return changed;
-}
-
-/// calcAvailInOut - calculate the available in/out reg sets
-/// for the given MBB by looking backward in the MCFG at MBB's
-/// predecessors.
-///
-bool PEI::calcAvailInOut(MachineBasicBlock* MBB) {
- bool changed = false;
-
- // AvailIn[MBB] = INTERSECT(AvailOut[P] for P in PREDECESSORS(MBB))
- SmallVector<MachineBasicBlock*, 4> predecessors;
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock* PRED = *PI;
- if (PRED != MBB)
- predecessors.push_back(PRED);
- }
-
- unsigned i = 0, e = predecessors.size();
- if (i != e) {
- CSRegSet prevAvailIn = AvailIn[MBB];
- MachineBasicBlock* PRED = predecessors[i];
-
- AvailIn[MBB] = AvailOut[PRED];
- for (++i; i != e; ++i) {
- PRED = predecessors[i];
- AvailIn[MBB] &= AvailOut[PRED];
- }
- if (prevAvailIn != AvailIn[MBB])
- changed = true;
- }
-
- // AvailOut[MBB] = UNION(CSRUsed[MBB], AvailIn[MBB]);
- CSRegSet prevAvailOut = AvailOut[MBB];
- AvailOut[MBB] = CSRUsed[MBB] | AvailIn[MBB];
- if (prevAvailOut != AvailOut[MBB])
- changed = true;
- return changed;
-}
-
-/// calculateAnticAvail - build the sets anticipated and available
-/// registers in the MCFG of the current function iteratively,
-/// doing a combined forward and backward analysis.
-///
-void PEI::calculateAnticAvail(MachineFunction &Fn) {
- // Initialize data flow sets.
- clearAnticAvailSets();
-
- // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
- bool changed = true;
- unsigned iterations = 0;
- while (changed) {
- changed = false;
- ++iterations;
- for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
-
- // Calculate anticipated in, out regs at MBB from
- // anticipated at successors of MBB.
- changed |= calcAnticInOut(MBB);
-
- // Calculate available in, out regs at MBB from
- // available at predecessors of MBB.
- changed |= calcAvailInOut(MBB);
- }
- }
-
- DEBUG({
- if (ShrinkWrapDebugging >= Details) {
- dbgs()
- << "-----------------------------------------------------------\n"
- << " Antic/Avail Sets:\n"
- << "-----------------------------------------------------------\n"
- << "iterations = " << iterations << "\n"
- << "-----------------------------------------------------------\n"
- << "MBB | USED | ANTIC_IN | ANTIC_OUT | AVAIL_IN | AVAIL_OUT\n"
- << "-----------------------------------------------------------\n";
-
- for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
- dumpSets(MBB);
- }
-
- dbgs()
- << "-----------------------------------------------------------\n";
- }
- });
-}
-
-/// propagateUsesAroundLoop - copy used register info from MBB to all blocks
-/// of the loop given by LP and its parent loops. This prevents spills/restores
-/// from being placed in the bodies of loops.
-///
-void PEI::propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP) {
- if (! MBB || !LP)
- return;
-
- std::vector<MachineBasicBlock*> loopBlocks = LP->getBlocks();
- for (unsigned i = 0, e = loopBlocks.size(); i != e; ++i) {
- MachineBasicBlock* LBB = loopBlocks[i];
- if (LBB == MBB)
- continue;
- if (CSRUsed[LBB].contains(CSRUsed[MBB]))
- continue;
- CSRUsed[LBB] |= CSRUsed[MBB];
- }
-}
-
-/// calculateSets - collect the CSRs used in this function, compute
-/// the DF sets that describe the initial minimal regions in the
-/// Machine CFG around which CSR spills and restores must be placed.
-///
-/// Additionally, this function decides if shrink wrapping should
-/// be disabled for the current function, checking the following:
-/// 1. the current function has more than 500 MBBs: heuristic limit
-/// on function size to reduce compile time impact of the current
-/// iterative algorithm.
-/// 2. all CSRs are used in the entry block.
-/// 3. all CSRs are used in all immediate successors of the entry block.
-/// 4. all CSRs are used in a subset of blocks, each of which dominates
-/// all return blocks. These blocks, taken as a subgraph of the MCFG,
-/// are equivalent to the entry block since all execution paths pass
-/// through them.
-///
-bool PEI::calculateSets(MachineFunction &Fn) {
- // Sets used to compute spill, restore placement sets.
- const std::vector<CalleeSavedInfo> CSI =
- Fn.getFrameInfo()->getCalleeSavedInfo();
-
- // If no CSRs used, we are done.
- if (CSI.empty()) {
- DEBUG(if (ShrinkWrapThisFunction)
- dbgs() << "DISABLED: " << Fn.getName()
- << ": uses no callee-saved registers\n");
- return false;
- }
-
- // Save refs to entry and return blocks.
- EntryBlock = Fn.begin();
- for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
- MBB != E; ++MBB)
- if (isReturnBlock(MBB))
- ReturnBlocks.push_back(MBB);
-
- // Determine if this function has fast exit paths.
- DEBUG(if (ShrinkWrapThisFunction)
- findFastExitPath());
-
- // Limit shrink wrapping via the current iterative bit vector
- // implementation to functions with <= 500 MBBs.
- if (Fn.size() > 500) {
- DEBUG(if (ShrinkWrapThisFunction)
- dbgs() << "DISABLED: " << Fn.getName()
- << ": too large (" << Fn.size() << " MBBs)\n");
- ShrinkWrapThisFunction = false;
- }
-
- // Return now if not shrink wrapping.
- if (! ShrinkWrapThisFunction)
- return false;
-
- // Collect set of used CSRs.
- for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
- UsedCSRegs.set(inx);
- }
-
- // Walk instructions in all MBBs, create CSRUsed[] sets, choose
- // whether or not to shrink wrap this function.
- MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
- MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
- const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
-
- bool allCSRUsesInEntryBlock = true;
- for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
- for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
- for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
- unsigned Reg = CSI[inx].getReg();
- // If instruction I reads or modifies Reg, add it to UsedCSRegs,
- // CSRUsed map for the current block.
- for (unsigned opInx = 0, opEnd = I->getNumOperands();
- opInx != opEnd; ++opInx) {
- const MachineOperand &MO = I->getOperand(opInx);
- if (! (MO.isReg() && (MO.isUse() || MO.isDef())))
- continue;
- unsigned MOReg = MO.getReg();
- if (!MOReg)
- continue;
- if (MOReg == Reg ||
- (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
- TargetRegisterInfo::isPhysicalRegister(Reg) &&
- TRI->isSubRegister(Reg, MOReg))) {
- // CSR Reg is defined/used in block MBB.
- CSRUsed[MBB].set(inx);
- // Check for uses in EntryBlock.
- if (MBB != EntryBlock)
- allCSRUsesInEntryBlock = false;
- }
- }
- }
- }
-
- if (CSRUsed[MBB].empty())
- continue;
-
- // Propagate CSRUsed[MBB] in loops
- if (MachineLoop* LP = LI.getLoopFor(MBB)) {
- // Add top level loop to work list.
- MachineBasicBlock* HDR = getTopLevelLoopPreheader(LP);
- MachineLoop* PLP = getTopLevelLoopParent(LP);
-
- if (! HDR) {
- HDR = PLP->getHeader();
- assert(HDR->pred_size() > 0 && "Loop header has no predecessors?");
- MachineBasicBlock::pred_iterator PI = HDR->pred_begin();
- HDR = *PI;
- }
- TLLoops[HDR] = PLP;
-
- // Push uses from inside loop to its parent loops,
- // or to all other MBBs in its loop.
- if (LP->getLoopDepth() > 1) {
- for (MachineLoop* PLP = LP->getParentLoop(); PLP;
- PLP = PLP->getParentLoop()) {
- propagateUsesAroundLoop(MBB, PLP);
- }
- } else {
- propagateUsesAroundLoop(MBB, LP);
- }
- }
- }
-
- if (allCSRUsesInEntryBlock) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getName()
- << ": all CSRs used in EntryBlock\n");
- ShrinkWrapThisFunction = false;
- } else {
- bool allCSRsUsedInEntryFanout = true;
- for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
- SE = EntryBlock->succ_end(); SI != SE; ++SI) {
- MachineBasicBlock* SUCC = *SI;
- if (CSRUsed[SUCC] != UsedCSRegs)
- allCSRsUsedInEntryFanout = false;
- }
- if (allCSRsUsedInEntryFanout) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getName()
- << ": all CSRs used in imm successors of EntryBlock\n");
- ShrinkWrapThisFunction = false;
- }
- }
-
- if (ShrinkWrapThisFunction) {
- // Check if MBB uses CSRs and dominates all exit nodes.
- // Such nodes are equiv. to the entry node w.r.t.
- // CSR uses: every path through the function must
- // pass through this node. If each CSR is used at least
- // once by these nodes, shrink wrapping is disabled.
- CSRegSet CSRUsedInChokePoints;
- for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
- if (MBB == EntryBlock || CSRUsed[MBB].empty() || MBB->succ_size() < 1)
- continue;
- bool dominatesExitNodes = true;
- for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
- if (! DT.dominates(MBB, ReturnBlocks[ri])) {
- dominatesExitNodes = false;
- break;
- }
- if (dominatesExitNodes) {
- CSRUsedInChokePoints |= CSRUsed[MBB];
- if (CSRUsedInChokePoints == UsedCSRegs) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getName()
- << ": all CSRs used in choke point(s) at "
- << getBasicBlockName(MBB) << "\n");
- ShrinkWrapThisFunction = false;
- break;
- }
- }
- }
- }
-
- // Return now if we have decided not to apply shrink wrapping
- // to the current function.
- if (! ShrinkWrapThisFunction)
- return false;
-
- DEBUG({
- dbgs() << "ENABLED: " << Fn.getName();
- if (HasFastExitPath)
- dbgs() << " (fast exit path)";
- dbgs() << "\n";
- if (ShrinkWrapDebugging >= BasicInfo) {
- dbgs() << "------------------------------"
- << "-----------------------------\n";
- dbgs() << "UsedCSRegs = " << stringifyCSRegSet(UsedCSRegs) << "\n";
- if (ShrinkWrapDebugging >= Details) {
- dbgs() << "------------------------------"
- << "-----------------------------\n";
- dumpAllUsed();
- }
- }
- });
-
- // Build initial DF sets to determine minimal regions in the
- // Machine CFG around which CSRs must be spilled and restored.
- calculateAnticAvail(Fn);
-
- return true;
-}
-
-/// addUsesForMEMERegion - add uses of CSRs spilled or restored in
-/// multi-entry, multi-exit (MEME) regions so spill and restore
-/// placement will not break code that enters or leaves a
-/// shrink-wrapped region by inducing spills with no matching
-/// restores or restores with no matching spills. A MEME region
-/// is a subgraph of the MCFG with multiple entry edges, multiple
-/// exit edges, or both. This code propagates use information
-/// through the MCFG until all paths requiring spills and restores
-/// _outside_ the computed minimal placement regions have been covered.
-///
-bool PEI::addUsesForMEMERegion(MachineBasicBlock* MBB,
- SmallVectorImpl<MachineBasicBlock *> &blks) {
- if (MBB->succ_size() < 2 && MBB->pred_size() < 2) {
- bool processThisBlock = false;
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI) {
- MachineBasicBlock* SUCC = *SI;
- if (SUCC->pred_size() > 1) {
- processThisBlock = true;
- break;
- }
- }
- if (!CSRRestore[MBB].empty() && MBB->succ_size() > 0) {
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock* PRED = *PI;
- if (PRED->succ_size() > 1) {
- processThisBlock = true;
- break;
- }
- }
- }
- if (! processThisBlock)
- return false;
- }
-
- CSRegSet prop;
- if (!CSRSave[MBB].empty())
- prop = CSRSave[MBB];
- else if (!CSRRestore[MBB].empty())
- prop = CSRRestore[MBB];
- else
- prop = CSRUsed[MBB];
- if (prop.empty())
- return false;
-
- // Propagate selected bits to successors, predecessors of MBB.
- bool addedUses = false;
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI) {
- MachineBasicBlock* SUCC = *SI;
- // Self-loop
- if (SUCC == MBB)
- continue;
- if (! CSRUsed[SUCC].contains(prop)) {
- CSRUsed[SUCC] |= prop;
- addedUses = true;
- blks.push_back(SUCC);
- DEBUG(if (ShrinkWrapDebugging >= Iterations)
- dbgs() << getBasicBlockName(MBB)
- << "(" << stringifyCSRegSet(prop) << ")->"
- << "successor " << getBasicBlockName(SUCC) << "\n");
- }
- }
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock* PRED = *PI;
- // Self-loop
- if (PRED == MBB)
- continue;
- if (! CSRUsed[PRED].contains(prop)) {
- CSRUsed[PRED] |= prop;
- addedUses = true;
- blks.push_back(PRED);
- DEBUG(if (ShrinkWrapDebugging >= Iterations)
- dbgs() << getBasicBlockName(MBB)
- << "(" << stringifyCSRegSet(prop) << ")->"
- << "predecessor " << getBasicBlockName(PRED) << "\n");
- }
- }
- return addedUses;
-}
-
-/// addUsesForTopLevelLoops - add uses for CSRs used inside top
-/// level loops to the exit blocks of those loops.
-///
-bool PEI::addUsesForTopLevelLoops(SmallVectorImpl<MachineBasicBlock *> &blks) {
- bool addedUses = false;
-
- // Place restores for top level loops where needed.
- for (DenseMap<MachineBasicBlock*, MachineLoop*>::iterator
- I = TLLoops.begin(), E = TLLoops.end(); I != E; ++I) {
- MachineBasicBlock* MBB = I->first;
- MachineLoop* LP = I->second;
- MachineBasicBlock* HDR = LP->getHeader();
- SmallVector<MachineBasicBlock*, 4> exitBlocks;
- CSRegSet loopSpills;
-
- loopSpills = CSRSave[MBB];
- if (CSRSave[MBB].empty()) {
- loopSpills = CSRUsed[HDR];
- assert(!loopSpills.empty() && "No CSRs used in loop?");
- } else if (CSRRestore[MBB].contains(CSRSave[MBB]))
- continue;
-
- LP->getExitBlocks(exitBlocks);
- assert(exitBlocks.size() > 0 && "Loop has no top level exit blocks?");
- for (unsigned i = 0, e = exitBlocks.size(); i != e; ++i) {
- MachineBasicBlock* EXB = exitBlocks[i];
- if (! CSRUsed[EXB].contains(loopSpills)) {
- CSRUsed[EXB] |= loopSpills;
- addedUses = true;
- DEBUG(if (ShrinkWrapDebugging >= Iterations)
- dbgs() << "LOOP " << getBasicBlockName(MBB)
- << "(" << stringifyCSRegSet(loopSpills) << ")->"
- << getBasicBlockName(EXB) << "\n");
- if (EXB->succ_size() > 1 || EXB->pred_size() > 1)
- blks.push_back(EXB);
- }
- }
- }
- return addedUses;
-}
-
-/// calcSpillPlacements - determine which CSRs should be spilled
-/// in MBB using AnticIn sets of MBB's predecessors, keeping track
-/// of changes to spilled reg sets. Add MBB to the set of blocks
-/// that need to be processed for propagating use info to cover
-/// multi-entry/exit regions.
-///
-bool PEI::calcSpillPlacements(MachineBasicBlock* MBB,
- SmallVectorImpl<MachineBasicBlock *> &blks,
- CSRegBlockMap &prevSpills) {
- bool placedSpills = false;
- // Intersect (CSRegs - AnticIn[P]) for P in Predecessors(MBB)
- CSRegSet anticInPreds;
- SmallVector<MachineBasicBlock*, 4> predecessors;
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock* PRED = *PI;
- if (PRED != MBB)
- predecessors.push_back(PRED);
- }
- unsigned i = 0, e = predecessors.size();
- if (i != e) {
- MachineBasicBlock* PRED = predecessors[i];
- anticInPreds = UsedCSRegs - AnticIn[PRED];
- for (++i; i != e; ++i) {
- PRED = predecessors[i];
- anticInPreds &= (UsedCSRegs - AnticIn[PRED]);
- }
- } else {
- // Handle uses in entry blocks (which have no predecessors).
- // This is necessary because the DFA formulation assumes the
- // entry and (multiple) exit nodes cannot have CSR uses, which
- // is not the case in the real world.
- anticInPreds = UsedCSRegs;
- }
- // Compute spills required at MBB:
- CSRSave[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) & anticInPreds;
-
- if (! CSRSave[MBB].empty()) {
- if (MBB == EntryBlock) {
- for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
- CSRRestore[ReturnBlocks[ri]] |= CSRSave[MBB];
- } else {
- // Reset all regs spilled in MBB that are also spilled in EntryBlock.
- if (CSRSave[EntryBlock].intersects(CSRSave[MBB])) {
- CSRSave[MBB] = CSRSave[MBB] - CSRSave[EntryBlock];
- }
- }
- }
- placedSpills = (CSRSave[MBB] != prevSpills[MBB]);
- prevSpills[MBB] = CSRSave[MBB];
- // Remember this block for adding restores to successor
- // blocks for multi-entry region.
- if (placedSpills)
- blks.push_back(MBB);
-
- DEBUG(if (! CSRSave[MBB].empty() && ShrinkWrapDebugging >= Iterations)
- dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRSave[MBB]) << "\n");
-
- return placedSpills;
-}
-
-/// calcRestorePlacements - determine which CSRs should be restored
-/// in MBB using AvailOut sets of MBB's successors, keeping track
-/// of changes to restored reg sets. Add MBB to the set of blocks
-/// that need to be processed for propagating use info to cover
-/// multi-entry/exit regions.
-///
-bool PEI::calcRestorePlacements(MachineBasicBlock* MBB,
- SmallVectorImpl<MachineBasicBlock *> &blks,
- CSRegBlockMap &prevRestores) {
- bool placedRestores = false;
- // Intersect (CSRegs - AvailOut[S]) for S in Successors(MBB)
- CSRegSet availOutSucc;
- SmallVector<MachineBasicBlock*, 4> successors;
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI) {
- MachineBasicBlock* SUCC = *SI;
- if (SUCC != MBB)
- successors.push_back(SUCC);
- }
- unsigned i = 0, e = successors.size();
- if (i != e) {
- MachineBasicBlock* SUCC = successors[i];
- availOutSucc = UsedCSRegs - AvailOut[SUCC];
- for (++i; i != e; ++i) {
- SUCC = successors[i];
- availOutSucc &= (UsedCSRegs - AvailOut[SUCC]);
- }
- } else {
- if (! CSRUsed[MBB].empty() || ! AvailOut[MBB].empty()) {
- // Handle uses in return blocks (which have no successors).
- // This is necessary because the DFA formulation assumes the
- // entry and (multiple) exit nodes cannot have CSR uses, which
- // is not the case in the real world.
- availOutSucc = UsedCSRegs;
- }
- }
- // Compute restores required at MBB:
- CSRRestore[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) & availOutSucc;
-
- // Postprocess restore placements at MBB.
- // Remove the CSRs that are restored in the return blocks.
- // Lest this be confusing, note that:
- // CSRSave[EntryBlock] == CSRRestore[B] for all B in ReturnBlocks.
- if (MBB->succ_size() && ! CSRRestore[MBB].empty()) {
- if (! CSRSave[EntryBlock].empty())
- CSRRestore[MBB] = CSRRestore[MBB] - CSRSave[EntryBlock];
- }
- placedRestores = (CSRRestore[MBB] != prevRestores[MBB]);
- prevRestores[MBB] = CSRRestore[MBB];
- // Remember this block for adding saves to predecessor
- // blocks for multi-entry region.
- if (placedRestores)
- blks.push_back(MBB);
-
- DEBUG(if (! CSRRestore[MBB].empty() && ShrinkWrapDebugging >= Iterations)
- dbgs() << "RESTORE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
-
- return placedRestores;
-}
-
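In outline, calcSpillPlacements and calcRestorePlacements above reduce to a pair of set recurrences over the Antic/Avail dataflow solutions. Below is a minimal standalone sketch of those recurrences with std::bitset standing in for CSRegSet and hypothetical names throughout; it is an illustration only, not part of this patch, and it folds the no-predecessor/no-successor special cases into a single default.

#include <bitset>
#include <map>
#include <vector>

using RegSet = std::bitset<32>;   // stand-in for CSRegSet
using Block  = int;               // stand-in for MachineBasicBlock*

struct PlacementState {
  RegSet UsedCSRegs;
  std::map<Block, RegSet> AnticIn, AnticOut, AvailIn, AvailOut;
};

// Spills required at B: CSRs first anticipated here and not yet available,
// restricted to what no predecessor already anticipates on entry.  With no
// predecessors (the entry block) the restriction defaults to UsedCSRegs.
RegSet spillsAt(PlacementState &S, Block B, const std::vector<Block> &Preds) {
  RegSet NotAnticInPreds = S.UsedCSRegs;
  for (Block P : Preds)
    NotAnticInPreds &= S.UsedCSRegs & ~S.AnticIn[P];
  return (S.AnticIn[B] & ~S.AvailIn[B]) & NotAnticInPreds;
}

// Restores required at B: the mirror image, using Antic/Avail on exit and
// the successors' AvailOut sets.
RegSet restoresAt(PlacementState &S, Block B, const std::vector<Block> &Succs) {
  RegSet NotAvailOutSuccs = S.UsedCSRegs;
  for (Block Su : Succs)
    NotAvailOutSuccs &= S.UsedCSRegs & ~S.AvailOut[Su];
  return (S.AvailOut[B] & ~S.AnticOut[B]) & NotAvailOutSuccs;
}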
-/// placeSpillsAndRestores - place spills and restores of CSRs
-/// used in MBBs in minimal regions that contain the uses.
-///
-void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
- CSRegBlockMap prevCSRSave;
- CSRegBlockMap prevCSRRestore;
- SmallVector<MachineBasicBlock*, 4> cvBlocks, ncvBlocks;
- bool changed = true;
- unsigned iterations = 0;
-
- // Iterate computation of spill and restore placements in the MCFG until:
- // 1. CSR use info has been fully propagated around the MCFG, and
- // 2. computation of CSRSave[], CSRRestore[] reach fixed points.
- while (changed) {
- changed = false;
- ++iterations;
-
- DEBUG(if (ShrinkWrapDebugging >= Iterations)
- dbgs() << "iter " << iterations
- << " --------------------------------------------------\n");
-
- // Calculate CSR{Save,Restore} sets using Antic, Avail on the MCFG,
- // which determines the placements of spills and restores.
- // Keep track of changes to spills, restores in each iteration to
- // minimize the total iterations.
- bool SRChanged = false;
- for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
-
- // Place spills for CSRs in MBB.
- SRChanged |= calcSpillPlacements(MBB, cvBlocks, prevCSRSave);
-
- // Place restores for CSRs in MBB.
- SRChanged |= calcRestorePlacements(MBB, cvBlocks, prevCSRRestore);
- }
-
- // Add uses of CSRs used inside loops where needed.
- changed |= addUsesForTopLevelLoops(cvBlocks);
-
- // Add uses for CSRs spilled or restored at branch, join points.
- if (changed || SRChanged) {
- while (! cvBlocks.empty()) {
- MachineBasicBlock* MBB = cvBlocks.pop_back_val();
- changed |= addUsesForMEMERegion(MBB, ncvBlocks);
- }
- if (! ncvBlocks.empty()) {
- cvBlocks = ncvBlocks;
- ncvBlocks.clear();
- }
- }
-
- if (changed) {
- calculateAnticAvail(Fn);
- CSRSave.clear();
- CSRRestore.clear();
- }
- }
-
- // Check for effectiveness:
- // SR0 = {r | r in CSRSave[EntryBlock], CSRRestore[RB], RB in ReturnBlocks}
- // numSRReduced = |(UsedCSRegs - SR0)|, approx. SR0 by CSRSave[EntryBlock]
- // Gives a measure of how many CSR spills have been moved from EntryBlock
- // to minimal regions enclosing their uses.
- CSRegSet notSpilledInEntryBlock = (UsedCSRegs - CSRSave[EntryBlock]);
- unsigned numSRReducedThisFunc = notSpilledInEntryBlock.count();
- numSRReduced += numSRReducedThisFunc;
- DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
- dbgs() << "-----------------------------------------------------------\n";
- dbgs() << "total iterations = " << iterations << " ( "
- << Fn.getName()
- << " " << numSRReducedThisFunc
- << " " << Fn.size()
- << " )\n";
- dbgs() << "-----------------------------------------------------------\n";
- dumpSRSets();
- dbgs() << "-----------------------------------------------------------\n";
- if (numSRReducedThisFunc)
- verifySpillRestorePlacement();
- });
-}
-
-// Debugging methods.
-#ifndef NDEBUG
-/// findFastExitPath - debugging method used to detect functions
-/// with at least one path from the entry block to a return block,
-/// either directly or through a very small number of edges.
-///
-void PEI::findFastExitPath() {
- if (! EntryBlock)
- return;
-  // Find a path from EntryBlock to any return block that does not branch:
- // Entry
- // | ...
- // v |
- // B1<-----+
- // |
- // v
- // Return
- for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
- SE = EntryBlock->succ_end(); SI != SE; ++SI) {
- MachineBasicBlock* SUCC = *SI;
-
- // Assume positive, disprove existence of fast path.
- HasFastExitPath = true;
-
- // Check the immediate successors.
- if (isReturnBlock(SUCC)) {
- if (ShrinkWrapDebugging >= BasicInfo)
- dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
- << "->" << getBasicBlockName(SUCC) << "\n";
- break;
- }
- // Traverse df from SUCC, look for a branch block.
- std::string exitPath = getBasicBlockName(SUCC);
- for (df_iterator<MachineBasicBlock*> BI = df_begin(SUCC),
- BE = df_end(SUCC); BI != BE; ++BI) {
- MachineBasicBlock* SBB = *BI;
- // Reject paths with branch nodes.
- if (SBB->succ_size() > 1) {
- HasFastExitPath = false;
- break;
- }
- exitPath += "->" + getBasicBlockName(SBB);
- }
- if (HasFastExitPath) {
- if (ShrinkWrapDebugging >= BasicInfo)
- dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
- << "->" << exitPath << "\n";
- break;
- }
- }
-}
-
-/// verifySpillRestorePlacement - check the current spill/restore
-/// sets for safety. Attempt to find spills without restores or
-/// restores without spills.
-/// Spills: walk df from each MBB in spill set ensuring that
-/// all CSRs spilled at MBB are restored on all paths
-/// from MBB to all exit blocks.
-/// Restores: walk idf from each MBB in restore set ensuring that
-/// all CSRs restored at MBB are spilled on all paths
-/// reaching MBB.
-///
-void PEI::verifySpillRestorePlacement() {
- unsigned numReturnBlocks = 0;
- for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
- if (isReturnBlock(MBB) || MBB->succ_size() == 0)
- ++numReturnBlocks;
- }
- for (CSRegBlockMap::iterator BI = CSRSave.begin(),
- BE = CSRSave.end(); BI != BE; ++BI) {
- MachineBasicBlock* MBB = BI->first;
- CSRegSet spilled = BI->second;
- CSRegSet restored;
-
- if (spilled.empty())
- continue;
-
- DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(spilled)
- << " RESTORE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
-
- if (CSRRestore[MBB].intersects(spilled)) {
- restored |= (CSRRestore[MBB] & spilled);
- }
-
- // Walk depth first from MBB to find restores of all CSRs spilled at MBB:
- // we must find restores for all spills w/no intervening spills on all
- // paths from MBB to all return blocks.
- for (df_iterator<MachineBasicBlock*> BI = df_begin(MBB),
- BE = df_end(MBB); BI != BE; ++BI) {
- MachineBasicBlock* SBB = *BI;
- if (SBB == MBB)
- continue;
- // Stop when we encounter spills of any CSRs spilled at MBB that
- // have not yet been seen to be restored.
- if (CSRSave[SBB].intersects(spilled) &&
- !restored.contains(CSRSave[SBB] & spilled))
- break;
- // Collect the CSRs spilled at MBB that are restored
- // at this DF successor of MBB.
- if (CSRRestore[SBB].intersects(spilled))
- restored |= (CSRRestore[SBB] & spilled);
-      // If we are at a return block, check that the restores
- // we have seen so far exhaust the spills at MBB, then
- // reset the restores.
- if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
- if (restored != spilled) {
- CSRegSet notRestored = (spilled - restored);
- DEBUG(dbgs() << MF->getName() << ": "
- << stringifyCSRegSet(notRestored)
- << " spilled at " << getBasicBlockName(MBB)
- << " are never restored on path to return "
- << getBasicBlockName(SBB) << "\n");
- }
- restored.clear();
- }
- }
- }
-
- // Check restore placements.
- for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
- BE = CSRRestore.end(); BI != BE; ++BI) {
- MachineBasicBlock* MBB = BI->first;
- CSRegSet restored = BI->second;
- CSRegSet spilled;
-
- if (restored.empty())
- continue;
-
- DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRSave[MBB])
- << " RESTORE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(restored) << "\n");
-
- if (CSRSave[MBB].intersects(restored)) {
- spilled |= (CSRSave[MBB] & restored);
- }
- // Walk inverse depth first from MBB to find spills of all
- // CSRs restored at MBB:
- for (idf_iterator<MachineBasicBlock*> BI = idf_begin(MBB),
- BE = idf_end(MBB); BI != BE; ++BI) {
- MachineBasicBlock* PBB = *BI;
- if (PBB == MBB)
- continue;
- // Stop when we encounter restores of any CSRs restored at MBB that
- // have not yet been seen to be spilled.
- if (CSRRestore[PBB].intersects(restored) &&
- !spilled.contains(CSRRestore[PBB] & restored))
- break;
- // Collect the CSRs restored at MBB that are spilled
- // at this DF predecessor of MBB.
- if (CSRSave[PBB].intersects(restored))
- spilled |= (CSRSave[PBB] & restored);
- }
- if (spilled != restored) {
- CSRegSet notSpilled = (restored - spilled);
- DEBUG(dbgs() << MF->getName() << ": "
- << stringifyCSRegSet(notSpilled)
- << " restored at " << getBasicBlockName(MBB)
- << " are never spilled\n");
- }
- }
-}
-
-// Debugging print methods.
-std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) {
- if (!MBB)
- return "";
-
- if (MBB->getBasicBlock())
- return MBB->getBasicBlock()->getName().str();
-
- std::ostringstream name;
- name << "_MBB_" << MBB->getNumber();
- return name.str();
-}
-
-std::string PEI::stringifyCSRegSet(const CSRegSet& s) {
- const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
- const std::vector<CalleeSavedInfo> CSI =
- MF->getFrameInfo()->getCalleeSavedInfo();
-
- std::ostringstream srep;
- if (CSI.size() == 0) {
- srep << "[]";
- return srep.str();
- }
- srep << "[";
- CSRegSet::iterator I = s.begin(), E = s.end();
- if (I != E) {
- unsigned reg = CSI[*I].getReg();
- srep << TRI->getName(reg);
- for (++I; I != E; ++I) {
- reg = CSI[*I].getReg();
- srep << ",";
- srep << TRI->getName(reg);
- }
- }
- srep << "]";
- return srep.str();
-}
-
-void PEI::dumpSet(const CSRegSet& s) {
- DEBUG(dbgs() << stringifyCSRegSet(s) << "\n");
-}
-
-void PEI::dumpUsed(MachineBasicBlock* MBB) {
- DEBUG({
- if (MBB)
- dbgs() << "CSRUsed[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRUsed[MBB]) << "\n";
- });
-}
-
-void PEI::dumpAllUsed() {
- for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
- dumpUsed(MBB);
- }
-}
-
-void PEI::dumpSets(MachineBasicBlock* MBB) {
- DEBUG({
- if (MBB)
- dbgs() << getBasicBlockName(MBB) << " | "
- << stringifyCSRegSet(CSRUsed[MBB]) << " | "
- << stringifyCSRegSet(AnticIn[MBB]) << " | "
- << stringifyCSRegSet(AnticOut[MBB]) << " | "
- << stringifyCSRegSet(AvailIn[MBB]) << " | "
- << stringifyCSRegSet(AvailOut[MBB]) << "\n";
- });
-}
-
-void PEI::dumpSets1(MachineBasicBlock* MBB) {
- DEBUG({
- if (MBB)
- dbgs() << getBasicBlockName(MBB) << " | "
- << stringifyCSRegSet(CSRUsed[MBB]) << " | "
- << stringifyCSRegSet(AnticIn[MBB]) << " | "
- << stringifyCSRegSet(AnticOut[MBB]) << " | "
- << stringifyCSRegSet(AvailIn[MBB]) << " | "
- << stringifyCSRegSet(AvailOut[MBB]) << " | "
- << stringifyCSRegSet(CSRSave[MBB]) << " | "
- << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
- });
-}
-
-void PEI::dumpAllSets() {
- for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
- MBBI != MBBE; ++MBBI) {
- MachineBasicBlock* MBB = MBBI;
- dumpSets1(MBB);
- }
-}
-
-void PEI::dumpSRSets() {
- DEBUG({
- for (MachineFunction::iterator MBB = MF->begin(), E = MF->end();
- MBB != E; ++MBB) {
- if (!CSRSave[MBB].empty()) {
- dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRSave[MBB]);
- if (CSRRestore[MBB].empty())
- dbgs() << '\n';
- }
-
- if (!CSRRestore[MBB].empty() && !CSRSave[MBB].empty())
- dbgs() << " "
- << "RESTORE[" << getBasicBlockName(MBB) << "] = "
- << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
- }
- });
-}
-#endif
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 2fc8f46..da2e710 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -42,41 +42,40 @@ STATISTIC(NumInvokes, "Number of invokes replaced");
STATISTIC(NumSpilled, "Number of registers live across unwind edges");
namespace {
- class SjLjEHPrepare : public FunctionPass {
- const TargetMachine *TM;
- Type *FunctionContextTy;
- Constant *RegisterFn;
- Constant *UnregisterFn;
- Constant *BuiltinSetjmpFn;
- Constant *FrameAddrFn;
- Constant *StackAddrFn;
- Constant *StackRestoreFn;
- Constant *LSDAAddrFn;
- Value *PersonalityFn;
- Constant *CallSiteFn;
- Constant *FuncCtxFn;
- AllocaInst *FuncCtx;
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit SjLjEHPrepare(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM) { }
- bool doInitialization(Module &M);
- bool runOnFunction(Function &F);
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
- const char *getPassName() const {
- return "SJLJ Exception Handling preparation";
- }
+class SjLjEHPrepare : public FunctionPass {
+ const TargetMachine *TM;
+ Type *FunctionContextTy;
+ Constant *RegisterFn;
+ Constant *UnregisterFn;
+ Constant *BuiltinSetjmpFn;
+ Constant *FrameAddrFn;
+ Constant *StackAddrFn;
+ Constant *StackRestoreFn;
+ Constant *LSDAAddrFn;
+ Value *PersonalityFn;
+ Constant *CallSiteFn;
+ Constant *FuncCtxFn;
+ AllocaInst *FuncCtx;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit SjLjEHPrepare(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+ bool doInitialization(Module &M);
+ bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
+ const char *getPassName() const {
+ return "SJLJ Exception Handling preparation";
+ }
- private:
- bool setupEntryBlockAndCallSites(Function &F);
- void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
- Value *SelVal);
- Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads);
- void lowerIncomingArguments(Function &F);
- void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst*> Invokes);
- void insertCallSiteStore(Instruction *I, int Number);
- };
+private:
+ bool setupEntryBlockAndCallSites(Function &F);
+ void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
+ Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
+ void lowerIncomingArguments(Function &F);
+ void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst *> Invokes);
+ void insertCallSiteStore(Instruction *I, int Number);
+};
} // end anonymous namespace
char SjLjEHPrepare::ID = 0;
@@ -92,23 +91,19 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
// builtin_setjmp uses a five word jbuf
Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
Type *Int32Ty = Type::getInt32Ty(M.getContext());
- FunctionContextTy =
- StructType::get(VoidPtrTy, // __prev
- Int32Ty, // call_site
- ArrayType::get(Int32Ty, 4), // __data
- VoidPtrTy, // __personality
- VoidPtrTy, // __lsda
- ArrayType::get(VoidPtrTy, 5), // __jbuf
- NULL);
- RegisterFn = M.getOrInsertFunction("_Unwind_SjLj_Register",
- Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy),
- (Type *)0);
- UnregisterFn =
- M.getOrInsertFunction("_Unwind_SjLj_Unregister",
- Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy),
- (Type *)0);
+ FunctionContextTy = StructType::get(VoidPtrTy, // __prev
+ Int32Ty, // call_site
+ ArrayType::get(Int32Ty, 4), // __data
+ VoidPtrTy, // __personality
+ VoidPtrTy, // __lsda
+ ArrayType::get(VoidPtrTy, 5), // __jbuf
+ NULL);
+ RegisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy), (Type *)0);
+ UnregisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy), (Type *)0);
FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
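The StructType assembled in this hunk describes the SjLj function-context record; a rough C++ equivalent is shown below for orientation. Field names follow the inline comments in the hunk; the struct itself is illustrative only, since the actual layout and padding are decided by the target DataLayout.

// Hypothetical C++ mirror of FunctionContextTy as built above.
struct SjLjFunctionContext {
  void *prev;           // __prev: link to the enclosing context
  unsigned call_site;   // call_site: index written by insertCallSiteStore
  unsigned data[4];     // __data: exception value and selector land here
  void *personality;    // __personality: personality routine
  void *lsda;           // __lsda: language-specific data area
  void *jbuf[5];        // __jbuf: five-word builtin_setjmp buffer
};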
@@ -134,16 +129,17 @@ void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
Value *CallSite = Builder.CreateGEP(FuncCtx, Idxs, "call_site");
// Insert a store of the call-site number
- ConstantInt *CallSiteNoC = ConstantInt::get(Type::getInt32Ty(I->getContext()),
- Number);
- Builder.CreateStore(CallSiteNoC, CallSite, true/*volatile*/);
+ ConstantInt *CallSiteNoC =
+ ConstantInt::get(Type::getInt32Ty(I->getContext()), Number);
+ Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
}
/// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
/// we reach blocks we've already seen.
static void MarkBlocksLiveIn(BasicBlock *BB,
- SmallPtrSet<BasicBlock*, 64> &LiveBBs) {
- if (!LiveBBs.insert(BB)) return; // already been here.
+ SmallPtrSet<BasicBlock *, 64> &LiveBBs) {
+ if (!LiveBBs.insert(BB))
+ return; // already been here.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
MarkBlocksLiveIn(*PI, LiveBBs);
@@ -153,12 +149,14 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
/// instruction with those returned by the personality function.
void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
Value *SelVal) {
- SmallVector<Value*, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
+ SmallVector<Value *, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
while (!UseWorkList.empty()) {
Value *Val = UseWorkList.pop_back_val();
ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Val);
- if (!EVI) continue;
- if (EVI->getNumIndices() != 1) continue;
+ if (!EVI)
+ continue;
+ if (EVI->getNumIndices() != 1)
+ continue;
if (*EVI->idx_begin() == 0)
EVI->replaceAllUsesWith(ExnVal);
else if (*EVI->idx_begin() == 1)
@@ -167,14 +165,15 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
EVI->eraseFromParent();
}
- if (LPI->getNumUses() == 0) return;
+ if (LPI->getNumUses() == 0)
+ return;
// There are still some uses of LPI. Construct an aggregate with the exception
// values and replace the LPI with that aggregate.
Type *LPadType = LPI->getType();
Value *LPadVal = UndefValue::get(LPadType);
- IRBuilder<>
- Builder(llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
+ IRBuilder<> Builder(
+ llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
@@ -183,8 +182,8 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
/// setupFunctionContext - Allocate the function context on the stack and fill
/// it with all of the data that we know at this point.
-Value *SjLjEHPrepare::
-setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
+Value *SjLjEHPrepare::setupFunctionContext(Function &F,
+ ArrayRef<LandingPadInst *> LPads) {
BasicBlock *EntryBB = F.begin();
// Create an alloca for the incoming jump buffer ptr and the new jump buffer
@@ -192,9 +191,9 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
// because the value needs to be added to the global context list.
const TargetLowering *TLI = TM->getTargetLowering();
unsigned Align =
- TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
- FuncCtx =
- new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
+ TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
+ FuncCtx = new AllocaInst(FunctionContextTy, 0, Align, "fn_context",
+ EntryBB->begin());
// Fill in the function context structure.
for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
@@ -205,13 +204,13 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
Value *FCData = Builder.CreateConstGEP2_32(FuncCtx, 0, 2, "__data");
// The exception values come back in context->__data[0].
- Value *ExceptionAddr = Builder.CreateConstGEP2_32(FCData, 0, 0,
- "exception_gep");
+ Value *ExceptionAddr =
+ Builder.CreateConstGEP2_32(FCData, 0, 0, "exception_gep");
Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
- Value *SelectorAddr = Builder.CreateConstGEP2_32(FCData, 0, 1,
- "exn_selector_gep");
+ Value *SelectorAddr =
+ Builder.CreateConstGEP2_32(FCData, 0, 1, "exn_selector_gep");
Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
substituteLPadValues(LPI, ExnVal, SelVal);
@@ -221,11 +220,11 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
IRBuilder<> Builder(EntryBB->getTerminator());
if (!PersonalityFn)
PersonalityFn = LPads[0]->getPersonalityFn();
- Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 3,
- "pers_fn_gep");
- Builder.CreateStore(Builder.CreateBitCast(PersonalityFn,
- Builder.getInt8PtrTy()),
- PersonalityFieldPtr, /*isVolatile=*/true);
+ Value *PersonalityFieldPtr =
+ Builder.CreateConstGEP2_32(FuncCtx, 0, 3, "pers_fn_gep");
+ Builder.CreateStore(
+ Builder.CreateBitCast(PersonalityFn, Builder.getInt8PtrTy()),
+ PersonalityFieldPtr, /*isVolatile=*/true);
// LSDA address
Value *LSDA = Builder.CreateCall(LSDAAddrFn, "lsda_addr");
@@ -245,8 +244,8 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize()))
++AfterAllocaInsPt;
- for (Function::arg_iterator
- AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
+ for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
+ ++AI) {
Type *Ty = AI->getType();
// Aggregate types can't be cast, but are legal argument types, so we have
@@ -265,9 +264,8 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
// This is always a no-op cast because we're casting AI to AI->getType()
// so src and destination types are identical. BitCast is the only
// possibility.
- CastInst *NC =
- new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
- AfterAllocaInsPt);
+ CastInst *NC = new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
+ AfterAllocaInsPt);
AI->replaceAllUsesWith(NC);
// Set the operand of the cast instruction back to the AllocaInst.
@@ -284,20 +282,21 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
/// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
/// edge and spill them.
void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
- ArrayRef<InvokeInst*> Invokes) {
+ ArrayRef<InvokeInst *> Invokes) {
// Finally, scan the code looking for instructions with bad live ranges.
- for (Function::iterator
- BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
- for (BasicBlock::iterator
- II = BB->begin(), IIE = BB->end(); II != IIE; ++II) {
+ for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+ for (BasicBlock::iterator II = BB->begin(), IIE = BB->end(); II != IIE;
+ ++II) {
// Ignore obvious cases we don't have to handle. In particular, most
// instructions either have no uses or only have a single use inside the
// current block. Ignore them quickly.
Instruction *Inst = II;
- if (Inst->use_empty()) continue;
+ if (Inst->use_empty())
+ continue;
if (Inst->hasOneUse() &&
cast<Instruction>(Inst->use_back())->getParent() == BB &&
- !isa<PHINode>(Inst->use_back())) continue;
+ !isa<PHINode>(Inst->use_back()))
+ continue;
// If this is an alloca in the entry block, it's not a real register
// value.
@@ -306,16 +305,16 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
continue;
// Avoid iterator invalidation by copying users to a temporary vector.
- SmallVector<Instruction*, 16> Users;
- for (Value::use_iterator
- UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) {
+ SmallVector<Instruction *, 16> Users;
+ for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+ UI != E; ++UI) {
Instruction *User = cast<Instruction>(*UI);
if (User->getParent() != BB || isa<PHINode>(User))
Users.push_back(User);
}
// Find all of the blocks that this value is live in.
- SmallPtrSet<BasicBlock*, 64> LiveBBs;
+ SmallPtrSet<BasicBlock *, 64> LiveBBs;
LiveBBs.insert(Inst->getParent());
while (!Users.empty()) {
Instruction *U = Users.back();
@@ -339,7 +338,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
DEBUG(dbgs() << "SJLJ Spill: " << *Inst << " around "
- << UnwindBlock->getName() << "\n");
+ << UnwindBlock->getName() << "\n");
NeedsSpill = true;
break;
}
@@ -362,15 +361,16 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
// Place PHIs into a set to avoid invalidating the iterator.
- SmallPtrSet<PHINode*, 8> PHIsToDemote;
- for (BasicBlock::iterator
- PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
+ SmallPtrSet<PHINode *, 8> PHIsToDemote;
+ for (BasicBlock::iterator PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
PHIsToDemote.insert(cast<PHINode>(PN));
- if (PHIsToDemote.empty()) continue;
+ if (PHIsToDemote.empty())
+ continue;
// Demote the PHIs to the stack.
- for (SmallPtrSet<PHINode*, 8>::iterator
- I = PHIsToDemote.begin(), E = PHIsToDemote.end(); I != E; ++I)
+ for (SmallPtrSet<PHINode *, 8>::iterator I = PHIsToDemote.begin(),
+ E = PHIsToDemote.end();
+ I != E; ++I)
DemotePHIToStack(*I);
// Move the landingpad instruction back to the top of the landing pad block.
@@ -382,9 +382,9 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
/// the function context and marking the call sites with the appropriate
/// values. These values are used by the DWARF EH emitter.
bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
- SmallVector<ReturnInst*, 16> Returns;
- SmallVector<InvokeInst*, 16> Invokes;
- SmallSetVector<LandingPadInst*, 16> LPads;
+ SmallVector<ReturnInst *, 16> Returns;
+ SmallVector<InvokeInst *, 16> Invokes;
+ SmallSetVector<LandingPadInst *, 16> LPads;
// Look through the terminators of the basic blocks to find invokes.
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -404,7 +404,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
Returns.push_back(RI);
}
- if (Invokes.empty()) return false;
+ if (Invokes.empty())
+ return false;
NumInvokes += Invokes.size();
@@ -412,7 +413,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
lowerAcrossUnwindEdges(F, Invokes);
Value *FuncCtx =
- setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
+ setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
BasicBlock *EntryBB = F.begin();
IRBuilder<> Builder(EntryBB->getTerminator());
@@ -446,7 +447,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
insertCallSiteStore(Invokes[I], I + 1);
ConstantInt *CallSiteNum =
- ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
// Record the call site value for the back end so it stays associated with
// the invoke.
@@ -468,8 +469,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
}
// Register the function context and make sure it's known to not throw
- CallInst *Register = CallInst::Create(RegisterFn, FuncCtx, "",
- EntryBB->getTerminator());
+ CallInst *Register =
+ CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
Register->setDoesNotThrow();
// Following any allocas not in the entry block, update the saved SP in the
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 209792f..d5b3a4a 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -77,7 +77,7 @@ protected:
DEBUG(dbgs() << "Spilling everywhere " << *li << "\n");
- assert(li->weight != HUGE_VALF &&
+ assert(li->weight != llvm::huge_valf &&
"Attempting to spill already spilled value.");
assert(!TargetRegisterInfo::isStackSlot(li->reg) &&
@@ -115,15 +115,14 @@ protected:
indices.push_back(i);
}
- // Create a new vreg & interval for this instr.
- LiveInterval *newLI = &LRE.create();
- newLI->weight = HUGE_VALF;
+ // Create a new virtual register for the load and/or store.
+ unsigned NewVReg = LRE.create();
// Update the reg operands & kill flags.
for (unsigned i = 0; i < indices.size(); ++i) {
unsigned mopIdx = indices[i];
MachineOperand &mop = mi->getOperand(mopIdx);
- mop.setReg(newLI->reg);
+ mop.setReg(NewVReg);
if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) {
mop.setIsKill(true);
}
@@ -133,28 +132,20 @@ protected:
// Insert reload if necessary.
MachineBasicBlock::iterator miItr(mi);
if (hasUse) {
- tii->loadRegFromStackSlot(*mi->getParent(), miItr, newLI->reg, ss, trc,
+ MachineInstrSpan MIS(miItr);
+
+ tii->loadRegFromStackSlot(*mi->getParent(), miItr, NewVReg, ss, trc,
tri);
- MachineInstr *loadInstr(prior(miItr));
- SlotIndex loadIndex =
- lis->InsertMachineInstrInMaps(loadInstr).getRegSlot();
- SlotIndex endIndex = loadIndex.getNextIndex();
- VNInfo *loadVNI =
- newLI->getNextValue(loadIndex, lis->getVNInfoAllocator());
- newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI));
+ lis->InsertMachineInstrRangeInMaps(MIS.begin(), miItr);
}
// Insert store if necessary.
if (hasDef) {
- tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr),newLI->reg,
+ MachineInstrSpan MIS(miItr);
+
+ tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr), NewVReg,
true, ss, trc, tri);
- MachineInstr *storeInstr(llvm::next(miItr));
- SlotIndex storeIndex =
- lis->InsertMachineInstrInMaps(storeInstr).getRegSlot();
- SlotIndex beginIndex = storeIndex.getPrevIndex();
- VNInfo *storeVNI =
- newLI->getNextValue(beginIndex, lis->getVNInfoAllocator());
- newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI));
+ lis->InsertMachineInstrRangeInMaps(llvm::next(miItr), MIS.end());
}
}
}
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index e717fac..68a15f7 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -214,7 +214,7 @@ bool SplitAnalysis::calcLiveBlockInfo() {
// When not live in, the first use should be a def.
if (!BI.LiveIn) {
- assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+ assert(LVI->start == LVI->valno->def && "Dangling Segment start");
assert(LVI->start == BI.FirstInstr && "First instr should be a def");
BI.FirstDef = BI.FirstInstr;
}
@@ -245,8 +245,8 @@ bool SplitAnalysis::calcLiveBlockInfo() {
BI.FirstInstr = BI.FirstDef = LVI->start;
}
- // A LiveRange that starts in the middle of the block must be a def.
- assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+ // A Segment that starts in the middle of the block must be a def.
+ assert(LVI->start == LVI->valno->def && "Dangling Segment start");
if (!BI.FirstDef)
BI.FirstDef = LVI->start;
}
@@ -377,7 +377,7 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
assert(ParentVNI && "Mapping NULL value");
assert(Idx.isValid() && "Invalid SlotIndex");
assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
- LiveInterval *LI = Edit->get(RegIdx);
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
// Create a new value.
VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());
@@ -395,14 +395,14 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
// If the previous value was a simple mapping, add liveness for it now.
if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
SlotIndex Def = OldVNI->def;
- LI->addRange(LiveRange(Def, Def.getDeadSlot(), OldVNI));
+ LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), OldVNI));
// No longer a simple mapping. Switch to a complex, non-forced mapping.
InsP.first->second = ValueForcePair();
}
// This is a complex mapping, add liveness for VNI
SlotIndex Def = VNI->def;
- LI->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+ LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
return VNI;
}
@@ -422,7 +422,8 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
// This was previously a single mapping. Make sure the old def is represented
// by a trivial live range.
SlotIndex Def = VNI->def;
- Edit->get(RegIdx)->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+ LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
// Mark as complex mapped, forced.
VFP = ValueForcePair(0, true);
}
@@ -434,7 +435,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
MachineBasicBlock::iterator I) {
MachineInstr *CopyMI = 0;
SlotIndex Def;
- LiveInterval *LI = Edit->get(RegIdx);
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
// We may be trying to avoid interference that ends at a deleted instruction,
// so always begin RegIdx 0 early and all others late.
@@ -462,11 +463,11 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
unsigned SplitEditor::openIntv() {
// Create the complement as index 0.
if (Edit->empty())
- Edit->create();
+ Edit->createEmptyInterval();
// Create the open interval.
OpenIdx = Edit->size();
- Edit->create();
+ Edit->createEmptyInterval();
return OpenIdx;
}
@@ -631,7 +632,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
//===----------------------------------------------------------------------===//
void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
- LiveInterval *LI = Edit->get(0);
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
RegAssignMap::iterator AssignI;
AssignI.setMap(RegAssign);
@@ -730,7 +731,7 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
void SplitEditor::hoistCopiesForSize() {
// Get the complement interval, always RegIdx 0.
- LiveInterval *LI = Edit->get(0);
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
LiveInterval *Parent = &Edit->getParent();
// Track the nearest common dominator for all back-copies for each ParentVNI,
@@ -861,13 +862,13 @@ bool SplitEditor::transferValues() {
// The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx);
- LiveInterval *LI = Edit->get(RegIdx);
+ LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
// Check for a simply defined value that can be blitted directly.
ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
if (VNInfo *VNI = VFP.getPointer()) {
DEBUG(dbgs() << ':' << VNI->id);
- LI->addRange(LiveRange(Start, End, VNI));
+ LR.addSegment(LiveInterval::Segment(Start, End, VNI));
Start = End;
continue;
}
@@ -891,7 +892,7 @@ bool SplitEditor::transferValues() {
// The first block may be live-in, or it may have its own def.
if (Start != BlockStart) {
- VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+ VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
assert(VNI && "Missing def for complex mapped value");
DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
// MBB has its own def. Is it also live-out?
@@ -911,7 +912,7 @@ bool SplitEditor::transferValues() {
if (BlockStart == ParentVNI->def) {
// This block has the def of a parent PHI, so it isn't live-in.
assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
- VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+ VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
assert(VNI && "Missing def for complex mapped parent PHI");
if (End >= BlockEnd)
LRC.setLiveOutValue(MBB, VNI); // Live-out as well.
@@ -919,10 +920,10 @@ bool SplitEditor::transferValues() {
// This block needs a live-in value. The last block covered may not
// be live-out.
if (End < BlockEnd)
- LRC.addLiveInBlock(LI, MDT[MBB], End);
+ LRC.addLiveInBlock(LR, MDT[MBB], End);
else {
// Live-through, and we don't know the value.
- LRC.addLiveInBlock(LI, MDT[MBB]);
+ LRC.addLiveInBlock(LR, MDT[MBB]);
LRC.setLiveOutValue(MBB, 0);
}
}
@@ -949,7 +950,7 @@ void SplitEditor::extendPHIKillRanges() {
if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
continue;
unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
- LiveInterval *LI = Edit->get(RegIdx);
+ LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
LiveRangeCalc &LRC = getLRCalc(RegIdx);
MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
@@ -961,7 +962,7 @@ void SplitEditor::extendPHIKillRanges() {
if (Edit->getParent().liveAt(LastUse)) {
assert(RegAssign.lookup(LastUse) == RegIdx &&
"Different register assignment in phi predecessor");
- LRC.extend(LI, End);
+ LRC.extend(LR, End);
}
}
}
@@ -990,7 +991,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
// Rewrite to the mapped register at Idx.
unsigned RegIdx = RegAssign.lookup(Idx);
- LiveInterval *LI = Edit->get(RegIdx);
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
MO.setReg(LI->reg);
DEBUG(dbgs() << " rewr BB#" << MI->getParent()->getNumber() << '\t'
<< Idx << ':' << RegIdx << '\t' << *MI);
@@ -1011,14 +1012,14 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
} else
Idx = Idx.getRegSlot(true);
- getLRCalc(RegIdx).extend(LI, Idx.getNextSlot());
+ getLRCalc(RegIdx).extend(*LI, Idx.getNextSlot());
}
}
void SplitEditor::deleteRematVictims() {
SmallVector<MachineInstr*, 8> Dead;
for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
- LiveInterval *LI = *I;
+ LiveInterval *LI = &LIS.getInterval(*I);
for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end();
LII != LIE; ++LII) {
// Dead defs end at the dead slot.
@@ -1091,8 +1092,10 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
deleteRematVictims();
// Get rid of unused values and set phi-kill flags.
- for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I)
- (*I)->RenumberValues(LIS);
+ for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I) {
+ LiveInterval &LI = LIS.getInterval(*I);
+ LI.RenumberValues();
+ }
// Provide a reverse mapping from original indices to Edit ranges.
if (LRMap) {
@@ -1105,7 +1108,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
ConnectedVNInfoEqClasses ConEQ(LIS);
for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
// Don't use iterators, they are invalidated by create() below.
- LiveInterval *li = Edit->get(i);
+ LiveInterval *li = &LIS.getInterval(Edit->get(i));
unsigned NumComp = ConEQ.Classify(li);
if (NumComp <= 1)
continue;
@@ -1113,7 +1116,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
SmallVector<LiveInterval*, 8> dups;
dups.push_back(li);
for (unsigned j = 1; j != NumComp; ++j)
- dups.push_back(&Edit->create());
+ dups.push_back(&Edit->createEmptyInterval());
ConEQ.Distribute(&dups[0], MRI);
// The new intervals all map back to i.
if (LRMap)
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index faaa6e7..3dbc050 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -170,7 +170,7 @@ private:
/// slots to use the joint slots.
void remapInstructions(DenseMap<int, int> &SlotRemap);
- /// The input program may contain intructions which are not inside lifetime
+ /// The input program may contain instructions which are not inside lifetime
/// markers. This can happen due to a bug in the compiler or due to a bug in
/// user code (for example, returning a reference to a local variable).
/// This procedure checks all of the instructions in the function and
@@ -450,14 +450,14 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
SlotIndex F = Finishes[i];
if (S < F) {
// We have a single consecutive region.
- Intervals[i]->addRange(LiveRange(S, F, ValNum));
+ Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
} else {
// We have two non consecutive regions. This happens when
// LIFETIME_START appears after the LIFETIME_END marker.
SlotIndex NewStart = Indexes->getMBBStartIdx(MBB);
SlotIndex NewFin = Indexes->getMBBEndIdx(MBB);
- Intervals[i]->addRange(LiveRange(NewStart, F, ValNum));
- Intervals[i]->addRange(LiveRange(S, NewFin, ValNum));
+ Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
+ Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
}
}
}
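The non-consecutive case above arises when a slot's LIFETIME_START is seen after its LIFETIME_END within a block, so the live range wraps around the block boundaries. A small sketch of that interval arithmetic, with plain integers standing in for SlotIndex and a hypothetical helper name (not part of the pass):

#include <utility>
#include <vector>

// Given a block [BlockStart, BlockEnd) and the slot indices S (start marker)
// and F (end marker), return the segments over which the stack slot is live.
std::vector<std::pair<int, int>> liveSegments(int BlockStart, int BlockEnd,
                                              int S, int F) {
  if (S < F)
    return {{S, F}};                       // single consecutive region
  // Start marker appears after the end marker: live from block entry to F
  // and from S to block exit.
  return {{BlockStart, F}, {S, BlockEnd}};
}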
@@ -763,7 +763,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
// Merge disjoint slots.
if (!First->overlaps(*Second)) {
Changed = true;
- First->MergeRangesInAsValue(*Second, First->getValNumInfo(0));
+ First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
SlotRemap[SecondSlot] = FirstSlot;
SortedSlots[J] = -1;
DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
new file mode 100644
index 0000000..40893ea
--- /dev/null
+++ b/lib/CodeGen/StackMaps.cpp
@@ -0,0 +1,314 @@
+//===---------------------------- StackMaps.cpp ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackmaps"
+
+#include "llvm/CodeGen/StackMaps.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <iterator>
+
+using namespace llvm;
+
+PatchPointOpers::PatchPointOpers(const MachineInstr *MI):
+ MI(MI),
+ HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+ !MI->getOperand(0).isImplicit()),
+ IsAnyReg(MI->getOperand(getMetaIdx(CCPos)).getImm() == CallingConv::AnyReg) {
+
+#ifndef NDEBUG
+ {
+ unsigned CheckStartIdx = 0, e = MI->getNumOperands();
+ while (CheckStartIdx < e && MI->getOperand(CheckStartIdx).isReg() &&
+ MI->getOperand(CheckStartIdx).isDef() &&
+ !MI->getOperand(CheckStartIdx).isImplicit())
+ ++CheckStartIdx;
+
+ assert(getMetaIdx() == CheckStartIdx &&
+           "Unexpected additional definition in Patchpoint intrinsic.");
+ }
+#endif
+}
+
+unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
+ if (!StartIdx)
+ StartIdx = getVarIdx();
+
+ // Find the next scratch register (implicit def and early clobber)
+ unsigned ScratchIdx = StartIdx, e = MI->getNumOperands();
+ while (ScratchIdx < e &&
+ !(MI->getOperand(ScratchIdx).isReg() &&
+ MI->getOperand(ScratchIdx).isDef() &&
+ MI->getOperand(ScratchIdx).isImplicit() &&
+ MI->getOperand(ScratchIdx).isEarlyClobber()))
+ ++ScratchIdx;
+
+ assert(ScratchIdx != e && "No scratch register available");
+ return ScratchIdx;
+}
+
+void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint32_t ID,
+ MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE,
+ bool recordResult) {
+
+ MCContext &OutContext = AP.OutStreamer.getContext();
+ MCSymbol *MILabel = OutContext.CreateTempSymbol();
+ AP.OutStreamer.EmitLabel(MILabel);
+
+ LocationVec CallsiteLocs;
+
+ if (recordResult) {
+ std::pair<Location, MachineInstr::const_mop_iterator> ParseResult =
+ OpParser(MI.operands_begin(), llvm::next(MI.operands_begin()), AP.TM);
+
+ Location &Loc = ParseResult.first;
+ assert(Loc.LocType == Location::Register &&
+ "Stackmap return location must be a register.");
+ CallsiteLocs.push_back(Loc);
+ }
+
+ while (MOI != MOE) {
+ std::pair<Location, MachineInstr::const_mop_iterator> ParseResult =
+ OpParser(MOI, MOE, AP.TM);
+
+ Location &Loc = ParseResult.first;
+
+ // Move large constants into the constant pool.
+ if (Loc.LocType == Location::Constant && (Loc.Offset & ~0xFFFFFFFFULL)) {
+ Loc.LocType = Location::ConstantIndex;
+ Loc.Offset = ConstPool.getConstantIndex(Loc.Offset);
+ }
+
+ CallsiteLocs.push_back(Loc);
+ MOI = ParseResult.second;
+ }
+
+ const MCExpr *CSOffsetExpr = MCBinaryExpr::CreateSub(
+ MCSymbolRefExpr::Create(MILabel, OutContext),
+ MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext),
+ OutContext);
+
+ CSInfos.push_back(CallsiteInfo(CSOffsetExpr, ID, CallsiteLocs));
+}
+
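The large-constant test in recordStackMapOpers keys off the upper 32 bits of the operand: only values whose high bits are all zero stay inline in the 32-bit Offset field, and everything else (including small negative values) is routed through the constant pool. A standalone restatement of the check, with a hypothetical helper name:

#include <cstdint>

// True if the constant can be encoded inline in the 32-bit Offset field;
// any value with non-zero upper bits goes to the constant pool instead.
bool fitsInlineConstant(int64_t V) {
  return (static_cast<uint64_t>(V) & ~0xFFFFFFFFULL) == 0;
}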
+static MachineInstr::const_mop_iterator
+getStackMapEndMOP(MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE) {
+ for (; MOI != MOE; ++MOI)
+ if (MOI->isRegMask() || (MOI->isReg() && MOI->isImplicit()))
+ break;
+
+ return MOI;
+}
+
+void StackMaps::recordStackMap(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::STACKMAP && "expected stackmap");
+
+ int64_t ID = MI.getOperand(0).getImm();
+ assert((int32_t)ID == ID && "Stack maps hold 32-bit IDs");
+ recordStackMapOpers(MI, ID, llvm::next(MI.operands_begin(), 2),
+ getStackMapEndMOP(MI.operands_begin(),
+ MI.operands_end()));
+}
+
+void StackMaps::recordPatchPoint(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::PATCHPOINT && "expected patchpoint");
+
+ PatchPointOpers opers(&MI);
+ int64_t ID = opers.getMetaOper(PatchPointOpers::IDPos).getImm();
+ assert((int32_t)ID == ID && "Stack maps hold 32-bit IDs");
+ MachineInstr::const_mop_iterator MOI =
+ llvm::next(MI.operands_begin(), opers.getStackMapStartIdx());
+ recordStackMapOpers(MI, ID, MOI, getStackMapEndMOP(MOI, MI.operands_end()),
+ opers.isAnyReg() && opers.hasDef());
+
+#ifndef NDEBUG
+ // verify anyregcc
+ LocationVec &Locations = CSInfos.back().Locations;
+ if (opers.isAnyReg()) {
+ unsigned NArgs = opers.getMetaOper(PatchPointOpers::NArgPos).getImm();
+ for (unsigned i = 0, e = (opers.hasDef() ? NArgs+1 : NArgs); i != e; ++i)
+ assert(Locations[i].LocType == Location::Register &&
+ "anyreg arg must be in reg.");
+ }
+#endif
+}
+
+/// serializeToStackMapSection conceptually populates the following fields:
+///
+/// uint32 : Reserved (header)
+/// uint32 : NumConstants
+/// int64 : Constants[NumConstants]
+/// uint32 : NumRecords
+/// StkMapRecord[NumRecords] {
+/// uint32 : PatchPoint ID
+/// uint32 : Instruction Offset
+/// uint16 : Reserved (record flags)
+/// uint16 : NumLocations
+/// Location[NumLocations] {
+/// uint8 : Register | Direct | Indirect | Constant | ConstantIndex
+/// uint8 : Size in Bytes
+/// uint16 : Dwarf RegNum
+/// int32 : Offset
+/// }
+/// }
+///
+/// Location Encoding, Type, Value:
+/// 0x1, Register, Reg (value in register)
+/// 0x2, Direct, Reg + Offset (frame index)
+/// 0x3, Indirect, [Reg + Offset] (spilled value)
+/// 0x4, Constant, Offset (small constant)
+/// 0x5, ConstIndex, Constants[Offset] (large constant)
+///
+void StackMaps::serializeToStackMapSection() {
+ // Bail out if there's no stack map data.
+ if (CSInfos.empty())
+ return;
+
+ MCContext &OutContext = AP.OutStreamer.getContext();
+ const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
+
+ // Create the section.
+ const MCSection *StackMapSection =
+ OutContext.getObjectFileInfo()->getStackMapSection();
+ AP.OutStreamer.SwitchSection(StackMapSection);
+
+ // Emit a dummy symbol to force section inclusion.
+ AP.OutStreamer.EmitLabel(
+ OutContext.GetOrCreateSymbol(Twine("__LLVM_StackMaps")));
+
+ // Serialize data.
+ const char *WSMP = "Stack Maps: ";
+ (void)WSMP;
+ const MCRegisterInfo &MCRI = *OutContext.getRegisterInfo();
+
+ DEBUG(dbgs() << "********** Stack Map Output **********\n");
+
+ // Header.
+ AP.OutStreamer.EmitIntValue(0, 4);
+
+ // Num constants.
+ AP.OutStreamer.EmitIntValue(ConstPool.getNumConstants(), 4);
+
+ // Constant pool entries.
+ for (unsigned i = 0; i < ConstPool.getNumConstants(); ++i)
+ AP.OutStreamer.EmitIntValue(ConstPool.getConstant(i), 8);
+
+ DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << "\n");
+ AP.OutStreamer.EmitIntValue(CSInfos.size(), 4);
+
+ for (CallsiteInfoList::const_iterator CSII = CSInfos.begin(),
+ CSIE = CSInfos.end();
+ CSII != CSIE; ++CSII) {
+
+ unsigned CallsiteID = CSII->ID;
+ const LocationVec &CSLocs = CSII->Locations;
+
+ DEBUG(dbgs() << WSMP << "callsite " << CallsiteID << "\n");
+
+ // Verify stack map entry. It's better to communicate a problem to the
+ // runtime than crash in case of in-process compilation. Currently, we do
+ // simple overflow checks, but we may eventually communicate other
+ // compilation errors this way.
+ if (CSLocs.size() > UINT16_MAX) {
+ AP.OutStreamer.EmitIntValue(UINT32_MAX, 4); // Invalid ID.
+ AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+ AP.OutStreamer.EmitIntValue(0, 2); // Reserved.
+ AP.OutStreamer.EmitIntValue(0, 2); // 0 locations.
+ continue;
+ }
+
+ AP.OutStreamer.EmitIntValue(CallsiteID, 4);
+ AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+
+ // Reserved for flags.
+ AP.OutStreamer.EmitIntValue(0, 2);
+
+ DEBUG(dbgs() << WSMP << " has " << CSLocs.size() << " locations\n");
+
+ AP.OutStreamer.EmitIntValue(CSLocs.size(), 2);
+
+ unsigned operIdx = 0;
+ for (LocationVec::const_iterator LocI = CSLocs.begin(), LocE = CSLocs.end();
+ LocI != LocE; ++LocI, ++operIdx) {
+ const Location &Loc = *LocI;
+ DEBUG(
+ dbgs() << WSMP << " Loc " << operIdx << ": ";
+ switch (Loc.LocType) {
+ case Location::Unprocessed:
+ dbgs() << "<Unprocessed operand>";
+ break;
+ case Location::Register:
+ dbgs() << "Register " << MCRI.getName(Loc.Reg);
+ break;
+ case Location::Direct:
+ dbgs() << "Direct " << MCRI.getName(Loc.Reg);
+ if (Loc.Offset)
+ dbgs() << " + " << Loc.Offset;
+ break;
+ case Location::Indirect:
+ dbgs() << "Indirect " << MCRI.getName(Loc.Reg)
+ << " + " << Loc.Offset;
+ break;
+ case Location::Constant:
+ dbgs() << "Constant " << Loc.Offset;
+ break;
+ case Location::ConstantIndex:
+ dbgs() << "Constant Index " << Loc.Offset;
+ break;
+ }
+ dbgs() << "\n";
+ );
+
+ unsigned RegNo = 0;
+ int Offset = Loc.Offset;
+      if (Loc.Reg) {
+ RegNo = MCRI.getDwarfRegNum(Loc.Reg, false);
+ for (MCSuperRegIterator SR(Loc.Reg, TRI);
+ SR.isValid() && (int)RegNo < 0; ++SR) {
+ RegNo = TRI->getDwarfRegNum(*SR, false);
+ }
+ // If this is a register location, put the subregister byte offset in
+ // the location offset.
+ if (Loc.LocType == Location::Register) {
+ assert(!Loc.Offset && "Register location should have zero offset");
+ unsigned LLVMRegNo = MCRI.getLLVMRegNum(RegNo, false);
+ unsigned SubRegIdx = MCRI.getSubRegIndex(LLVMRegNo, Loc.Reg);
+ if (SubRegIdx)
+ Offset = MCRI.getSubRegIdxOffset(SubRegIdx);
+ }
+ }
+ else {
+ assert(Loc.LocType != Location::Register &&
+ "Missing location register");
+ }
+ AP.OutStreamer.EmitIntValue(Loc.LocType, 1);
+ AP.OutStreamer.EmitIntValue(Loc.Size, 1);
+ AP.OutStreamer.EmitIntValue(RegNo, 2);
+ AP.OutStreamer.EmitIntValue(Offset, 4);
+ }
+ }
+
+ AP.OutStreamer.AddBlankLine();
+
+ CSInfos.clear();
+}
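Read back from the consumer's side, the byte stream emitted above corresponds to the layout documented at the top of serializeToStackMapSection. The structs below are a hypothetical host-side mirror for orientation only; the section is written as raw integers in target byte order, not as these structs.

#include <cstdint>
#include <vector>

struct StackMapLocation {
  uint8_t Type;          // 0x1 Register, 0x2 Direct, 0x3 Indirect,
                         // 0x4 Constant, 0x5 ConstantIndex
  uint8_t Size;          // size in bytes
  uint16_t DwarfRegNum;  // Dwarf register number (0 for constants)
  int32_t Offset;        // offset, small constant, or constant-pool index
};

struct StackMapRecord {
  uint32_t PatchPointID;
  uint32_t InstructionOffset;              // label offset from the function symbol
  uint16_t Reserved;                       // record flags
  std::vector<StackMapLocation> Locations; // NumLocations entries
};

struct StackMapSection {
  uint32_t Reserved;                    // header
  std::vector<int64_t> Constants;       // NumConstants entries
  std::vector<StackMapRecord> Records;  // NumRecords entries
};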
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 4c56380..9020449 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -15,11 +15,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "stack-protector"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -27,12 +29,12 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLowering.h"
#include <cstdlib>
using namespace llvm;
@@ -40,137 +42,93 @@ STATISTIC(NumFunProtected, "Number of functions protected");
STATISTIC(NumAddrTaken, "Number of local variables that have their address"
" taken.");
-namespace {
- class StackProtector : public FunctionPass {
- const TargetMachine *TM;
-
- /// TLI - Keep a pointer of a TargetLowering to consult for determining
- /// target type sizes.
- const TargetLoweringBase *TLI;
- const Triple Trip;
-
- Function *F;
- Module *M;
-
- DominatorTree *DT;
-
- /// \brief The minimum size of buffers that will receive stack smashing
- /// protection when -fstack-protection is used.
- unsigned SSPBufferSize;
-
- /// VisitedPHIs - The set of PHI nodes visited when determining
- /// if a variable's reference has been taken. This set
- /// is maintained to ensure we don't visit the same PHI node multiple
- /// times.
- SmallPtrSet<const PHINode*, 16> VisitedPHIs;
-
- /// InsertStackProtectors - Insert code into the prologue and epilogue of
- /// the function.
- ///
- /// - The prologue code loads and stores the stack guard onto the stack.
- /// - The epilogue checks the value stored in the prologue against the
- /// original value. It calls __stack_chk_fail if they differ.
- bool InsertStackProtectors();
-
- /// CreateFailBB - Create a basic block to jump to when the stack protector
- /// check fails.
- BasicBlock *CreateFailBB();
-
- /// ContainsProtectableArray - Check whether the type either is an array or
- /// contains an array of sufficient size so that we need stack protectors
- /// for it.
- bool ContainsProtectableArray(Type *Ty, bool Strong = false,
- bool InStruct = false) const;
-
- /// \brief Check whether a stack allocation has its address taken.
- bool HasAddressTaken(const Instruction *AI);
-
- /// RequiresStackProtector - Check whether or not this function needs a
- /// stack protector based upon the stack protector level.
- bool RequiresStackProtector();
- public:
- static char ID; // Pass identification, replacement for typeid.
- StackProtector() : FunctionPass(ID), TM(0), TLI(0), SSPBufferSize(0) {
- initializeStackProtectorPass(*PassRegistry::getPassRegistry());
- }
- StackProtector(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), TLI(0), Trip(TM->getTargetTriple()),
- SSPBufferSize(8) {
- initializeStackProtectorPass(*PassRegistry::getPassRegistry());
- }
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addPreserved<DominatorTree>();
- }
-
- virtual bool runOnFunction(Function &Fn);
- };
-} // end anonymous namespace
+static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
+ cl::init(true), cl::Hidden);
char StackProtector::ID = 0;
-INITIALIZE_PASS(StackProtector, "stack-protector",
- "Insert stack protectors", false, false)
+INITIALIZE_PASS(StackProtector, "stack-protector", "Insert stack protectors",
+ false, true)
FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
return new StackProtector(TM);
}
+StackProtector::SSPLayoutKind
+StackProtector::getSSPLayout(const AllocaInst *AI) const {
+ return AI ? Layout.lookup(AI) : SSPLK_None;
+}
+
bool StackProtector::runOnFunction(Function &Fn) {
F = &Fn;
M = F->getParent();
DT = getAnalysisIfAvailable<DominatorTree>();
TLI = TM->getTargetLowering();
- if (!RequiresStackProtector()) return false;
+ if (!RequiresStackProtector())
+ return false;
- Attribute Attr =
- Fn.getAttributes().getAttribute(AttributeSet::FunctionIndex,
- "stack-protector-buffer-size");
+ Attribute Attr = Fn.getAttributes().getAttribute(
+ AttributeSet::FunctionIndex, "stack-protector-buffer-size");
if (Attr.isStringAttribute())
- SSPBufferSize = atoi(Attr.getValueAsString().data());
+ Attr.getValueAsString().getAsInteger(10, SSPBufferSize);
++NumFunProtected;
return InsertStackProtectors();
}
-/// ContainsProtectableArray - Check whether the type either is an array or
-/// contains a char array of sufficient size so that we need stack protectors
-/// for it.
-bool StackProtector::ContainsProtectableArray(Type *Ty, bool Strong,
+/// \param [out] IsLarge is set to true if a protectable array is found and
+/// it is "large" ( >= ssp-buffer-size). In the case of a structure with
+/// multiple arrays, this gets set if any of them is large.
+bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
+ bool Strong,
bool InStruct) const {
- if (!Ty) return false;
+ if (!Ty)
+ return false;
if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
- // In strong mode any array, regardless of type and size, triggers a
- // protector
- if (Strong)
- return true;
if (!AT->getElementType()->isIntegerTy(8)) {
// If we're on a non-Darwin platform or we're inside of a structure, don't
// add stack protectors unless the array is a character array.
- if (InStruct || !Trip.isOSDarwin())
- return false;
+ // However, in strong mode any array, regardless of type and size,
+ // triggers a protector.
+ if (!Strong && (InStruct || !Trip.isOSDarwin()))
+ return false;
}
// If an array has more than SSPBufferSize bytes of allocated space, then we
// emit stack protectors.
- if (SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT))
+ if (SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT)) {
+ IsLarge = true;
+ return true;
+ }
+
+ if (Strong)
+ // Require a protector for all arrays in strong mode
return true;
}
const StructType *ST = dyn_cast<StructType>(Ty);
- if (!ST) return false;
+ if (!ST)
+ return false;
+ bool NeedsProtector = false;
for (StructType::element_iterator I = ST->element_begin(),
- E = ST->element_end(); I != E; ++I)
- if (ContainsProtectableArray(*I, Strong, true))
- return true;
+ E = ST->element_end();
+ I != E; ++I)
+ if (ContainsProtectableArray(*I, IsLarge, Strong, true)) {
+ // If the element is a protectable array and is large (>= SSPBufferSize)
+ // then we are done. If the protectable array is not large, then
+ // keep looking in case a subsequent element is a large array.
+ if (IsLarge)
+ return true;
+ NeedsProtector = true;
+ }
- return false;
+ return NeedsProtector;
}
bool StackProtector::HasAddressTaken(const Instruction *AI) {
for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
- UI != UE; ++UI) {
+ UI != UE; ++UI) {
const User *U = *UI;
if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
if (AI == SI->getValueOperand())
@@ -217,11 +175,13 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
/// address taken.
bool StackProtector::RequiresStackProtector() {
bool Strong = false;
+ bool NeedsProtector = false;
if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::StackProtectReq))
- return true;
- else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::StackProtectStrong))
+ Attribute::StackProtectReq)) {
+ NeedsProtector = true;
+ Strong = true; // Use the same heuristic as strong to determine SSPLayout
+ } else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackProtectStrong))
Strong = true;
else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::StackProtect))
@@ -230,39 +190,116 @@ bool StackProtector::RequiresStackProtector() {
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
BasicBlock *BB = I;
- for (BasicBlock::iterator
- II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+ ++II) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
if (AI->isArrayAllocation()) {
// SSP-Strong: Enable protectors for any call to alloca, regardless
// of size.
if (Strong)
return true;
-
+
if (const ConstantInt *CI =
- dyn_cast<ConstantInt>(AI->getArraySize())) {
- if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize)
+ dyn_cast<ConstantInt>(AI->getArraySize())) {
+ if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
// A call to alloca with size >= SSPBufferSize requires
// stack protectors.
- return true;
+ Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ NeedsProtector = true;
+ } else if (Strong) {
+ // Require protectors for all alloca calls in strong mode.
+ Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+ NeedsProtector = true;
+ }
} else {
// A call to alloca with a variable size requires protectors.
- return true;
+ Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ NeedsProtector = true;
}
+ continue;
}
- if (ContainsProtectableArray(AI->getAllocatedType(), Strong))
- return true;
+ bool IsLarge = false;
+ if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
+ Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
+ : SSPLK_SmallArray));
+ NeedsProtector = true;
+ continue;
+ }
if (Strong && HasAddressTaken(AI)) {
- ++NumAddrTaken;
- return true;
+ ++NumAddrTaken;
+ Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+ NeedsProtector = true;
}
}
}
}
- return false;
+ return NeedsProtector;
+}
+
+static bool InstructionWillNotHaveChain(const Instruction *I) {
+ return !I->mayHaveSideEffects() && !I->mayReadFromMemory() &&
+ isSafeToSpeculativelyExecute(I);
+}
+
+/// Identify if RI has a previous instruction in the "Tail Position" and return
+/// it. Otherwise return 0.
+///
+/// This is based on the code in llvm::isInTailCallPosition. The difference
+/// is that it inverts the first part of llvm::isInTailCallPosition since
+/// isInTailCallPosition is checking if a call is in a tail call position, and
+/// we are searching for an unknown tail call that might be in the tail call
+/// position. Once we find the call though, the code uses the same refactored
+/// code, returnTypeIsEligibleForTailCall.
+static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI,
+ const TargetLoweringBase *TLI) {
+  // Establish a reasonable upper bound on the maximum number of instructions
+  // we will look through to find a tail call.
+ unsigned SearchCounter = 0;
+ const unsigned MaxSearch = 4;
+ bool NoInterposingChain = true;
+
+ for (BasicBlock::reverse_iterator I = llvm::next(BB->rbegin()),
+ E = BB->rend();
+ I != E && SearchCounter < MaxSearch; ++I) {
+ Instruction *Inst = &*I;
+
+ // Skip over debug intrinsics and do not allow them to affect our MaxSearch
+ // counter.
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+    // If we find a call and the following conditions are satisfied, then we
+ // have found a tail call that satisfies at least the target independent
+ // requirements of a tail call:
+ //
+ // 1. The call site has the tail marker.
+ //
+    // 2. The call site either will not cause the creation of a chain or, if a
+    // chain is necessary, there are no instructions between the call site and
+    // the return which would create an interposing chain.
+ //
+ // 3. The return type of the function does not impede tail call
+ // optimization.
+ if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+ if (CI->isTailCall() &&
+ (InstructionWillNotHaveChain(CI) || NoInterposingChain) &&
+ returnTypeIsEligibleForTailCall(BB->getParent(), CI, RI, *TLI))
+ return CI;
+ }
+
+    // If we did not find a call, see if we have an instruction that may create
+ // an interposing chain.
+ NoInterposingChain =
+ NoInterposingChain && InstructionWillNotHaveChain(Inst);
+
+    // Increment the search counter.
+ SearchCounter++;
+ }
+
+ return 0;
}
/// Insert code into the entry block that stores the __stack_chk_guard
@@ -273,36 +310,36 @@ bool StackProtector::RequiresStackProtector() {
/// StackGuard = load __stack_chk_guard
/// call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
///
-static void CreatePrologue(Function *F, Module *M, ReturnInst *RI,
+/// Returns true if the platform/triple supports the stackprotectorcreate pseudo
+/// node.
+static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
const TargetLoweringBase *TLI, const Triple &Trip,
AllocaInst *&AI, Value *&StackGuardVar) {
+ bool SupportsSelectionDAGSP = false;
PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
unsigned AddressSpace, Offset;
if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
Constant *OffsetVal =
- ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
-
- StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
- PointerType::get(PtrTy,
- AddressSpace));
+ ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
+
+ StackGuardVar = ConstantExpr::getIntToPtr(
+ OffsetVal, PointerType::get(PtrTy, AddressSpace));
} else if (Trip.getOS() == llvm::Triple::OpenBSD) {
StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy);
cast<GlobalValue>(StackGuardVar)
- ->setVisibility(GlobalValue::HiddenVisibility);
+ ->setVisibility(GlobalValue::HiddenVisibility);
} else {
+ SupportsSelectionDAGSP = true;
StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
}
-
- BasicBlock &Entry = F->getEntryBlock();
- Instruction *InsPt = &Entry.front();
-
- AI = new AllocaInst(PtrTy, "StackGuardSlot", InsPt);
- LoadInst *LI = new LoadInst(StackGuardVar, "StackGuard", false, InsPt);
-
- Value *Args[] = { LI, AI };
- CallInst::
- Create(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
- Args, "", InsPt);
+
+ IRBuilder<> B(&F->getEntryBlock().front());
+ AI = B.CreateAlloca(PtrTy, 0, "StackGuardSlot");
+ LoadInst *LI = B.CreateLoad(StackGuardVar, "StackGuard");
+ B.CreateCall2(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), LI,
+ AI);
+
+ return SupportsSelectionDAGSP;
}
/// InsertStackProtectors - Insert code into the prologue and epilogue of the
@@ -312,72 +349,102 @@ static void CreatePrologue(Function *F, Module *M, ReturnInst *RI,
/// - The epilogue checks the value stored in the prologue against the original
/// value. It calls __stack_chk_fail if they differ.
bool StackProtector::InsertStackProtectors() {
- BasicBlock *FailBB = 0; // The basic block to jump to if check fails.
- BasicBlock *FailBBDom = 0; // FailBB's dominator.
- AllocaInst *AI = 0; // Place on stack that stores the stack guard.
- Value *StackGuardVar = 0; // The stack guard variable.
+ bool HasPrologue = false;
+ bool SupportsSelectionDAGSP =
+ EnableSelectionDAGSP && !TM->Options.EnableFastISel;
+ AllocaInst *AI = 0; // Place on stack that stores the stack guard.
+ Value *StackGuardVar = 0; // The stack guard variable.
- for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
+ for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
BasicBlock *BB = I++;
ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
- if (!RI) continue;
+ if (!RI)
+ continue;
- if (!FailBB) {
- CreatePrologue(F, M, RI, TLI, Trip, AI, StackGuardVar);
- // Create the basic block to jump to when the guard check fails.
- FailBB = CreateFailBB();
+ if (!HasPrologue) {
+ HasPrologue = true;
+ SupportsSelectionDAGSP &=
+ CreatePrologue(F, M, RI, TLI, Trip, AI, StackGuardVar);
}
- // For each block with a return instruction, convert this:
- //
- // return:
- // ...
- // ret ...
- //
- // into this:
- //
- // return:
- // ...
- // %1 = load __stack_chk_guard
- // %2 = load StackGuardSlot
- // %3 = cmp i1 %1, %2
- // br i1 %3, label %SP_return, label %CallStackCheckFailBlk
- //
- // SP_return:
- // ret ...
- //
- // CallStackCheckFailBlk:
- // call void @__stack_chk_fail()
- // unreachable
-
- // Split the basic block before the return instruction.
- BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+ if (SupportsSelectionDAGSP) {
+ // Since we have a potential tail call, insert the special stack check
+ // intrinsic.
+ Instruction *InsertionPt = 0;
+ if (CallInst *CI = FindPotentialTailCall(BB, RI, TLI)) {
+ InsertionPt = CI;
+ } else {
+ InsertionPt = RI;
+ // At this point we know that BB has a return statement so it *DOES*
+ // have a terminator.
+ assert(InsertionPt != 0 && "BB must have a terminator instruction at "
+ "this point.");
+ }
- if (DT && DT->isReachableFromEntry(BB)) {
- DT->addNewBlock(NewBB, BB);
- FailBBDom = FailBBDom ? DT->findNearestCommonDominator(FailBBDom, BB) :BB;
- }
+ Function *Intrinsic =
+ Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck);
+ CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt);
+
+ } else {
+ // If we do not support SelectionDAG based tail calls, generate IR level
+ // tail calls.
+ //
+ // For each block with a return instruction, convert this:
+ //
+ // return:
+ // ...
+ // ret ...
+ //
+ // into this:
+ //
+ // return:
+ // ...
+ // %1 = load __stack_chk_guard
+ // %2 = load StackGuardSlot
+ // %3 = cmp i1 %1, %2
+ // br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+ //
+ // SP_return:
+ // ret ...
+ //
+ // CallStackCheckFailBlk:
+ // call void @__stack_chk_fail()
+ // unreachable
+
+      // Create the FailBB. We duplicate the BB every time since the MI tail
+      // merge pass will merge all of the various BBs into one, including the
+      // fail BB generated by the stack protector pseudo instruction.
+ BasicBlock *FailBB = CreateFailBB();
+
+ // Split the basic block before the return instruction.
+ BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+
+ // Update the dominator tree if we need to.
+ if (DT && DT->isReachableFromEntry(BB)) {
+ DT->addNewBlock(NewBB, BB);
+ DT->addNewBlock(FailBB, BB);
+ }
- // Remove default branch instruction to the new BB.
- BB->getTerminator()->eraseFromParent();
+ // Remove default branch instruction to the new BB.
+ BB->getTerminator()->eraseFromParent();
- // Move the newly created basic block to the point right after the old basic
- // block so that it's in the "fall through" position.
- NewBB->moveAfter(BB);
+ // Move the newly created basic block to the point right after the old
+ // basic block so that it's in the "fall through" position.
+ NewBB->moveAfter(BB);
- // Generate the stack protector instructions in the old basic block.
- LoadInst *LI1 = new LoadInst(StackGuardVar, "", false, BB);
- LoadInst *LI2 = new LoadInst(AI, "", true, BB);
- ICmpInst *Cmp = new ICmpInst(*BB, CmpInst::ICMP_EQ, LI1, LI2, "");
- BranchInst::Create(NewBB, FailBB, Cmp, BB);
+ // Generate the stack protector instructions in the old basic block.
+ IRBuilder<> B(BB);
+ LoadInst *LI1 = B.CreateLoad(StackGuardVar);
+ LoadInst *LI2 = B.CreateLoad(AI);
+ Value *Cmp = B.CreateICmpEQ(LI1, LI2);
+ B.CreateCondBr(Cmp, NewBB, FailBB);
+ }
}
// Return if we didn't modify any basic blocks. I.e., there are no return
// statements in the function.
- if (!FailBB) return false;
-
- if (DT && FailBBDom)
- DT->addNewBlock(FailBB, FailBBDom);
+ if (!HasPrologue)
+ return false;
return true;
}
@@ -387,29 +454,18 @@ bool StackProtector::InsertStackProtectors() {
BasicBlock *StackProtector::CreateFailBB() {
LLVMContext &Context = F->getContext();
BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
+ IRBuilder<> B(FailBB);
if (Trip.getOS() == llvm::Triple::OpenBSD) {
Constant *StackChkFail = M->getOrInsertFunction(
"__stack_smash_handler", Type::getVoidTy(Context),
Type::getInt8PtrTy(Context), NULL);
- Constant *NameStr = ConstantDataArray::getString(Context, F->getName());
- Constant *FuncName =
- new GlobalVariable(*M, NameStr->getType(), true,
- GlobalVariable::PrivateLinkage, NameStr, "SSH");
-
- SmallVector<Constant *, 2> IdxList;
- IdxList.push_back(ConstantInt::get(Type::getInt8Ty(Context), 0));
- IdxList.push_back(ConstantInt::get(Type::getInt8Ty(Context), 0));
-
- SmallVector<Value *, 1> Args;
- Args.push_back(ConstantExpr::getGetElementPtr(FuncName, IdxList));
-
- CallInst::Create(StackChkFail, Args, "", FailBB);
+ B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
} else {
Constant *StackChkFail = M->getOrInsertFunction(
"__stack_chk_fail", Type::getVoidTy(Context), NULL);
- CallInst::Create(StackChkFail, "", FailBB);
+ B.CreateCall(StackChkFail);
}
- new UnreachableInst(Context, FailBB);
+ B.CreateUnreachable();
return FailBB;
}
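
The IR-level fallback path above reduces, per returning block, to four IRBuilder calls once the block's old terminator has been erased. A condensed, self-contained sketch against the 3.4-era IRBuilder API follows; the helper and its parameter names are illustrative, not part of the pass.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

// Append the guard comparison to BB (whose terminator has already been
// removed, as in InsertStackProtectors above) and branch to the split-off
// return block or the failure block.
static void emitGuardCheck(llvm::BasicBlock *BB, llvm::BasicBlock *RetBB,
                           llvm::BasicBlock *FailBB, llvm::Value *StackGuardVar,
                           llvm::AllocaInst *StackGuardSlot) {
  llvm::IRBuilder<> Builder(BB);
  llvm::Value *Guard = Builder.CreateLoad(StackGuardVar);  // current __stack_chk_guard
  llvm::Value *Saved = Builder.CreateLoad(StackGuardSlot); // copy stored by the prologue
  llvm::Value *Match = Builder.CreateICmpEQ(Guard, Saved); // equal means the guard is intact
  Builder.CreateCondBr(Match, RetBB, FailBB);              // continue to ret or call the handler
}
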
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
deleted file mode 100644
index b337c53..0000000
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ /dev/null
@@ -1,825 +0,0 @@
-//===- StrongPHIElimination.cpp - Eliminate PHI nodes by inserting copies -===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass eliminates PHI instructions by aggressively coalescing the copies
-// that would be inserted by a naive algorithm and only inserting the copies
-// that are necessary. The coalescing technique initially assumes that all
-// registers appearing in a PHI instruction do not interfere. It then eliminates
-// proven interferences, using dominators to only perform a linear number of
-// interference tests instead of the quadratic number of interference tests
-// that this would naively require. This is a technique derived from:
-//
-// Budimlic, et al. Fast copy coalescing and live-range identification.
-// In Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language
-// Design and Implementation (Berlin, Germany, June 17 - 19, 2002).
-// PLDI '02. ACM, New York, NY, 25-32.
-//
-// The original implementation constructs a data structure they call a dominance
-// forest for this purpose. The dominance forest was shown to be unnecessary,
-// as it is possible to emulate the creation and traversal of a dominance forest
-// by directly using the dominator tree, rather than actually constructing the
-// dominance forest. This technique is explained in:
-//
-// Boissinot, et al. Revisiting Out-of-SSA Translation for Correctness, Code
-// Quality and Efficiency,
-// In Proceedings of the 7th annual IEEE/ACM International Symposium on Code
-// Generation and Optimization (Seattle, Washington, March 22 - 25, 2009).
-// CGO '09. IEEE, Washington, DC, 114-125.
-//
-// Careful implementation allows for all of the dominator forest interference
-// checks to be performed at once in a single depth-first traversal of the
-// dominator tree, which is what is implemented here.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "strongphielim"
-#include "llvm/CodeGen/Passes.h"
-#include "PHIEliminationUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
-using namespace llvm;
-
-namespace {
- class StrongPHIElimination : public MachineFunctionPass {
- public:
- static char ID; // Pass identification, replacement for typeid
- StrongPHIElimination() : MachineFunctionPass(ID) {
- initializeStrongPHIEliminationPass(*PassRegistry::getPassRegistry());
- }
-
- virtual void getAnalysisUsage(AnalysisUsage&) const;
- bool runOnMachineFunction(MachineFunction&);
-
- private:
- /// This struct represents a single node in the union-find data structure
- /// representing the variable congruence classes. There is one difference
- /// from a normal union-find data structure. We steal two bits from the parent
- /// pointer . One of these bits is used to represent whether the register
- /// itself has been isolated, and the other is used to represent whether the
- /// PHI with that register as its destination has been isolated.
- ///
- /// Note that this leads to the strange situation where the leader of a
- /// congruence class may no longer logically be a member, due to being
- /// isolated.
- struct Node {
- enum Flags {
- kRegisterIsolatedFlag = 1,
- kPHIIsolatedFlag = 2
- };
- Node(unsigned v) : value(v), rank(0) { parent.setPointer(this); }
-
- Node *getLeader();
-
- PointerIntPair<Node*, 2> parent;
- unsigned value;
- unsigned rank;
- };
-
- /// Add a register in a new congruence class containing only itself.
- void addReg(unsigned);
-
- /// Join the congruence classes of two registers. This function is biased
- /// towards the left argument, i.e. after
- ///
- /// addReg(r2);
- /// unionRegs(r1, r2);
- ///
- /// the leader of the unioned congruence class is the same as the leader of
- /// r1's congruence class prior to the union. This is actually relied upon
- /// in the copy insertion code.
- void unionRegs(unsigned, unsigned);
-
- /// Get the color of a register. The color is 0 if the register has been
- /// isolated.
- unsigned getRegColor(unsigned);
-
- // Isolate a register.
- void isolateReg(unsigned);
-
- /// Get the color of a PHI. The color of a PHI is 0 if the PHI has been
- /// isolated. Otherwise, it is the original color of its destination and
- /// all of its operands (before they were isolated, if they were).
- unsigned getPHIColor(MachineInstr*);
-
- /// Isolate a PHI.
- void isolatePHI(MachineInstr*);
-
- /// Traverses a basic block, splitting any interferences found between
- /// registers in the same congruence class. It takes two DenseMaps as
- /// arguments that it also updates: CurrentDominatingParent, which maps
- /// a color to the register in that congruence class whose definition was
- /// most recently seen, and ImmediateDominatingParent, which maps a register
- /// to the register in the same congruence class that most immediately
- /// dominates it.
- ///
- /// This function assumes that it is being called in a depth-first traversal
- /// of the dominator tree.
- void SplitInterferencesForBasicBlock(
- MachineBasicBlock&,
- DenseMap<unsigned, unsigned> &CurrentDominatingParent,
- DenseMap<unsigned, unsigned> &ImmediateDominatingParent);
-
- // Lowers a PHI instruction, inserting copies of the source and destination
- // registers as necessary.
- void InsertCopiesForPHI(MachineInstr*, MachineBasicBlock*);
-
- // Merges the live interval of Reg into NewReg and renames Reg to NewReg
- // everywhere that Reg appears. Requires Reg and NewReg to have non-
- // overlapping lifetimes.
- void MergeLIsAndRename(unsigned Reg, unsigned NewReg);
-
- MachineRegisterInfo *MRI;
- const TargetInstrInfo *TII;
- MachineDominatorTree *DT;
- LiveIntervals *LI;
-
- BumpPtrAllocator Allocator;
-
- DenseMap<unsigned, Node*> RegNodeMap;
-
- // Maps a basic block to a list of its defs of registers that appear as PHI
- // sources.
- DenseMap<MachineBasicBlock*, std::vector<MachineInstr*> > PHISrcDefs;
-
- // Maps a color to a pair of a MachineInstr* and a virtual register, which
- // is the operand of that PHI corresponding to the current basic block.
- DenseMap<unsigned, std::pair<MachineInstr*, unsigned> > CurrentPHIForColor;
-
- // FIXME: Can these two data structures be combined? Would a std::multimap
- // be any better?
-
- // Stores pairs of predecessor basic blocks and the source registers of
- // inserted copy instructions.
- typedef DenseSet<std::pair<MachineBasicBlock*, unsigned> > SrcCopySet;
- SrcCopySet InsertedSrcCopySet;
-
- // Maps pairs of predecessor basic blocks and colors to their defining copy
- // instructions.
- typedef DenseMap<std::pair<MachineBasicBlock*, unsigned>, MachineInstr*>
- SrcCopyMap;
- SrcCopyMap InsertedSrcCopyMap;
-
- // Maps inserted destination copy registers to their defining copy
- // instructions.
- typedef DenseMap<unsigned, MachineInstr*> DestCopyMap;
- DestCopyMap InsertedDestCopies;
- };
-
- struct MIIndexCompare {
- MIIndexCompare(LiveIntervals *LiveIntervals) : LI(LiveIntervals) { }
-
- bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
- return LI->getInstructionIndex(LHS) < LI->getInstructionIndex(RHS);
- }
-
- LiveIntervals *LI;
- };
-} // namespace
-
-STATISTIC(NumPHIsLowered, "Number of PHIs lowered");
-STATISTIC(NumDestCopiesInserted, "Number of destination copies inserted");
-STATISTIC(NumSrcCopiesInserted, "Number of source copies inserted");
-
-char StrongPHIElimination::ID = 0;
-INITIALIZE_PASS_BEGIN(StrongPHIElimination, "strong-phi-node-elimination",
- "Eliminate PHI nodes for register allocation, intelligently", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(StrongPHIElimination, "strong-phi-node-elimination",
- "Eliminate PHI nodes for register allocation, intelligently", false, false)
-
-char &llvm::StrongPHIEliminationID = StrongPHIElimination::ID;
-
-void StrongPHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<MachineDominatorTree>();
- AU.addRequired<SlotIndexes>();
- AU.addPreserved<SlotIndexes>();
- AU.addRequired<LiveIntervals>();
- AU.addPreserved<LiveIntervals>();
- MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-static MachineOperand *findLastUse(MachineBasicBlock *MBB, unsigned Reg) {
- // FIXME: This only needs to check from the first terminator, as only the
- // first terminator can use a virtual register.
- for (MachineBasicBlock::reverse_iterator RI = MBB->rbegin(); ; ++RI) {
- assert (RI != MBB->rend());
- MachineInstr *MI = &*RI;
-
- for (MachineInstr::mop_iterator OI = MI->operands_begin(),
- OE = MI->operands_end(); OI != OE; ++OI) {
- MachineOperand &MO = *OI;
- if (MO.isReg() && MO.isUse() && MO.getReg() == Reg)
- return &MO;
- }
- }
-}
-
-bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
- MRI = &MF.getRegInfo();
- TII = MF.getTarget().getInstrInfo();
- DT = &getAnalysis<MachineDominatorTree>();
- LI = &getAnalysis<LiveIntervals>();
-
- for (MachineFunction::iterator I = MF.begin(), E = MF.end();
- I != E; ++I) {
- for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
- BBI != BBE && BBI->isPHI(); ++BBI) {
- unsigned DestReg = BBI->getOperand(0).getReg();
- addReg(DestReg);
- PHISrcDefs[I].push_back(BBI);
-
- for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
- MachineOperand &SrcMO = BBI->getOperand(i);
- unsigned SrcReg = SrcMO.getReg();
- addReg(SrcReg);
- unionRegs(DestReg, SrcReg);
-
- MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
- if (DefMI)
- PHISrcDefs[DefMI->getParent()].push_back(DefMI);
- }
- }
- }
-
- // Perform a depth-first traversal of the dominator tree, splitting
- // interferences amongst PHI-congruence classes.
- DenseMap<unsigned, unsigned> CurrentDominatingParent;
- DenseMap<unsigned, unsigned> ImmediateDominatingParent;
- for (df_iterator<MachineDomTreeNode*> DI = df_begin(DT->getRootNode()),
- DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
- SplitInterferencesForBasicBlock(*DI->getBlock(),
- CurrentDominatingParent,
- ImmediateDominatingParent);
- }
-
- // Insert copies for all PHI source and destination registers.
- for (MachineFunction::iterator I = MF.begin(), E = MF.end();
- I != E; ++I) {
- for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
- BBI != BBE && BBI->isPHI(); ++BBI) {
- InsertCopiesForPHI(BBI, I);
- }
- }
-
- // FIXME: Preserve the equivalence classes during copy insertion and use
- // the preversed equivalence classes instead of recomputing them.
- RegNodeMap.clear();
- for (MachineFunction::iterator I = MF.begin(), E = MF.end();
- I != E; ++I) {
- for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
- BBI != BBE && BBI->isPHI(); ++BBI) {
- unsigned DestReg = BBI->getOperand(0).getReg();
- addReg(DestReg);
-
- for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
- unsigned SrcReg = BBI->getOperand(i).getReg();
- addReg(SrcReg);
- unionRegs(DestReg, SrcReg);
- }
- }
- }
-
- DenseMap<unsigned, unsigned> RegRenamingMap;
- bool Changed = false;
- for (MachineFunction::iterator I = MF.begin(), E = MF.end();
- I != E; ++I) {
- MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
- while (BBI != BBE && BBI->isPHI()) {
- MachineInstr *PHI = BBI;
-
- assert(PHI->getNumOperands() > 0);
-
- unsigned SrcReg = PHI->getOperand(1).getReg();
- unsigned SrcColor = getRegColor(SrcReg);
- unsigned NewReg = RegRenamingMap[SrcColor];
- if (!NewReg) {
- NewReg = SrcReg;
- RegRenamingMap[SrcColor] = SrcReg;
- }
- MergeLIsAndRename(SrcReg, NewReg);
-
- unsigned DestReg = PHI->getOperand(0).getReg();
- if (!InsertedDestCopies.count(DestReg))
- MergeLIsAndRename(DestReg, NewReg);
-
- for (unsigned i = 3; i < PHI->getNumOperands(); i += 2) {
- unsigned SrcReg = PHI->getOperand(i).getReg();
- MergeLIsAndRename(SrcReg, NewReg);
- }
-
- ++BBI;
- LI->RemoveMachineInstrFromMaps(PHI);
- PHI->eraseFromParent();
- Changed = true;
- }
- }
-
- // Due to the insertion of copies to split live ranges, the live intervals are
- // guaranteed to not overlap, except in one case: an original PHI source and a
- // PHI destination copy. In this case, they have the same value and thus don't
- // truly intersect, so we merge them into the value live at that point.
- // FIXME: Is there some better way we can handle this?
- for (DestCopyMap::iterator I = InsertedDestCopies.begin(),
- E = InsertedDestCopies.end(); I != E; ++I) {
- unsigned DestReg = I->first;
- unsigned DestColor = getRegColor(DestReg);
- unsigned NewReg = RegRenamingMap[DestColor];
-
- LiveInterval &DestLI = LI->getInterval(DestReg);
- LiveInterval &NewLI = LI->getInterval(NewReg);
-
- assert(DestLI.ranges.size() == 1
- && "PHI destination copy's live interval should be a single live "
- "range from the beginning of the BB to the copy instruction.");
- LiveRange *DestLR = DestLI.begin();
- VNInfo *NewVNI = NewLI.getVNInfoAt(DestLR->start);
- if (!NewVNI) {
- NewVNI = NewLI.createValueCopy(DestLR->valno, LI->getVNInfoAllocator());
- MachineInstr *CopyInstr = I->second;
- CopyInstr->getOperand(1).setIsKill(true);
- }
-
- LiveRange NewLR(DestLR->start, DestLR->end, NewVNI);
- NewLI.addRange(NewLR);
-
- LI->removeInterval(DestReg);
- MRI->replaceRegWith(DestReg, NewReg);
- }
-
- // Adjust the live intervals of all PHI source registers to handle the case
- // where the PHIs in successor blocks were the only later uses of the source
- // register.
- for (SrcCopySet::iterator I = InsertedSrcCopySet.begin(),
- E = InsertedSrcCopySet.end(); I != E; ++I) {
- MachineBasicBlock *MBB = I->first;
- unsigned SrcReg = I->second;
- if (unsigned RenamedRegister = RegRenamingMap[getRegColor(SrcReg)])
- SrcReg = RenamedRegister;
-
- LiveInterval &SrcLI = LI->getInterval(SrcReg);
-
- bool isLiveOut = false;
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI) {
- if (SrcLI.liveAt(LI->getMBBStartIdx(*SI))) {
- isLiveOut = true;
- break;
- }
- }
-
- if (isLiveOut)
- continue;
-
- MachineOperand *LastUse = findLastUse(MBB, SrcReg);
- assert(LastUse);
- SlotIndex LastUseIndex = LI->getInstructionIndex(LastUse->getParent());
- SrcLI.removeRange(LastUseIndex.getRegSlot(), LI->getMBBEndIdx(MBB));
- LastUse->setIsKill(true);
- }
-
- Allocator.Reset();
- RegNodeMap.clear();
- PHISrcDefs.clear();
- InsertedSrcCopySet.clear();
- InsertedSrcCopyMap.clear();
- InsertedDestCopies.clear();
-
- return Changed;
-}
-
-void StrongPHIElimination::addReg(unsigned Reg) {
- Node *&N = RegNodeMap[Reg];
- if (!N)
- N = new (Allocator) Node(Reg);
-}
-
-StrongPHIElimination::Node*
-StrongPHIElimination::Node::getLeader() {
- Node *N = this;
- Node *Parent = parent.getPointer();
- Node *Grandparent = Parent->parent.getPointer();
-
- while (Parent != Grandparent) {
- N->parent.setPointer(Grandparent);
- N = Grandparent;
- Parent = Parent->parent.getPointer();
- Grandparent = Parent->parent.getPointer();
- }
-
- return Parent;
-}
-
-unsigned StrongPHIElimination::getRegColor(unsigned Reg) {
- DenseMap<unsigned, Node*>::iterator RI = RegNodeMap.find(Reg);
- if (RI == RegNodeMap.end())
- return 0;
- Node *Node = RI->second;
- if (Node->parent.getInt() & Node::kRegisterIsolatedFlag)
- return 0;
- return Node->getLeader()->value;
-}
-
-void StrongPHIElimination::unionRegs(unsigned Reg1, unsigned Reg2) {
- Node *Node1 = RegNodeMap[Reg1]->getLeader();
- Node *Node2 = RegNodeMap[Reg2]->getLeader();
-
- if (Node1->rank > Node2->rank) {
- Node2->parent.setPointer(Node1->getLeader());
- } else if (Node1->rank < Node2->rank) {
- Node1->parent.setPointer(Node2->getLeader());
- } else if (Node1 != Node2) {
- Node2->parent.setPointer(Node1->getLeader());
- Node1->rank++;
- }
-}
-
-void StrongPHIElimination::isolateReg(unsigned Reg) {
- Node *Node = RegNodeMap[Reg];
- Node->parent.setInt(Node->parent.getInt() | Node::kRegisterIsolatedFlag);
-}
-
-unsigned StrongPHIElimination::getPHIColor(MachineInstr *PHI) {
- assert(PHI->isPHI());
-
- unsigned DestReg = PHI->getOperand(0).getReg();
- Node *DestNode = RegNodeMap[DestReg];
- if (DestNode->parent.getInt() & Node::kPHIIsolatedFlag)
- return 0;
-
- for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
- unsigned SrcColor = getRegColor(PHI->getOperand(i).getReg());
- if (SrcColor)
- return SrcColor;
- }
- return 0;
-}
-
-void StrongPHIElimination::isolatePHI(MachineInstr *PHI) {
- assert(PHI->isPHI());
- Node *Node = RegNodeMap[PHI->getOperand(0).getReg()];
- Node->parent.setInt(Node->parent.getInt() | Node::kPHIIsolatedFlag);
-}
-
-/// SplitInterferencesForBasicBlock - traverses a basic block, splitting any
-/// interferences found between registers in the same congruence class. It
-/// takes two DenseMaps as arguments that it also updates:
-///
-/// 1) CurrentDominatingParent, which maps a color to the register in that
-/// congruence class whose definition was most recently seen.
-///
-/// 2) ImmediateDominatingParent, which maps a register to the register in the
-/// same congruence class that most immediately dominates it.
-///
-/// This function assumes that it is being called in a depth-first traversal
-/// of the dominator tree.
-///
-/// The algorithm used here is a generalization of the dominance-based SSA test
-/// for two variables. If there are variables a_1, ..., a_n such that
-///
-/// def(a_1) dom ... dom def(a_n),
-///
-/// then we can test for an interference between any two a_i by only using O(n)
-/// interference tests between pairs of variables. If i < j and a_i and a_j
-/// interfere, then a_i is alive at def(a_j), so it is also alive at def(a_i+1).
-/// Thus, in order to test for an interference involving a_i, we need only check
-/// for a potential interference with a_i+1.
-///
-/// This method can be generalized to arbitrary sets of variables by performing
-/// a depth-first traversal of the dominator tree. As we traverse down a branch
-/// of the dominator tree, we keep track of the current dominating variable and
-/// only perform an interference test with that variable. However, when we go to
-/// another branch of the dominator tree, the definition of the current dominating
-/// variable may no longer dominate the current block. In order to correct this,
-/// we need to use a stack of past choices of the current dominating variable
-/// and pop from this stack until we find a variable whose definition actually
-/// dominates the current block.
-///
-/// There will be one push on this stack for each variable that has become the
-/// current dominating variable, so instead of using an explicit stack we can
-/// simply associate the previous choice for a current dominating variable with
-/// the new choice. This works better in our implementation, where we test for
-/// interference in multiple distinct sets at once.
-void
-StrongPHIElimination::SplitInterferencesForBasicBlock(
- MachineBasicBlock &MBB,
- DenseMap<unsigned, unsigned> &CurrentDominatingParent,
- DenseMap<unsigned, unsigned> &ImmediateDominatingParent) {
- // Sort defs by their order in the original basic block, as the code below
- // assumes that it is processing definitions in dominance order.
- std::vector<MachineInstr*> &DefInstrs = PHISrcDefs[&MBB];
- std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LI));
-
- for (std::vector<MachineInstr*>::const_iterator BBI = DefInstrs.begin(),
- BBE = DefInstrs.end(); BBI != BBE; ++BBI) {
- for (MachineInstr::const_mop_iterator I = (*BBI)->operands_begin(),
- E = (*BBI)->operands_end(); I != E; ++I) {
- const MachineOperand &MO = *I;
-
- // FIXME: This would be faster if it were possible to bail out of checking
- // an instruction's operands after the explicit defs, but this is incorrect
- // for variadic instructions, which may appear before register allocation
- // in the future.
- if (!MO.isReg() || !MO.isDef())
- continue;
-
- unsigned DestReg = MO.getReg();
- if (!DestReg || !TargetRegisterInfo::isVirtualRegister(DestReg))
- continue;
-
- // If the virtual register being defined is not used in any PHI or has
- // already been isolated, then there are no more interferences to check.
- unsigned DestColor = getRegColor(DestReg);
- if (!DestColor)
- continue;
-
- // The input to this pass sometimes is not in SSA form in every basic
- // block, as some virtual registers have redefinitions. We could eliminate
- // this by fixing the passes that generate the non-SSA code, or we could
- // handle it here by tracking defining machine instructions rather than
- // virtual registers. For now, we just handle the situation conservatively
- // in a way that will possibly lead to false interferences.
- unsigned &CurrentParent = CurrentDominatingParent[DestColor];
- unsigned NewParent = CurrentParent;
- if (NewParent == DestReg)
- continue;
-
- // Pop registers from the stack represented by ImmediateDominatingParent
- // until we find a parent that dominates the current instruction.
- while (NewParent && (!DT->dominates(MRI->getVRegDef(NewParent), *BBI)
- || !getRegColor(NewParent)))
- NewParent = ImmediateDominatingParent[NewParent];
-
- // If NewParent is nonzero, then its definition dominates the current
- // instruction, so it is only necessary to check for the liveness of
- // NewParent in order to check for an interference.
- if (NewParent
- && LI->getInterval(NewParent).liveAt(LI->getInstructionIndex(*BBI))) {
- // If there is an interference, always isolate the new register. This
- // could be improved by using a heuristic that decides which of the two
- // registers to isolate.
- isolateReg(DestReg);
- CurrentParent = NewParent;
- } else {
- // If there is no interference, update ImmediateDominatingParent and set
- // the CurrentDominatingParent for this color to the current register.
- ImmediateDominatingParent[DestReg] = NewParent;
- CurrentParent = DestReg;
- }
- }
- }
-
- // We now walk the PHIs in successor blocks and check for interferences. This
- // is necessary because the use of a PHI's operands are logically contained in
- // the predecessor block. The def of a PHI's destination register is processed
- // along with the other defs in a basic block.
-
- CurrentPHIForColor.clear();
-
- for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
- SE = MBB.succ_end(); SI != SE; ++SI) {
- for (MachineBasicBlock::iterator BBI = (*SI)->begin(), BBE = (*SI)->end();
- BBI != BBE && BBI->isPHI(); ++BBI) {
- MachineInstr *PHI = BBI;
-
- // If a PHI is already isolated, either by being isolated directly or
- // having all of its operands isolated, ignore it.
- unsigned Color = getPHIColor(PHI);
- if (!Color)
- continue;
-
- // Find the index of the PHI operand that corresponds to this basic block.
- unsigned PredIndex;
- for (PredIndex = 1; PredIndex < PHI->getNumOperands(); PredIndex += 2) {
- if (PHI->getOperand(PredIndex + 1).getMBB() == &MBB)
- break;
- }
- assert(PredIndex < PHI->getNumOperands());
- unsigned PredOperandReg = PHI->getOperand(PredIndex).getReg();
-
- // Pop registers from the stack represented by ImmediateDominatingParent
- // until we find a parent that dominates the current instruction.
- unsigned &CurrentParent = CurrentDominatingParent[Color];
- unsigned NewParent = CurrentParent;
- while (NewParent
- && (!DT->dominates(MRI->getVRegDef(NewParent)->getParent(), &MBB)
- || !getRegColor(NewParent)))
- NewParent = ImmediateDominatingParent[NewParent];
- CurrentParent = NewParent;
-
- // If there is an interference with a register, always isolate the
- // register rather than the PHI. It is also possible to isolate the
- // PHI, but that introduces copies for all of the registers involved
- // in that PHI.
- if (NewParent && LI->isLiveOutOfMBB(LI->getInterval(NewParent), &MBB)
- && NewParent != PredOperandReg)
- isolateReg(NewParent);
-
- std::pair<MachineInstr*, unsigned>
- &CurrentPHI = CurrentPHIForColor[Color];
-
- // If two PHIs have the same operand from every shared predecessor, then
- // they don't actually interfere. Otherwise, isolate the current PHI. This
- // could possibly be improved, e.g. we could isolate the PHI with the
- // fewest operands.
- if (CurrentPHI.first && CurrentPHI.second != PredOperandReg)
- isolatePHI(PHI);
- else
- CurrentPHI = std::make_pair(PHI, PredOperandReg);
- }
- }
-}
-
-void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
- MachineBasicBlock *MBB) {
- assert(PHI->isPHI());
- ++NumPHIsLowered;
- unsigned PHIColor = getPHIColor(PHI);
-
- for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
- MachineOperand &SrcMO = PHI->getOperand(i);
-
- // If a source is defined by an implicit def, there is no need to insert a
- // copy in the predecessor.
- if (SrcMO.isUndef())
- continue;
-
- unsigned SrcReg = SrcMO.getReg();
- assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- "Machine PHI Operands must all be virtual registers!");
-
- MachineBasicBlock *PredBB = PHI->getOperand(i + 1).getMBB();
- unsigned SrcColor = getRegColor(SrcReg);
-
- // If neither the PHI nor the operand were isolated, then we only need to
- // set the phi-kill flag on the VNInfo at this PHI.
- if (PHIColor && SrcColor == PHIColor) {
- LiveInterval &SrcInterval = LI->getInterval(SrcReg);
- SlotIndex PredIndex = LI->getMBBEndIdx(PredBB);
- VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex);
- (void)SrcVNI;
- assert(SrcVNI);
- continue;
- }
-
- unsigned CopyReg = 0;
- if (PHIColor) {
- SrcCopyMap::const_iterator I
- = InsertedSrcCopyMap.find(std::make_pair(PredBB, PHIColor));
- CopyReg
- = I != InsertedSrcCopyMap.end() ? I->second->getOperand(0).getReg() : 0;
- }
-
- if (!CopyReg) {
- const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
- CopyReg = MRI->createVirtualRegister(RC);
-
- MachineBasicBlock::iterator
- CopyInsertPoint = findPHICopyInsertPoint(PredBB, MBB, SrcReg);
- unsigned SrcSubReg = SrcMO.getSubReg();
- MachineInstr *CopyInstr = BuildMI(*PredBB,
- CopyInsertPoint,
- PHI->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- CopyReg).addReg(SrcReg, 0, SrcSubReg);
- LI->InsertMachineInstrInMaps(CopyInstr);
- ++NumSrcCopiesInserted;
-
- // addLiveRangeToEndOfBlock() also adds the phikill flag to the VNInfo for
- // the newly added range.
- LI->addLiveRangeToEndOfBlock(CopyReg, CopyInstr);
- InsertedSrcCopySet.insert(std::make_pair(PredBB, SrcReg));
-
- addReg(CopyReg);
- if (PHIColor) {
- unionRegs(PHIColor, CopyReg);
- assert(getRegColor(CopyReg) != CopyReg);
- } else {
- PHIColor = CopyReg;
- assert(getRegColor(CopyReg) == CopyReg);
- }
-
- // Insert into map if not already there.
- InsertedSrcCopyMap.insert(std::make_pair(std::make_pair(PredBB, PHIColor),
- CopyInstr));
- }
-
- SrcMO.setReg(CopyReg);
-
- // If SrcReg is not live beyond the PHI, trim its interval so that it is no
- // longer live-in to MBB. Note that SrcReg may appear in other PHIs that are
- // processed later, but this is still correct to do at this point because we
- // never rely on LiveIntervals being correct while inserting copies.
- // FIXME: Should this just count uses at PHIs like the normal PHIElimination
- // pass does?
- LiveInterval &SrcLI = LI->getInterval(SrcReg);
- SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
- SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
- SlotIndex NextInstrIndex = PHIIndex.getNextIndex();
- if (SrcLI.liveAt(MBBStartIndex) && SrcLI.expiredAt(NextInstrIndex))
- SrcLI.removeRange(MBBStartIndex, PHIIndex, true);
- }
-
- unsigned DestReg = PHI->getOperand(0).getReg();
- unsigned DestColor = getRegColor(DestReg);
-
- if (PHIColor && DestColor == PHIColor) {
- LiveInterval &DestLI = LI->getInterval(DestReg);
-
- // Set the phi-def flag for the VN at this PHI.
- SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
- VNInfo *DestVNI = DestLI.getVNInfoAt(PHIIndex.getRegSlot());
- assert(DestVNI);
-
- // Prior to PHI elimination, the live ranges of PHIs begin at their defining
- // instruction. After PHI elimination, PHI instructions are replaced by VNs
- // with the phi-def flag set, and the live ranges of these VNs start at the
- // beginning of the basic block.
- SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
- DestVNI->def = MBBStartIndex;
- DestLI.addRange(LiveRange(MBBStartIndex,
- PHIIndex.getRegSlot(),
- DestVNI));
- return;
- }
-
- const TargetRegisterClass *RC = MRI->getRegClass(DestReg);
- unsigned CopyReg = MRI->createVirtualRegister(RC);
-
- MachineInstr *CopyInstr = BuildMI(*MBB,
- MBB->SkipPHIsAndLabels(MBB->begin()),
- PHI->getDebugLoc(),
- TII->get(TargetOpcode::COPY),
- DestReg).addReg(CopyReg);
- LI->InsertMachineInstrInMaps(CopyInstr);
- PHI->getOperand(0).setReg(CopyReg);
- ++NumDestCopiesInserted;
-
- // Add the region from the beginning of MBB to the copy instruction to
- // CopyReg's live interval, and give the VNInfo the phidef flag.
- LiveInterval &CopyLI = LI->getOrCreateInterval(CopyReg);
- SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
- SlotIndex DestCopyIndex = LI->getInstructionIndex(CopyInstr);
- VNInfo *CopyVNI = CopyLI.getNextValue(MBBStartIndex,
- LI->getVNInfoAllocator());
- CopyLI.addRange(LiveRange(MBBStartIndex,
- DestCopyIndex.getRegSlot(),
- CopyVNI));
-
- // Adjust DestReg's live interval to adjust for its new definition at
- // CopyInstr.
- LiveInterval &DestLI = LI->getOrCreateInterval(DestReg);
- SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
- DestLI.removeRange(PHIIndex.getRegSlot(), DestCopyIndex.getRegSlot());
-
- VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
- assert(DestVNI);
- DestVNI->def = DestCopyIndex.getRegSlot();
-
- InsertedDestCopies[CopyReg] = CopyInstr;
-}
-
-void StrongPHIElimination::MergeLIsAndRename(unsigned Reg, unsigned NewReg) {
- if (Reg == NewReg)
- return;
-
- LiveInterval &OldLI = LI->getInterval(Reg);
- LiveInterval &NewLI = LI->getInterval(NewReg);
-
- // Merge the live ranges of the two registers.
- DenseMap<VNInfo*, VNInfo*> VNMap;
- for (LiveInterval::iterator LRI = OldLI.begin(), LRE = OldLI.end();
- LRI != LRE; ++LRI) {
- LiveRange OldLR = *LRI;
- VNInfo *OldVN = OldLR.valno;
-
- VNInfo *&NewVN = VNMap[OldVN];
- if (!NewVN) {
- NewVN = NewLI.createValueCopy(OldVN, LI->getVNInfoAllocator());
- VNMap[OldVN] = NewVN;
- }
-
- LiveRange LR(OldLR.start, OldLR.end, NewVN);
- NewLI.addRange(LR);
- }
-
- // Remove the LiveInterval for the register being renamed and replace all
- // of its defs and uses with the new register.
- LI->removeInterval(Reg);
- MRI->replaceRegWith(Reg, NewReg);
-}
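
The pass removed above leaned on a small union-find over virtual registers (path compression in Node::getLeader, union by rank in unionRegs) to build PHI congruence classes. A simplified, standalone sketch of that structure is shown here; the isolation flag bits kept in the PointerIntPair are deliberately omitted, and the names are illustrative.

// Minimal union-find in the spirit of the removed Node machinery.
struct UFNode {
  UFNode *Parent;
  unsigned Rank;
  UFNode() : Parent(this), Rank(0) {}
};

static UFNode *findLeader(UFNode *N) {
  while (N->Parent != N) {
    N->Parent = N->Parent->Parent; // path halving keeps chains short
    N = N->Parent;
  }
  return N;
}

static void unionNodes(UFNode *A, UFNode *B) {
  A = findLeader(A);
  B = findLeader(B);
  if (A == B)
    return;
  if (A->Rank >= B->Rank) {
    // Ties keep A as the leader, matching the left bias the copy-insertion
    // code in the removed pass relied on.
    B->Parent = A;
    if (A->Rank == B->Rank)
      ++A->Rank;
  } else {
    A->Parent = B;
  }
}
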
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 8a1d567..ff0181e 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -638,8 +638,6 @@ bothUsedInPHI(const MachineBasicBlock &A,
bool
TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
- SmallPtrSet<MachineBasicBlock*, 8> Succs(BB.succ_begin(), BB.succ_end());
-
for (MachineBasicBlock::pred_iterator PI = BB.pred_begin(),
PE = BB.pred_end(); PI != PE; ++PI) {
MachineBasicBlock *PredBB = *PI;
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index bb8bd42..bf4fd65 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
@@ -276,6 +277,36 @@ bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
return false;
}
+bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
+ unsigned SubIdx, unsigned &Size,
+ unsigned &Offset,
+ const TargetMachine *TM) const {
+ if (!SubIdx) {
+ Size = RC->getSize();
+ Offset = 0;
+ return true;
+ }
+ unsigned BitSize = TM->getRegisterInfo()->getSubRegIdxSize(SubIdx);
+ // Convert bit size to byte size to be consistent with
+ // MCRegisterClass::getSize().
+ if (BitSize % 8)
+ return false;
+
+ int BitOffset = TM->getRegisterInfo()->getSubRegIdxOffset(SubIdx);
+ if (BitOffset < 0 || BitOffset % 8)
+ return false;
+
+ Size = BitSize /= 8;
+ Offset = (unsigned)BitOffset / 8;
+
+ assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
+
+ if (!TM->getDataLayout()->isLittleEndian()) {
+ Offset = RC->getSize() - (Offset + Size);
+ }
+ return true;
+}
+
void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg,
@@ -364,6 +395,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
// Ask the target to do the actual folding.
if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) {
+ NewMI->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
// Add a memory operand, foldMemoryOperandImpl doesn't do that.
assert((!(Flags & MachineMemOperand::MOStore) ||
NewMI->mayStore()) &&
@@ -424,9 +456,19 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
NewMI = MBB.insert(MI, NewMI);
// Copy the memoperands from the load to the folded instruction.
- NewMI->setMemRefs(LoadMI->memoperands_begin(),
- LoadMI->memoperands_end());
-
+ if (MI->memoperands_empty()) {
+ NewMI->setMemRefs(LoadMI->memoperands_begin(),
+ LoadMI->memoperands_end());
+ }
+ else {
+ // Handle the rare case of folding multiple loads.
+ NewMI->setMemRefs(MI->memoperands_begin(),
+ MI->memoperands_end());
+ for (MachineInstr::mmo_iterator I = LoadMI->memoperands_begin(),
+ E = LoadMI->memoperands_end(); I != E; ++I) {
+ NewMI->addMemOperand(MF, *I);
+ }
+ }
return NewMI;
}
@@ -630,6 +672,10 @@ unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
return 1;
}
+unsigned TargetInstrInfo::getPredicationCost(const MachineInstr *) const {
+ return 0;
+}
+
unsigned TargetInstrInfo::
getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
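
The new getStackSlotRange hunk above boils down to three steps: reject sub-byte subregister slices, convert bit geometry to bytes, and mirror the offset within the spill slot on big-endian targets. A standalone sketch of just that arithmetic, with plain integers standing in for the TargetRegisterInfo and DataLayout queries (the helper name is illustrative):

// Compute the byte range a subregister occupies within a spilled register
// class of RegClassBytes bytes. Returns false when the slice is not
// byte-aligned or escapes the slot.
static bool subRegByteRange(unsigned RegClassBytes, unsigned SubRegBits,
                            int SubRegBitOffset, bool LittleEndian,
                            unsigned &Size, unsigned &Offset) {
  if (SubRegBits % 8 != 0 || SubRegBitOffset < 0 || SubRegBitOffset % 8 != 0)
    return false;                                 // not a whole-byte slice
  Size = SubRegBits / 8;
  Offset = static_cast<unsigned>(SubRegBitOffset) / 8;
  if (Offset + Size > RegClassBytes)
    return false;                                 // slice escapes the slot
  if (!LittleEndian)
    Offset = RegClassBytes - (Offset + Size);     // count from the high end
  return true;
}

For example, a 32-bit subregister at bit offset 32 of an 8-byte register class maps to bytes [4, 8) on a little-endian target and [0, 4) on a big-endian one.
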
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 8d8f81b..30305af 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -191,6 +191,11 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
Names[RTLIB::NEARBYINT_F128] = "nearbyintl";
Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+ Names[RTLIB::ROUND_F32] = "roundf";
+ Names[RTLIB::ROUND_F64] = "round";
+ Names[RTLIB::ROUND_F80] = "roundl";
+ Names[RTLIB::ROUND_F128] = "roundl";
+ Names[RTLIB::ROUND_PPCF128] = "roundl";
Names[RTLIB::FLOOR_F32] = "floorf";
Names[RTLIB::FLOOR_F64] = "floor";
Names[RTLIB::FLOOR_F80] = "floorl";
@@ -313,34 +318,62 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = "__sync_val_compare_and_swap_16";
Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = "__sync_lock_test_and_set_16";
Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_16] = "__sync_fetch_and_add_16";
Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_16] = "__sync_fetch_and_sub_16";
Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+ Names[RTLIB::SYNC_FETCH_AND_AND_16] = "__sync_fetch_and_and_16";
Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+ Names[RTLIB::SYNC_FETCH_AND_OR_16] = "__sync_fetch_and_or_16";
Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_16] = "__sync_fetch_and_xor_16";
Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_16] = "__sync_fetch_and_nand_16";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_1] = "__sync_fetch_and_max_1";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_2] = "__sync_fetch_and_max_2";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_4] = "__sync_fetch_and_max_4";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_8] = "__sync_fetch_and_max_8";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_16] = "__sync_fetch_and_max_16";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_1] = "__sync_fetch_and_umax_1";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_2] = "__sync_fetch_and_umax_2";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_4] = "__sync_fetch_and_umax_4";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_8] = "__sync_fetch_and_umax_8";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_16] = "__sync_fetch_and_umax_16";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_1] = "__sync_fetch_and_min_1";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_2] = "__sync_fetch_and_min_2";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_4] = "__sync_fetch_and_min_4";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_8] = "__sync_fetch_and_min_8";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_16] = "__sync_fetch_and_min_16";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_1] = "__sync_fetch_and_umin_1";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_2] = "__sync_fetch_and_umin_2";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16";
if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) {
Names[RTLIB::SINCOS_F32] = "sincosf";
@@ -356,6 +389,13 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
Names[RTLIB::SINCOS_F128] = 0;
Names[RTLIB::SINCOS_PPCF128] = 0;
}
+
+ if (Triple(TM.getTargetTriple()).getOS() != Triple::OpenBSD) {
+ Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
+ } else {
+ // These are generally not available.
+ Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = 0;
+ }
}
/// InitLibcallCallingConvs - Set default libcall CallingConvs.
@@ -624,7 +664,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
// Perform these initializations only once.
IsLittleEndian = TD->isLittleEndian();
- PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0));
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
= MaxStoresPerMemmoveOptSize = 4;
@@ -682,6 +721,14 @@ void TargetLoweringBase::initActions() {
// These operations default to expand.
setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+
+ // These library functions default to expand.
+ setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand);
+
+ // These operations default to expand for vector types.
+ if (VT >= MVT::FIRST_VECTOR_VALUETYPE &&
+ VT <= MVT::LAST_VECTOR_VALUETYPE)
+ setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
@@ -747,6 +794,19 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
}
+MVT TargetLoweringBase::getPointerTy(uint32_t AS) const {
+ return MVT::getIntegerVT(getPointerSizeInBits(AS));
+}
+
+unsigned TargetLoweringBase::getPointerSizeInBits(uint32_t AS) const {
+ return TD->getPointerSizeInBits(AS);
+}
+
+unsigned TargetLoweringBase::getPointerTypeSizeInBits(Type *Ty) const {
+ assert(Ty->isPointerTy());
+ return getPointerSizeInBits(Ty->getPointerAddressSpace());
+}
+
MVT TargetLoweringBase::getScalarShiftAmountTy(EVT LHSTy) const {
return MVT::getIntegerVT(8*TD->getPointerSize(0));
}
@@ -1162,7 +1222,7 @@ void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr,
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i)
- Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0));
+ Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isFixed=*/true, 0, 0));
}
}
@@ -1228,6 +1288,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
case PtrToInt: return ISD::BITCAST;
case IntToPtr: return ISD::BITCAST;
case BitCast: return ISD::BITCAST;
+ case AddrSpaceCast: return ISD::ADDRSPACECAST;
case ICmp: return ISD::SETCC;
case FCmp: return ISD::SETCC;
case PHI: return 0;
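
For orientation, a sketch of what the new RTLIB entries above correspond to at the source level (illustrative only, not something the patch itself adds): when a target cannot lower an atomic read-modify-write inline, the legalizer emits a call to one of these named runtime helpers, the same helpers the GCC-style __sync_* builtins resolve to.

    #include <cstdio>

    int main() {
      long counter = 0;
      // Atomically add 5 and return the previous value. On a target that cannot
      // expand this inline, codegen falls back to a libcall with one of the names
      // registered above, e.g. __sync_fetch_and_add_8 for an 8-byte operand.
      long old = __sync_fetch_and_add(&counter, 5);
      std::printf("old=%ld new=%ld\n", old, counter);
      return 0;
    }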
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 07cf871..59d7b57 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -52,10 +52,10 @@ TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV,
default:
report_fatal_error("We do not support this DWARF encoding yet!");
case dwarf::DW_EH_PE_absptr:
- return Mang->getSymbol(GV);
+ return getSymbol(*Mang, GV);
case dwarf::DW_EH_PE_pcrel: {
return getContext().GetOrCreateSymbol(StringRef("DW.ref.") +
- Mang->getSymbol(GV)->getName());
+ getSymbol(*Mang, GV)->getName());
}
}
}
@@ -104,7 +104,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
if (StubSym.getPointer() == 0) {
- MCSymbol *Sym = Mang->getSymbol(GV);
+ MCSymbol *Sym = getSymbol(*Mang, GV);
StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
}
@@ -252,7 +252,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
Prefix = getSectionPrefixForGlobal(Kind);
SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
- MCSymbol *Sym = Mang->getSymbol(GV);
+ MCSymbol *Sym = getSymbol(*Mang, GV);
Name.append(Sym->getName().begin(), Sym->getName().end());
StringRef Group = "";
unsigned Flags = getELFSectionFlags(Kind);
@@ -523,6 +523,11 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
const MCSection *TargetLoweringObjectFileMachO::
SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
Mangler *Mang, const TargetMachine &TM) const {
+
+ // Handle thread local data.
+ if (Kind.isThreadBSS()) return TLSBSSSection;
+ if (Kind.isThreadData()) return TLSDataSection;
+
if (Kind.isText())
return GV->isWeakForLinker() ? TextCoalSection : TextSection;
@@ -575,10 +580,6 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
if (Kind.isBSSLocal())
return DataBSSSection;
- // Handle thread local data.
- if (Kind.isThreadBSS()) return TLSBSSSection;
- if (Kind.isThreadData()) return TLSDataSection;
-
// Otherwise, just drop the variable in the normal data section.
return DataSection;
}
@@ -613,7 +614,7 @@ shouldEmitUsedDirectiveFor(const GlobalValue *GV, Mangler *Mang) const {
// FIXME: ObjC metadata is currently emitted as internal symbols that have
// \1L and \0l prefixes on them. Fix them to be Private/LinkerPrivate and
// this horrible hack can go away.
- MCSymbol *Sym = Mang->getSymbol(GV);
+ MCSymbol *Sym = getSymbol(*Mang, GV);
if (Sym->getName()[0] == 'L' || Sym->getName()[0] == 'l')
return false;
}
@@ -642,7 +643,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
GV->hasHiddenVisibility() ? MachOMMI.getHiddenGVStubEntry(SSym) :
MachOMMI.getGVStubEntry(SSym);
if (StubSym.getPointer() == 0) {
- MCSymbol *Sym = Mang->getSymbol(GV);
+ MCSymbol *Sym = getSymbol(*Mang, GV);
StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
}
@@ -671,7 +672,7 @@ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
if (StubSym.getPointer() == 0) {
- MCSymbol *Sym = Mang->getSymbol(GV);
+ MCSymbol *Sym = getSymbol(*Mang, GV);
StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
}
@@ -732,6 +733,7 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
return getContext().getCOFFSection(Name,
Characteristics,
Kind,
+ "",
Selection);
}
@@ -767,16 +769,22 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
return getContext().getCOFFSection(Name.str(), Characteristics,
- Kind, COFF::IMAGE_COMDAT_SELECT_ANY);
+ Kind, "", COFF::IMAGE_COMDAT_SELECT_ANY);
}
if (Kind.isText())
- return getTextSection();
+ return TextSection;
if (Kind.isThreadLocal())
- return getTLSDataSection();
+ return TLSDataSection;
- return getDataSection();
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ if (Kind.isBSS())
+ return BSSSection;
+
+ return DataSection;
}
void TargetLoweringObjectFileCOFF::
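
As a rough illustration of what the COFF SelectSectionForGlobal hunk above decides: globals are bucketed by SectionKind, which at the source level corresponds roughly to the mapping below (the section names are typical, not guaranteed, and vary by target and toolchain).

    #include <cstdio>

    const int Table[4] = {1, 2, 3, 4};   // read-only data   -> .rdata / .rodata
    int Counter = 42;                    // initialized data -> .data
    int Scratch[256];                    // zero-initialized -> .bss
    thread_local int TlsSlot;            // thread-local     -> TLS data/BSS section
    int get(int i) { return Table[i]; }  // code             -> .text

    int main() {
      Scratch[0] = get(1) + Counter + TlsSlot;
      std::printf("%d\n", Scratch[0]);
      return 0;
    }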
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index 7a39a4c..f7bf86b 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -22,10 +22,8 @@ using namespace llvm;
bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
// Check to see if we should eliminate non-leaf frame pointers and then
// check to see if we should eliminate all frame pointers.
- bool NoFramePointerElimNonLeaf =
- MF.getFunction()->getFnAttribute("no-frame-pointer-elim-non-leaf")
- .getValueAsString() == "true";
- if (NoFramePointerElimNonLeaf && !NoFramePointerElim) {
+ if (MF.getFunction()->hasFnAttribute("no-frame-pointer-elim-non-leaf") &&
+ !NoFramePointerElim) {
const MachineFrameInfo *MFI = MF.getFrameInfo();
return MFI->hasCalls();
}
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index ffcee1f..5a15243 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -73,6 +73,14 @@ void PrintRegUnit::print(raw_ostream &OS) const {
OS << '~' << TRI->getName(*Roots);
}
+void PrintVRegOrUnit::print(raw_ostream &OS) const {
+ if (TRI && TRI->isVirtualRegister(Unit)) {
+ OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit);
+ return;
+ }
+ PrintRegUnit::print(OS);
+}
+
/// getAllocatableClass - Return the maximal subclass of the given register
/// class that is allocatable, or NULL.
const TargetRegisterClass *
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 64ee9d1..b0f2ca6 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -210,7 +210,8 @@ unsigned TargetSchedModel::computeOperandLatency(
// unit latency (defaultDefLatency may be too conservative).
#ifndef NDEBUG
if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
- && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()) {
+ && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()
+ && SchedModel.isComplete()) {
std::string Err;
raw_string_ostream ss(Err);
ss << "DefIdx " << DefIdx << " exceeds machine model writes for "
@@ -224,10 +225,13 @@ unsigned TargetSchedModel::computeOperandLatency(
return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI);
}
-unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
+unsigned
+TargetSchedModel::computeInstrLatency(const MachineInstr *MI,
+ bool UseDefaultDefLatency) const {
// For the itinerary model, fall back to the old subtarget hook.
// Allow subtargets to compute Bundle latencies outside the machine model.
- if (hasInstrItineraries() || MI->isBundle())
+ if (hasInstrItineraries() || MI->isBundle() ||
+ (!hasInstrSchedModel() && !UseDefaultDefLatency))
return TII->getInstrLatency(&InstrItins, MI);
if (hasInstrSchedModel()) {
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index c52e675..b9a6b47 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1400,7 +1400,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
SlotIndex endIdx =
LIS->getInstructionIndex(MI).getRegSlot(IsEarlyClobber);
- LI.addRange(LiveRange(LastCopyIdx, endIdx, VNI));
+ LI.addSegment(LiveInterval::Segment(LastCopyIdx, endIdx, VNI));
}
}
@@ -1457,7 +1457,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
if (I->end == UseIdx)
- LI.removeRange(LastCopyIdx, UseIdx);
+ LI.removeSegment(LastCopyIdx, UseIdx);
}
} else if (RemovedKillFlag) {
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index a95ebcd..f735ef2 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -24,7 +24,6 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -50,7 +49,6 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<DominatorTree>();
- AU.addPreserved<ProfileInfo>();
}
};
}
@@ -87,9 +85,7 @@ bool UnreachableBlockElim::runOnFunction(Function &F) {
}
// Actually remove the blocks now.
- ProfileInfo *PI = getAnalysisIfAvailable<ProfileInfo>();
for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
- if (PI) PI->removeBlock(DeadBlocks[i]);
DeadBlocks[i]->eraseFromParent();
}
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index cd012d2..e0aa405 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -264,15 +265,36 @@ void VirtRegRewriter::rewrite() {
SmallVector<unsigned, 8> SuperDeads;
SmallVector<unsigned, 8> SuperDefs;
SmallVector<unsigned, 8> SuperKills;
+ SmallPtrSet<const MachineInstr *, 4> NoReturnInsts;
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
DEBUG(MBBI->print(dbgs(), Indexes));
+ bool IsExitBB = MBBI->succ_empty();
for (MachineBasicBlock::instr_iterator
MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
MachineInstr *MI = MII;
++MII;
+ // Check if this instruction is a call to a noreturn function.
+ // If so, all the definitions set by this instruction can be ignored.
+ if (IsExitBB && MI->isCall())
+ for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+ MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+ MachineOperand &MO = *MOI;
+ if (!MO.isGlobal())
+ continue;
+ const Function *Func = dyn_cast<Function>(MO.getGlobal());
+ if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) ||
+ // We need to keep correct unwind information
+ // even if the function will not return, since the
+ // runtime may need it.
+ !Func->hasFnAttribute(Attribute::NoUnwind))
+ continue;
+ NoReturnInsts.insert(MI);
+ break;
+ }
+
for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
MachineOperand &MO = *MOI;
@@ -353,7 +375,25 @@ void VirtRegRewriter::rewrite() {
}
// Tell MRI about physical registers in use.
- for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
- if (!MRI->reg_nodbg_empty(Reg))
- MRI->setPhysRegUsed(Reg);
+ if (NoReturnInsts.empty()) {
+ for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
+ if (!MRI->reg_nodbg_empty(Reg))
+ MRI->setPhysRegUsed(Reg);
+ } else {
+ for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg) {
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ // Check if this register has a use that will impact the rest of the
+ // code. Uses in debug and noreturn instructions do not impact the
+ // generated code.
+ for (MachineRegisterInfo::reg_nodbg_iterator It =
+ MRI->reg_nodbg_begin(Reg),
+ EndIt = MRI->reg_nodbg_end(); It != EndIt; ++It) {
+ if (!NoReturnInsts.count(&(*It))) {
+ MRI->setPhysRegUsed(Reg);
+ break;
+ }
+ }
+ }
+ }
}
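
A small source-level sketch of the situation the VirtRegMap change above starts to exploit (the names below are invented for illustration): a function marked both noreturn and noexcept lowers to a call carrying the NoReturn and NoUnwind attributes checked in the hunk, so physical registers defined only by such a call in an exit block no longer need to be recorded as used.

    #include <cstdlib>

    // Hypothetical helper: [[noreturn]] plus noexcept is the source-level analogue
    // of the NoReturn + NoUnwind attribute pair the rewriter looks for.
    [[noreturn]] void fail_fast() noexcept { std::abort(); }

    int demo(bool broken) {
      if (broken)
        fail_fast();   // exit block: registers this call defines are dead afterwards
      return 0;
    }

    int main() { return demo(false); }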
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index 4a6221d..61a3fb0 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -12,4 +12,6 @@ add_llvm_library(LLVMDebugInfo
DWARFDebugLoc.cpp
DWARFDebugRangeList.cpp
DWARFFormValue.cpp
+ DWARFTypeUnit.cpp
+ DWARFUnit.cpp
)
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
index 2de62ab..f46fd58 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
@@ -14,37 +14,51 @@
using namespace llvm;
using namespace dwarf;
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr){
- return extract(data, offset_ptr, data.getULEB128(offset_ptr));
+void DWARFAbbreviationDeclaration::clear() {
+ Code = 0;
+ Tag = 0;
+ HasChildren = false;
+ Attributes.clear();
}
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr,
- uint32_t code) {
- Code = code;
- Attribute.clear();
- if (Code) {
- Tag = data.getULEB128(offset_ptr);
- HasChildren = data.getU8(offset_ptr);
+DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
+ clear();
+}
- while (data.isValidOffset(*offset_ptr)) {
- uint16_t attr = data.getULEB128(offset_ptr);
- uint16_t form = data.getULEB128(offset_ptr);
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor Data, uint32_t* OffsetPtr) {
+ clear();
+ Code = Data.getULEB128(OffsetPtr);
+ if (Code == 0) {
+ return false;
+ }
+ Tag = Data.getULEB128(OffsetPtr);
+ uint8_t ChildrenByte = Data.getU8(OffsetPtr);
+ HasChildren = (ChildrenByte == DW_CHILDREN_yes);
- if (attr && form)
- Attribute.push_back(DWARFAttribute(attr, form));
- else
- break;
+ while (true) {
+ uint32_t CurOffset = *OffsetPtr;
+ uint16_t Attr = Data.getULEB128(OffsetPtr);
+ if (CurOffset == *OffsetPtr) {
+ clear();
+ return false;
}
-
- return Tag != 0;
- } else {
- Tag = 0;
- HasChildren = false;
+ CurOffset = *OffsetPtr;
+ uint16_t Form = Data.getULEB128(OffsetPtr);
+ if (CurOffset == *OffsetPtr) {
+ clear();
+ return false;
+ }
+ if (Attr == 0 && Form == 0)
+ break;
+ Attributes.push_back(AttributeSpec(Attr, Form));
}
- return false;
+ if (Tag == 0) {
+ clear();
+ return false;
+ }
+ return true;
}
void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
@@ -55,19 +69,19 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
else
OS << format("DW_TAG_Unknown_%x", getTag());
OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
- for (unsigned i = 0, e = Attribute.size(); i != e; ++i) {
+ for (unsigned i = 0, e = Attributes.size(); i != e; ++i) {
OS << '\t';
- const char *attrString = AttributeString(Attribute[i].getAttribute());
+ const char *attrString = AttributeString(Attributes[i].Attr);
if (attrString)
OS << attrString;
else
- OS << format("DW_AT_Unknown_%x", Attribute[i].getAttribute());
+ OS << format("DW_AT_Unknown_%x", Attributes[i].Attr);
OS << '\t';
- const char *formString = FormEncodingString(Attribute[i].getForm());
+ const char *formString = FormEncodingString(Attributes[i].Form);
if (formString)
OS << formString;
else
- OS << format("DW_FORM_Unknown_%x", Attribute[i].getForm());
+ OS << format("DW_FORM_Unknown_%x", Attributes[i].Form);
OS << '\n';
}
OS << '\n';
@@ -75,8 +89,8 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
uint32_t
DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
- for (uint32_t i = 0, e = Attribute.size(); i != e; ++i) {
- if (Attribute[i].getAttribute() == attr)
+ for (uint32_t i = 0, e = Attributes.size(); i != e; ++i) {
+ if (Attributes[i].Attr == attr)
return i;
}
return -1U;
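
For reference, a self-contained sketch of the ULEB128 encoding that Data.getULEB128() consumes in the rewritten extract() above; the test vector is the worked example from the DWARF specification.

    #include <cstdint>
    #include <cstdio>

    // Decode one unsigned LEB128 value; each byte contributes 7 payload bits,
    // and the high bit marks that another byte follows.
    static uint64_t decodeULEB128(const uint8_t *p, unsigned *Length) {
      uint64_t Value = 0;
      unsigned Shift = 0, i = 0;
      uint8_t Byte;
      do {
        Byte = p[i++];
        Value |= uint64_t(Byte & 0x7f) << Shift;
        Shift += 7;
      } while (Byte & 0x80);
      if (Length)
        *Length = i;
      return Value;
    }

    int main() {
      const uint8_t Enc[] = {0xe5, 0x8e, 0x26}; // encodes 624485
      unsigned Len = 0;
      uint64_t V = decodeULEB128(Enc, &Len);
      std::printf("%llu (%u bytes)\n", (unsigned long long)V, Len);
      return 0;
    }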
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
index 9a3fcd8..e9b072e 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.h
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
@@ -10,7 +10,6 @@
#ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
#define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
-#include "DWARFAttribute.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/DataExtractor.h"
@@ -22,31 +21,33 @@ class DWARFAbbreviationDeclaration {
uint32_t Code;
uint32_t Tag;
bool HasChildren;
- SmallVector<DWARFAttribute, 8> Attribute;
+
+ struct AttributeSpec {
+ AttributeSpec(uint16_t Attr, uint16_t Form) : Attr(Attr), Form(Form) {}
+ uint16_t Attr;
+ uint16_t Form;
+ };
+ SmallVector<AttributeSpec, 8> Attributes;
public:
- enum { InvalidCode = 0 };
- DWARFAbbreviationDeclaration()
- : Code(InvalidCode), Tag(0), HasChildren(0) {}
+ DWARFAbbreviationDeclaration();
uint32_t getCode() const { return Code; }
uint32_t getTag() const { return Tag; }
bool hasChildren() const { return HasChildren; }
- uint32_t getNumAttributes() const { return Attribute.size(); }
+ uint32_t getNumAttributes() const { return Attributes.size(); }
uint16_t getAttrByIndex(uint32_t idx) const {
- return Attribute.size() > idx ? Attribute[idx].getAttribute() : 0;
+ return idx < Attributes.size() ? Attributes[idx].Attr : 0;
}
uint16_t getFormByIndex(uint32_t idx) const {
- return Attribute.size() > idx ? Attribute[idx].getForm() : 0;
+ return idx < Attributes.size() ? Attributes[idx].Form : 0;
}
uint32_t findAttributeIndex(uint16_t attr) const;
- bool extract(DataExtractor data, uint32_t* offset_ptr);
- bool extract(DataExtractor data, uint32_t* offset_ptr, uint32_t code);
- bool isValid() const { return Code != 0 && Tag != 0; }
+ bool extract(DataExtractor Data, uint32_t* OffsetPtr);
void dump(raw_ostream &OS) const;
- const SmallVectorImpl<DWARFAttribute> &getAttributes() const {
- return Attribute;
- }
+
+private:
+ void clear();
};
}
diff --git a/lib/DebugInfo/DWARFAttribute.h b/lib/DebugInfo/DWARFAttribute.h
deleted file mode 100644
index 6f49b63..0000000
--- a/lib/DebugInfo/DWARFAttribute.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===-- DWARFAttribute.h ----------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_DWARFATTRIBUTE_H
-#define LLVM_DEBUGINFO_DWARFATTRIBUTE_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-class DWARFAttribute {
- uint16_t Attribute;
- uint16_t Form;
- public:
- DWARFAttribute(uint16_t attr, uint16_t form)
- : Attribute(attr), Form(form) {}
-
- uint16_t getAttribute() const { return Attribute; }
- uint16_t getForm() const { return Form; }
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
index 93b1622..33869d8 100644
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -8,96 +8,18 @@
//===----------------------------------------------------------------------===//
#include "DWARFCompileUnit.h"
-#include "DWARFContext.h"
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace dwarf;
-
-DataExtractor DWARFCompileUnit::getDebugInfoExtractor() const {
- return DataExtractor(InfoSection, isLittleEndian, AddrSize);
-}
-
-bool DWARFCompileUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
- clear();
-
- Offset = *offset_ptr;
-
- if (debug_info.isValidOffset(*offset_ptr)) {
- uint64_t abbrOffset;
- Length = debug_info.getU32(offset_ptr);
- Version = debug_info.getU16(offset_ptr);
- abbrOffset = debug_info.getU32(offset_ptr);
- AddrSize = debug_info.getU8(offset_ptr);
-
- bool lengthOK = debug_info.isValidOffset(getNextCompileUnitOffset()-1);
- bool versionOK = DWARFContext::isSupportedVersion(Version);
- bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
- bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
-
- if (lengthOK && versionOK && addrSizeOK && abbrOffsetOK && Abbrev != NULL) {
- Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
- return true;
- }
-
- // reset the offset to where we tried to parse from if anything went wrong
- *offset_ptr = Offset;
- }
-
- return false;
-}
-uint32_t
-DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
- const DWARFAbbreviationDeclarationSet *abbrevs) {
- clear();
-
- Offset = offset;
-
- if (debug_info_data.isValidOffset(offset)) {
- Length = debug_info_data.getU32(&offset);
- Version = debug_info_data.getU16(&offset);
- bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
- Abbrevs = abbrevs;
- AddrSize = debug_info_data.getU8(&offset);
-
- bool versionOK = DWARFContext::isSupportedVersion(Version);
- bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
-
- if (versionOK && addrSizeOK && abbrevsOK &&
- debug_info_data.isValidOffset(offset))
- return offset;
- }
- return 0;
-}
-
-bool DWARFCompileUnit::extractRangeList(uint32_t RangeListOffset,
- DWARFDebugRangeList &RangeList) const {
- // Require that compile unit is extracted.
- assert(DieArray.size() > 0);
- DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
- return RangeList.extract(RangesData, &RangeListOffset);
-}
-
-void DWARFCompileUnit::clear() {
- Offset = 0;
- Length = 0;
- Version = 0;
- Abbrevs = 0;
- AddrSize = 0;
- BaseAddr = 0;
- clearDIEs(false);
-}
+using namespace llvm;
void DWARFCompileUnit::dump(raw_ostream &OS) {
- OS << format("0x%08x", Offset) << ": Compile Unit:"
- << " length = " << format("0x%08x", Length)
- << " version = " << format("0x%04x", Version)
- << " abbr_offset = " << format("0x%04x", Abbrevs->getOffset())
- << " addr_size = " << format("0x%02x", AddrSize)
- << " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
+ OS << format("0x%08x", getOffset()) << ": Compile Unit:"
+ << " length = " << format("0x%08x", getLength())
+ << " version = " << format("0x%04x", getVersion())
+ << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+ << " addr_size = " << format("0x%02x", getAddressByteSize())
+ << " (next unit at " << format("0x%08x", getNextUnitOffset())
<< ")\n";
const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
@@ -105,174 +27,6 @@ void DWARFCompileUnit::dump(raw_ostream &OS) {
CU->dump(OS, this, -1U);
}
-const char *DWARFCompileUnit::getCompilationDir() {
- extractDIEsIfNeeded(true);
- if (DieArray.empty())
- return 0;
- return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
-}
-
-void DWARFCompileUnit::setDIERelations() {
- if (DieArray.empty())
- return;
- DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
- DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
- DWARFDebugInfoEntryMinimal *curr_die;
- // We purposely are skipping the last element in the array in the loop below
- // so that we can always have a valid next item
- for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
- // Since our loop doesn't include the last element, we can always
- // safely access the next die in the array.
- DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
-
- const DWARFAbbreviationDeclaration *curr_die_abbrev =
- curr_die->getAbbreviationDeclarationPtr();
-
- if (curr_die_abbrev) {
- // Normal DIE
- if (curr_die_abbrev->hasChildren())
- next_die->setParent(curr_die);
- else
- curr_die->setSibling(next_die);
- } else {
- // NULL DIE that terminates a sibling chain
- DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
- if (parent)
- parent->setSibling(next_die);
- }
- }
-
- // Since we skipped the last element, we need to fix it up!
- if (die_array_begin < die_array_end)
- curr_die->setParent(die_array_begin);
-}
-
-void DWARFCompileUnit::extractDIEsToVector(
- bool AppendCUDie, bool AppendNonCUDies,
- std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
- if (!AppendCUDie && !AppendNonCUDies)
- return;
-
- // Set the offset to that of the first DIE and calculate the start of the
- // next compilation unit header.
- uint32_t Offset = getFirstDIEOffset();
- uint32_t NextCUOffset = getNextCompileUnitOffset();
- DWARFDebugInfoEntryMinimal DIE;
- uint32_t Depth = 0;
- const uint8_t *FixedFormSizes =
- DWARFFormValue::getFixedFormSizes(getAddressByteSize(), getVersion());
- bool IsCUDie = true;
-
- while (Offset < NextCUOffset &&
- DIE.extractFast(this, FixedFormSizes, &Offset)) {
- if (IsCUDie) {
- if (AppendCUDie)
- Dies.push_back(DIE);
- if (!AppendNonCUDies)
- break;
- // The average bytes per DIE entry has been seen to be
- // around 14-20 so let's pre-reserve the needed memory for
- // our DIE entries accordingly.
- Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
- IsCUDie = false;
- } else {
- Dies.push_back(DIE);
- }
-
- const DWARFAbbreviationDeclaration *AbbrDecl =
- DIE.getAbbreviationDeclarationPtr();
- if (AbbrDecl) {
- // Normal DIE
- if (AbbrDecl->hasChildren())
- ++Depth;
- } else {
- // NULL DIE.
- if (Depth > 0)
- --Depth;
- if (Depth == 0)
- break; // We are done with this compile unit!
- }
- }
-
- // Give a little bit of info if we encounter corrupt DWARF (our offset
- // should always terminate at or before the start of the next compilation
- // unit header).
- if (Offset > NextCUOffset)
- fprintf(stderr, "warning: DWARF compile unit extends beyond its "
- "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), Offset);
-}
-
-size_t DWARFCompileUnit::extractDIEsIfNeeded(bool CUDieOnly) {
- if ((CUDieOnly && DieArray.size() > 0) ||
- DieArray.size() > 1)
- return 0; // Already parsed.
-
- extractDIEsToVector(DieArray.empty(), !CUDieOnly, DieArray);
-
- // Set the base address of current compile unit.
- if (!DieArray.empty()) {
- uint64_t BaseAddr =
- DieArray[0].getAttributeValueAsUnsigned(this, DW_AT_low_pc, -1U);
- if (BaseAddr == -1U)
- BaseAddr = DieArray[0].getAttributeValueAsUnsigned(this, DW_AT_entry_pc, 0);
- setBaseAddress(BaseAddr);
- }
-
- setDIERelations();
- return DieArray.size();
-}
-
-void DWARFCompileUnit::clearDIEs(bool KeepCUDie) {
- if (DieArray.size() > (unsigned)KeepCUDie) {
- // std::vectors never get any smaller when resized to a smaller size,
- // or when clear() or erase() are called, the size will report that it
- // is smaller, but the memory allocated remains intact (call capacity()
- // to see this). So we need to create a temporary vector and swap the
- // contents which will cause just the internal pointers to be swapped
- // so that when temporary vector goes out of scope, it will destroy the
- // contents.
- std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
- DieArray.swap(TmpArray);
- // Save at least the compile unit DIE
- if (KeepCUDie)
- DieArray.push_back(TmpArray.front());
- }
-}
-
-void
-DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
- bool clear_dies_if_already_not_parsed){
- // This function is usually called if there in no .debug_aranges section
- // in order to produce a compile unit level set of address ranges that
- // is accurate. If the DIEs weren't parsed, then we don't want all dies for
- // all compile units to stay loaded when they weren't needed. So we can end
- // up parsing the DWARF and then throwing them all away to keep memory usage
- // down.
- const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
- clear_dies_if_already_not_parsed;
- DieArray[0].buildAddressRangeTable(this, debug_aranges);
-
- // Keep memory down by clearing DIEs if this generate function
- // caused them to be parsed.
- if (clear_dies)
- clearDIEs(true);
-}
-
-DWARFDebugInfoEntryInlinedChain
-DWARFCompileUnit::getInlinedChainForAddress(uint64_t Address) {
- // First, find a subprogram that contains the given address (the root
- // of inlined chain).
- extractDIEsIfNeeded(false);
- const DWARFDebugInfoEntryMinimal *SubprogramDIE = 0;
- for (size_t i = 0, n = DieArray.size(); i != n; i++) {
- if (DieArray[i].isSubprogramDIE() &&
- DieArray[i].addressRangeContainsAddress(this, Address)) {
- SubprogramDIE = &DieArray[i];
- break;
- }
- }
- // Get inlined chain rooted at this subprogram DIE.
- if (!SubprogramDIE)
- return DWARFDebugInfoEntryInlinedChain();
- return SubprogramDIE->getInlinedChainForAddress(this, Address);
+// VTable anchor.
+DWARFCompileUnit::~DWARFCompileUnit() {
}
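
The clearDIEs() body deleted above (presumably relocated to the new shared DWARFUnit implementation) leaned on a standard idiom its comment described: clear() and resize() never shrink a vector's allocation, so the storage is released by swapping with a temporary. A minimal sketch:

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> DieArray(1 << 20, 7);
      DieArray.clear();                       // size() is now 0, capacity() unchanged
      std::printf("after clear: capacity=%zu\n", DieArray.capacity());
      std::vector<int>().swap(DieArray);      // swapping with a temporary frees the storage
      std::printf("after swap:  capacity=%zu\n", DieArray.capacity());
      return 0;
    }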
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index dc2214b..1c9573b 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -10,118 +10,19 @@
#ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
#define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
-#include "DWARFDebugAbbrev.h"
-#include "DWARFDebugInfoEntry.h"
-#include "DWARFDebugRangeList.h"
-#include "DWARFRelocMap.h"
-#include <vector>
+#include "DWARFUnit.h"
namespace llvm {
-class DWARFDebugAbbrev;
-class StringRef;
-class raw_ostream;
-
-class DWARFCompileUnit {
- const DWARFDebugAbbrev *Abbrev;
- StringRef InfoSection;
- StringRef AbbrevSection;
- StringRef RangeSection;
- StringRef StringSection;
- StringRef StringOffsetSection;
- StringRef AddrOffsetSection;
- const RelocAddrMap *RelocMap;
- bool isLittleEndian;
-
- uint32_t Offset;
- uint32_t Length;
- uint16_t Version;
- const DWARFAbbreviationDeclarationSet *Abbrevs;
- uint8_t AddrSize;
- uint64_t BaseAddr;
- // The compile unit debug information entry items.
- std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+class DWARFCompileUnit : public DWARFUnit {
public:
-
DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
- const RelocAddrMap *M, bool LE) :
- Abbrev(DA), InfoSection(IS), AbbrevSection(AS),
- RangeSection(RS), StringSection(SS), StringOffsetSection(SOS),
- AddrOffsetSection(AOS), RelocMap(M), isLittleEndian(LE) {
- clear();
- }
-
- StringRef getStringSection() const { return StringSection; }
- StringRef getStringOffsetSection() const { return StringOffsetSection; }
- StringRef getAddrOffsetSection() const { return AddrOffsetSection; }
- const RelocAddrMap *getRelocMap() const { return RelocMap; }
- DataExtractor getDebugInfoExtractor() const;
-
- bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
- uint32_t extract(uint32_t offset, DataExtractor debug_info_data,
- const DWARFAbbreviationDeclarationSet *abbrevs);
-
- /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
- /// hasn't already been done. Returns the number of DIEs parsed at this call.
- size_t extractDIEsIfNeeded(bool CUDieOnly);
- /// extractRangeList - extracts the range list referenced by this compile
- /// unit from .debug_ranges section. Returns true on success.
- /// Requires that compile unit is already extracted.
- bool extractRangeList(uint32_t RangeListOffset,
- DWARFDebugRangeList &RangeList) const;
- void clear();
+ const RelocAddrMap *M, bool LE)
+ : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
void dump(raw_ostream &OS);
- uint32_t getOffset() const { return Offset; }
- /// Size in bytes of the compile unit header.
- uint32_t getSize() const { return 11; }
- bool containsDIEOffset(uint32_t die_offset) const {
- return die_offset >= getFirstDIEOffset() &&
- die_offset < getNextCompileUnitOffset();
- }
- uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
- uint32_t getNextCompileUnitOffset() const { return Offset + Length + 4; }
- /// Size in bytes of the .debug_info data associated with this compile unit.
- size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
- uint32_t getLength() const { return Length; }
- uint16_t getVersion() const { return Version; }
- const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
- return Abbrevs;
- }
- uint8_t getAddressByteSize() const { return AddrSize; }
- uint64_t getBaseAddress() const { return BaseAddr; }
-
- void setBaseAddress(uint64_t base_addr) {
- BaseAddr = base_addr;
- }
-
- const DWARFDebugInfoEntryMinimal *
- getCompileUnitDIE(bool extract_cu_die_only = true) {
- extractDIEsIfNeeded(extract_cu_die_only);
- return DieArray.empty() ? NULL : &DieArray[0];
- }
-
- const char *getCompilationDir();
-
- /// setDIERelations - We read in all of the DIE entries into our flat list
- /// of DIE entries and now we need to go back through all of them and set the
- /// parent, sibling and child pointers for quick DIE navigation.
- void setDIERelations();
-
- void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
- bool clear_dies_if_already_not_parsed);
-
- /// getInlinedChainForAddress - fetches inlined chain for a given address.
- /// Returns empty chain if there is no subprogram containing address. The
- /// chain is valid as long as parsed compile unit DIEs are not cleared.
- DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
-
-private:
- /// extractDIEsToVector - Appends all parsed DIEs to a vector.
- void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
- std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
- /// clearDIEs - Clear parsed DIEs to keep memory usage low.
- void clearDIEs(bool KeepCUDie);
+ // VTable anchor.
+ ~DWARFCompileUnit() LLVM_OVERRIDE;
};
}
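
The out-of-line destructor marked "VTable anchor" above is an instance of a common idiom, sketched below with invented names: giving a polymorphic class one non-inline virtual member gives its vtable (and, on common ABIs, its type info) a single home object file, instead of weak copies being emitted in every translation unit that uses the class.

    #include <cstdio>

    struct Widget {
      virtual ~Widget();                      // declared, but deliberately not defined inline
      virtual int kind() const { return 1; }
    };

    // Defined out of line in exactly one .cpp file; that file is where the vtable
    // is emitted. DWARFCompileUnit::~DWARFCompileUnit plays the same role above.
    Widget::~Widget() {}

    int main() {
      Widget W;
      std::printf("kind=%d\n", W.kind());
      return 0;
    }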
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 51ad645..e477190 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -23,6 +23,41 @@ using namespace object;
typedef DWARFDebugLine::LineTable DWARFLineTable;
+DWARFContext::~DWARFContext() {
+ DeleteContainerPointers(CUs);
+ DeleteContainerPointers(TUs);
+ DeleteContainerPointers(DWOCUs);
+}
+
+static void dumpPubSection(raw_ostream &OS, StringRef Name, StringRef Data,
+ bool LittleEndian, bool GnuStyle) {
+ OS << "\n." << Name << " contents:\n";
+ DataExtractor pubNames(Data, LittleEndian, 0);
+ uint32_t offset = 0;
+ OS << "length = " << format("0x%08x", pubNames.getU32(&offset));
+ OS << " version = " << format("0x%04x", pubNames.getU16(&offset));
+ OS << " unit_offset = " << format("0x%08x", pubNames.getU32(&offset));
+ OS << " unit_size = " << format("0x%08x", pubNames.getU32(&offset)) << '\n';
+ if (GnuStyle)
+ OS << "Offset Linkage Kind Name\n";
+ else
+ OS << "Offset Name\n";
+
+ while (offset < Data.size()) {
+ uint32_t dieRef = pubNames.getU32(&offset);
+ if (dieRef == 0)
+ break;
+ OS << format("0x%8.8x ", dieRef);
+ if (GnuStyle) {
+ PubIndexEntryDescriptor desc(pubNames.getU8(&offset));
+ OS << format("%-8s", dwarf::GDBIndexEntryLinkageString(desc.Linkage))
+ << ' ' << format("%-8s", dwarf::GDBIndexEntryKindString(desc.Kind))
+ << ' ';
+ }
+ OS << '\"' << pubNames.getCStr(&offset) << "\"\n";
+ }
+}
+
void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
OS << ".debug_abbrev contents:\n";
@@ -35,8 +70,14 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
getCompileUnitAtIndex(i)->dump(OS);
}
+ if (DumpType == DIDT_All || DumpType == DIDT_Types) {
+ OS << "\n.debug_types contents:\n";
+ for (unsigned i = 0, e = getNumTypeUnits(); i != e; ++i)
+ getTypeUnitAtIndex(i)->dump(OS);
+ }
+
if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
- OS << ".debug_loc contents:\n";
+ OS << "\n.debug_loc contents:\n";
getDebugLoc()->dump(OS);
}
@@ -61,13 +102,13 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
savedAddressByteSize = cu->getAddressByteSize();
unsigned stmtOffset =
- cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
- -1U);
+ cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+ cu, DW_AT_stmt_list, -1U);
if (stmtOffset != -1U) {
- DataExtractor lineData(getLineSection(), isLittleEndian(),
+ DataExtractor lineData(getLineSection().Data, isLittleEndian(),
savedAddressByteSize);
DWARFDebugLine::DumpingState state(OS);
- DWARFDebugLine::parseStatementTable(lineData, &lineRelocMap(), &stmtOffset, state);
+ DWARFDebugLine::parseStatementTable(lineData, &getLineSection().Relocs, &stmtOffset, state);
}
}
}
@@ -97,23 +138,21 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
rangeList.dump(OS);
}
- if (DumpType == DIDT_All || DumpType == DIDT_Pubnames) {
- OS << "\n.debug_pubnames contents:\n";
- DataExtractor pubNames(getPubNamesSection(), isLittleEndian(), 0);
- offset = 0;
- OS << "Length: " << pubNames.getU32(&offset) << "\n";
- OS << "Version: " << pubNames.getU16(&offset) << "\n";
- OS << "Offset in .debug_info: " << pubNames.getU32(&offset) << "\n";
- OS << "Size: " << pubNames.getU32(&offset) << "\n";
- OS << "\n Offset Name\n";
- while (offset < getPubNamesSection().size()) {
- uint32_t n = pubNames.getU32(&offset);
- if (n == 0)
- break;
- OS << format("%8x ", n);
- OS << pubNames.getCStr(&offset) << "\n";
- }
- }
+ if (DumpType == DIDT_All || DumpType == DIDT_Pubnames)
+ dumpPubSection(OS, "debug_pubnames", getPubNamesSection(),
+ isLittleEndian(), false);
+
+ if (DumpType == DIDT_All || DumpType == DIDT_Pubtypes)
+ dumpPubSection(OS, "debug_pubtypes", getPubTypesSection(),
+ isLittleEndian(), false);
+
+ if (DumpType == DIDT_All || DumpType == DIDT_GnuPubnames)
+ dumpPubSection(OS, "debug_gnu_pubnames", getGnuPubNamesSection(),
+ isLittleEndian(), true /* GnuStyle */);
+
+ if (DumpType == DIDT_All || DumpType == DIDT_GnuPubtypes)
+ dumpPubSection(OS, "debug_gnu_pubtypes", getGnuPubTypesSection(),
+ isLittleEndian(), true /* GnuStyle */);
if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo) {
const DWARFDebugAbbrev *D = getDebugAbbrevDWO();
@@ -180,8 +219,8 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() {
if (Loc)
return Loc.get();
- DataExtractor LocData(getLocSection(), isLittleEndian(), 0);
- Loc.reset(new DWARFDebugLoc(locRelocMap()));
+ DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0);
+ Loc.reset(new DWARFDebugLoc(getLocSection().Relocs));
// assume all compile units have the same address byte size
if (getNumCompileUnits())
Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize());
@@ -192,13 +231,7 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() {
if (Aranges)
return Aranges.get();
- DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
-
Aranges.reset(new DWARFDebugAranges());
- Aranges->extract(arangesData);
- // Generate aranges from DIEs: even if .debug_aranges section is present,
- // it may describe only a small subset of compilation units, so we need to
- // manually build aranges for the rest of them.
Aranges->generate(this);
return Aranges.get();
}
@@ -226,11 +259,11 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
const DWARFLineTable *
DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
if (!Line)
- Line.reset(new DWARFDebugLine(&lineRelocMap()));
+ Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
unsigned stmtOffset =
- cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
- -1U);
+ cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+ cu, DW_AT_stmt_list, -1U);
if (stmtOffset == -1U)
return 0; // No line table for this compile unit.
@@ -239,64 +272,79 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
return lt;
// We have to parse it first.
- DataExtractor lineData(getLineSection(), isLittleEndian(),
+ DataExtractor lineData(getLineSection().Data, isLittleEndian(),
cu->getAddressByteSize());
return Line->getOrParseLineTable(lineData, stmtOffset);
}
void DWARFContext::parseCompileUnits() {
uint32_t offset = 0;
- const DataExtractor &DIData = DataExtractor(getInfoSection(),
+ const DataExtractor &DIData = DataExtractor(getInfoSection().Data,
isLittleEndian(), 0);
while (DIData.isValidOffset(offset)) {
- CUs.push_back(DWARFCompileUnit(getDebugAbbrev(), getInfoSection(),
- getAbbrevSection(), getRangeSection(),
- getStringSection(), StringRef(),
- getAddrSection(),
- &infoRelocMap(),
- isLittleEndian()));
- if (!CUs.back().extract(DIData, &offset)) {
- CUs.pop_back();
+ OwningPtr<DWARFCompileUnit> CU(new DWARFCompileUnit(
+ getDebugAbbrev(), getInfoSection().Data, getAbbrevSection(),
+ getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+ &getInfoSection().Relocs, isLittleEndian()));
+ if (!CU->extract(DIData, &offset)) {
break;
}
+ CUs.push_back(CU.take());
+ offset = CUs.back()->getNextUnitOffset();
+ }
+}
- offset = CUs.back().getNextCompileUnitOffset();
+void DWARFContext::parseTypeUnits() {
+ const std::map<object::SectionRef, Section> &Sections = getTypesSections();
+ for (std::map<object::SectionRef, Section>::const_iterator
+ I = Sections.begin(),
+ E = Sections.end();
+ I != E; ++I) {
+ uint32_t offset = 0;
+ const DataExtractor &DIData =
+ DataExtractor(I->second.Data, isLittleEndian(), 0);
+ while (DIData.isValidOffset(offset)) {
+ OwningPtr<DWARFTypeUnit> TU(new DWARFTypeUnit(
+ getDebugAbbrev(), I->second.Data, getAbbrevSection(),
+ getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+ &I->second.Relocs, isLittleEndian()));
+ if (!TU->extract(DIData, &offset))
+ break;
+ TUs.push_back(TU.take());
+ offset = TUs.back()->getNextUnitOffset();
+ }
}
}
void DWARFContext::parseDWOCompileUnits() {
uint32_t offset = 0;
- const DataExtractor &DIData = DataExtractor(getInfoDWOSection(),
- isLittleEndian(), 0);
+ const DataExtractor &DIData =
+ DataExtractor(getInfoDWOSection().Data, isLittleEndian(), 0);
while (DIData.isValidOffset(offset)) {
- DWOCUs.push_back(DWARFCompileUnit(getDebugAbbrevDWO(), getInfoDWOSection(),
- getAbbrevDWOSection(),
- getRangeDWOSection(),
- getStringDWOSection(),
- getStringOffsetDWOSection(),
- getAddrSection(),
- &infoDWORelocMap(),
- isLittleEndian()));
- if (!DWOCUs.back().extract(DIData, &offset)) {
- DWOCUs.pop_back();
+ OwningPtr<DWARFCompileUnit> DWOCU(new DWARFCompileUnit(
+ getDebugAbbrevDWO(), getInfoDWOSection().Data, getAbbrevDWOSection(),
+ getRangeDWOSection(), getStringDWOSection(),
+ getStringOffsetDWOSection(), getAddrSection(),
+ &getInfoDWOSection().Relocs, isLittleEndian()));
+ if (!DWOCU->extract(DIData, &offset)) {
break;
}
-
- offset = DWOCUs.back().getNextCompileUnitOffset();
+ DWOCUs.push_back(DWOCU.take());
+ offset = DWOCUs.back()->getNextUnitOffset();
}
}
namespace {
struct OffsetComparator {
- bool operator()(const DWARFCompileUnit &LHS,
- const DWARFCompileUnit &RHS) const {
- return LHS.getOffset() < RHS.getOffset();
+ bool operator()(const DWARFCompileUnit *LHS,
+ const DWARFCompileUnit *RHS) const {
+ return LHS->getOffset() < RHS->getOffset();
}
- bool operator()(const DWARFCompileUnit &LHS, uint32_t RHS) const {
- return LHS.getOffset() < RHS;
+ bool operator()(const DWARFCompileUnit *LHS, uint32_t RHS) const {
+ return LHS->getOffset() < RHS;
}
- bool operator()(uint32_t LHS, const DWARFCompileUnit &RHS) const {
- return LHS < RHS.getOffset();
+ bool operator()(uint32_t LHS, const DWARFCompileUnit *RHS) const {
+ return LHS < RHS->getOffset();
}
};
}
@@ -305,10 +353,11 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
if (CUs.empty())
parseCompileUnits();
- DWARFCompileUnit *CU = std::lower_bound(CUs.begin(), CUs.end(), Offset,
- OffsetComparator());
- if (CU != CUs.end())
- return &*CU;
+ DWARFCompileUnit **CU =
+ std::lower_bound(CUs.begin(), CUs.end(), Offset, OffsetComparator());
+ if (CU != CUs.end()) {
+ return *CU;
+ }
return 0;
}
@@ -380,7 +429,7 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
CU->getInlinedChainForAddress(Address);
if (InlinedChain.DIEs.size() > 0) {
const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
- if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.CU))
+ if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
FunctionName = Name;
}
}
@@ -413,19 +462,16 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
CU->getInlinedChainForAddress(Address);
if (InlinedChain.DIEs.size() > 0) {
const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
- if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.CU))
+ if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
FunctionName = Name;
}
}
- StringRef FuncNameRef = StringRef(FunctionName);
-
// If the Specifier says we don't need FileLineInfo, just
// return the top-most function at the starting address.
if (!Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
- Lines.push_back(std::make_pair(Address,
- DILineInfo(StringRef("<invalid>"),
- FuncNameRef, 0, 0)));
+ Lines.push_back(
+ std::make_pair(Address, DILineInfo("<invalid>", FunctionName, 0, 0)));
return Lines;
}
@@ -446,9 +492,8 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
std::string FileName = "<invalid>";
getFileNameForCompileUnit(CU, LineTable, Row.File,
NeedsAbsoluteFilePath, FileName);
- Lines.push_back(std::make_pair(Row.Address,
- DILineInfo(StringRef(FileName),
- FuncNameRef, Row.Line, Row.Column)));
+ Lines.push_back(std::make_pair(
+ Row.Address, DILineInfo(FileName, FunctionName, Row.Line, Row.Column)));
}
return Lines;
@@ -476,7 +521,7 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
uint32_t Column = 0;
// Get function name if necessary.
if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
- if (const char *Name = FunctionDIE.getSubroutineName(InlinedChain.CU))
+ if (const char *Name = FunctionDIE.getSubroutineName(InlinedChain.U))
FunctionName = Name;
}
if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
@@ -500,7 +545,7 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
}
// Get call file/line/column of a current DIE.
if (i + 1 < n) {
- FunctionDIE.getCallerFrame(InlinedChain.CU, CallFile, CallLine,
+ FunctionDIE.getCallerFrame(InlinedChain.U, CallFile, CallLine,
CallColumn);
}
}
@@ -557,29 +602,37 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
UncompressedSections.push_back(UncompressedSection.take());
}
- StringRef *Section = StringSwitch<StringRef*>(name)
- .Case("debug_info", &InfoSection)
- .Case("debug_abbrev", &AbbrevSection)
- .Case("debug_loc", &LocSection)
- .Case("debug_line", &LineSection)
- .Case("debug_aranges", &ARangeSection)
- .Case("debug_frame", &DebugFrameSection)
- .Case("debug_str", &StringSection)
- .Case("debug_ranges", &RangeSection)
- .Case("debug_pubnames", &PubNamesSection)
- .Case("debug_info.dwo", &InfoDWOSection)
- .Case("debug_abbrev.dwo", &AbbrevDWOSection)
- .Case("debug_str.dwo", &StringDWOSection)
- .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
- .Case("debug_addr", &AddrSection)
- // Any more debug info sections go here.
- .Default(0);
+ StringRef *Section =
+ StringSwitch<StringRef *>(name)
+ .Case("debug_info", &InfoSection.Data)
+ .Case("debug_abbrev", &AbbrevSection)
+ .Case("debug_loc", &LocSection.Data)
+ .Case("debug_line", &LineSection.Data)
+ .Case("debug_aranges", &ARangeSection)
+ .Case("debug_frame", &DebugFrameSection)
+ .Case("debug_str", &StringSection)
+ .Case("debug_ranges", &RangeSection)
+ .Case("debug_pubnames", &PubNamesSection)
+ .Case("debug_pubtypes", &PubTypesSection)
+ .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+ .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
+ .Case("debug_info.dwo", &InfoDWOSection.Data)
+ .Case("debug_abbrev.dwo", &AbbrevDWOSection)
+ .Case("debug_str.dwo", &StringDWOSection)
+ .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
+ .Case("debug_addr", &AddrSection)
+ // Any more debug info sections go here.
+ .Default(0);
if (Section) {
*Section = data;
if (name == "debug_ranges") {
// FIXME: Use the other dwo range section when we emit it.
RangeDWOSection = data;
}
+ } else if (name == "debug_types") {
+ // Find debug_types data by section rather than name as there are
+ // multiple, comdat grouped, debug_types sections.
+ TypesSections[*i].Data = data;
}
section_iterator RelocatedSection = i->getRelocatedSection();
@@ -594,13 +647,18 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
// TODO: Add support for relocations in other sections as needed.
// Record relocations for the debug_info and debug_line sections.
RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
- .Case("debug_info", &InfoRelocMap)
- .Case("debug_loc", &LocRelocMap)
- .Case("debug_info.dwo", &InfoDWORelocMap)
- .Case("debug_line", &LineRelocMap)
+ .Case("debug_info", &InfoSection.Relocs)
+ .Case("debug_loc", &LocSection.Relocs)
+ .Case("debug_info.dwo", &InfoDWOSection.Relocs)
+ .Case("debug_line", &LineSection.Relocs)
.Default(0);
- if (!Map)
- continue;
+ if (!Map) {
+ if (RelSecName != "debug_types")
+ continue;
+ // Find debug_types relocs by section rather than name as there are
+ // multiple, comdat grouped, debug_types sections.
+ Map = &TypesSections[*RelocatedSection].Relocs;
+ }
if (i->begin_relocations() != i->end_relocations()) {
uint64_t SectionSize;
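
Since compile units are now stored by pointer, OffsetComparator above grows overloads so std::lower_bound can compare a unit pointer directly against a raw offset. A stand-alone sketch of the same pattern (names invented for illustration):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Unit { unsigned Offset; };

    // Mixed-type comparator in the style of OffsetComparator.
    struct ByOffset {
      bool operator()(const Unit *L, const Unit *R) const { return L->Offset < R->Offset; }
      bool operator()(const Unit *L, unsigned R) const { return L->Offset < R; }
      bool operator()(unsigned L, const Unit *R) const { return L < R->Offset; }
    };

    int main() {
      Unit A{0x00}, B{0x40}, C{0x90};
      std::vector<Unit *> Units = {&A, &B, &C};   // kept sorted by Offset
      std::vector<Unit *>::iterator It =
          std::lower_bound(Units.begin(), Units.end(), 0x40u, ByOffset());
      if (It != Units.end())
        std::printf("found unit at offset 0x%x\n", (*It)->Offset);
      return 0;
    }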
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index 5d8f714..03863ab 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -16,6 +16,7 @@
#include "DWARFDebugLine.h"
#include "DWARFDebugLoc.h"
#include "DWARFDebugRangeList.h"
+#include "DWARFTypeUnit.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/DebugInfo/DIContext.h"
@@ -27,14 +28,15 @@ namespace llvm {
/// information parsing. The actual data is supplied through pure virtual
/// methods that a concrete implementation provides.
class DWARFContext : public DIContext {
- SmallVector<DWARFCompileUnit, 1> CUs;
+ SmallVector<DWARFCompileUnit *, 1> CUs;
+ SmallVector<DWARFTypeUnit *, 1> TUs;
OwningPtr<DWARFDebugAbbrev> Abbrev;
OwningPtr<DWARFDebugLoc> Loc;
OwningPtr<DWARFDebugAranges> Aranges;
OwningPtr<DWARFDebugLine> Line;
OwningPtr<DWARFDebugFrame> DebugFrame;
- SmallVector<DWARFCompileUnit, 1> DWOCUs;
+ SmallVector<DWARFCompileUnit *, 1> DWOCUs;
OwningPtr<DWARFDebugAbbrev> AbbrevDWO;
DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
@@ -43,12 +45,21 @@ class DWARFContext : public DIContext {
/// Read compile units from the debug_info section and store them in CUs.
void parseCompileUnits();
+  /// Read type units from the debug_types sections and store them in TUs.
+ void parseTypeUnits();
+
/// Read compile units from the debug_info.dwo section and store them in
/// DWOCUs.
void parseDWOCompileUnits();
public:
+ struct Section {
+ StringRef Data;
+ RelocAddrMap Relocs;
+ };
+
DWARFContext() : DIContext(CK_DWARF) {}
+ virtual ~DWARFContext();
static bool classof(const DIContext *DICtx) {
return DICtx->getKind() == CK_DWARF;
@@ -63,6 +74,13 @@ public:
return CUs.size();
}
+  /// Get the number of type units in this context.
+ unsigned getNumTypeUnits() {
+ if (TUs.empty())
+ parseTypeUnits();
+ return TUs.size();
+ }
+
/// Get the number of compile units in the DWO context.
unsigned getNumDWOCompileUnits() {
if (DWOCUs.empty())
@@ -74,14 +92,21 @@ public:
DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
if (CUs.empty())
parseCompileUnits();
- return &CUs[index];
+ return CUs[index];
+ }
+
+  /// Get the type unit at the specified index for this context.
+ DWARFTypeUnit *getTypeUnitAtIndex(unsigned index) {
+ if (TUs.empty())
+ parseTypeUnits();
+ return TUs[index];
}
/// Get the compile unit at the specified index for the DWO compile units.
DWARFCompileUnit *getDWOCompileUnitAtIndex(unsigned index) {
if (DWOCUs.empty())
parseDWOCompileUnits();
- return &DWOCUs[index];
+ return DWOCUs[index];
}
/// Get a pointer to the parsed DebugAbbrev object.
@@ -112,27 +137,27 @@ public:
virtual bool isLittleEndian() const = 0;
virtual uint8_t getAddressSize() const = 0;
- virtual const RelocAddrMap &infoRelocMap() const = 0;
- virtual const RelocAddrMap &lineRelocMap() const = 0;
- virtual const RelocAddrMap &locRelocMap() const = 0;
- virtual StringRef getInfoSection() = 0;
+ virtual const Section &getInfoSection() = 0;
+ virtual const std::map<object::SectionRef, Section> &getTypesSections() = 0;
virtual StringRef getAbbrevSection() = 0;
- virtual StringRef getLocSection() = 0;
+ virtual const Section &getLocSection() = 0;
virtual StringRef getARangeSection() = 0;
virtual StringRef getDebugFrameSection() = 0;
- virtual StringRef getLineSection() = 0;
+ virtual const Section &getLineSection() = 0;
virtual StringRef getStringSection() = 0;
virtual StringRef getRangeSection() = 0;
virtual StringRef getPubNamesSection() = 0;
+ virtual StringRef getPubTypesSection() = 0;
+ virtual StringRef getGnuPubNamesSection() = 0;
+ virtual StringRef getGnuPubTypesSection() = 0;
// Sections for DWARF5 split dwarf proposal.
- virtual StringRef getInfoDWOSection() = 0;
+ virtual const Section &getInfoDWOSection() = 0;
virtual StringRef getAbbrevDWOSection() = 0;
virtual StringRef getStringDWOSection() = 0;
virtual StringRef getStringOffsetDWOSection() = 0;
virtual StringRef getRangeDWOSection() = 0;
virtual StringRef getAddrSection() = 0;
- virtual const RelocAddrMap &infoDWORelocMap() const = 0;
static bool isSupportedVersion(unsigned version) {
return version == 2 || version == 3 || version == 4;
@@ -153,22 +178,22 @@ class DWARFContextInMemory : public DWARFContext {
virtual void anchor();
bool IsLittleEndian;
uint8_t AddressSize;
- RelocAddrMap InfoRelocMap;
- RelocAddrMap LocRelocMap;
- RelocAddrMap LineRelocMap;
- StringRef InfoSection;
+ Section InfoSection;
+ std::map<object::SectionRef, Section> TypesSections;
StringRef AbbrevSection;
- StringRef LocSection;
+ Section LocSection;
StringRef ARangeSection;
StringRef DebugFrameSection;
- StringRef LineSection;
+ Section LineSection;
StringRef StringSection;
StringRef RangeSection;
StringRef PubNamesSection;
+ StringRef PubTypesSection;
+ StringRef GnuPubNamesSection;
+ StringRef GnuPubTypesSection;
// Sections for DWARF5 split dwarf proposal.
- RelocAddrMap InfoDWORelocMap;
- StringRef InfoDWOSection;
+ Section InfoDWOSection;
StringRef AbbrevDWOSection;
StringRef StringDWOSection;
StringRef StringOffsetDWOSection;
@@ -182,21 +207,24 @@ public:
~DWARFContextInMemory();
virtual bool isLittleEndian() const { return IsLittleEndian; }
virtual uint8_t getAddressSize() const { return AddressSize; }
- virtual const RelocAddrMap &infoRelocMap() const { return InfoRelocMap; }
- virtual const RelocAddrMap &locRelocMap() const { return LocRelocMap; }
- virtual const RelocAddrMap &lineRelocMap() const { return LineRelocMap; }
- virtual StringRef getInfoSection() { return InfoSection; }
+ virtual const Section &getInfoSection() { return InfoSection; }
+ virtual const std::map<object::SectionRef, Section> &getTypesSections() {
+ return TypesSections;
+ }
virtual StringRef getAbbrevSection() { return AbbrevSection; }
- virtual StringRef getLocSection() { return LocSection; }
+ virtual const Section &getLocSection() { return LocSection; }
virtual StringRef getARangeSection() { return ARangeSection; }
virtual StringRef getDebugFrameSection() { return DebugFrameSection; }
- virtual StringRef getLineSection() { return LineSection; }
+ virtual const Section &getLineSection() { return LineSection; }
virtual StringRef getStringSection() { return StringSection; }
virtual StringRef getRangeSection() { return RangeSection; }
virtual StringRef getPubNamesSection() { return PubNamesSection; }
+ virtual StringRef getPubTypesSection() { return PubTypesSection; }
+ virtual StringRef getGnuPubNamesSection() { return GnuPubNamesSection; }
+ virtual StringRef getGnuPubTypesSection() { return GnuPubTypesSection; }
// Sections for DWARF5 split dwarf proposal.
- virtual StringRef getInfoDWOSection() { return InfoDWOSection; }
+ virtual const Section &getInfoDWOSection() { return InfoDWOSection; }
virtual StringRef getAbbrevDWOSection() { return AbbrevDWOSection; }
virtual StringRef getStringDWOSection() { return StringDWOSection; }
virtual StringRef getStringOffsetDWOSection() {
@@ -206,9 +234,6 @@ public:
virtual StringRef getAddrSection() {
return AddrSection;
}
- virtual const RelocAddrMap &infoDWORelocMap() const {
- return InfoDWORelocMap;
- }
};
}
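Taken together, the DWARFContext.h changes replace the per-section RelocAddrMap accessors with a small Section struct (raw bytes plus relocations) and add lazy type-unit parsing alongside the existing compile-unit parsing. A minimal usage sketch, assuming the in-tree lib/DebugInfo headers are on the include path and that Ctx is an already-constructed DWARFContextInMemory; the function and variable names are illustrative, not part of the patch:

    #include "DWARFContext.h"
    #include "DWARFTypeUnit.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void dumpAllTypeUnits(DWARFContext &Ctx, raw_ostream &OS) {
      // Parsing is lazy: getNumTypeUnits() triggers parseTypeUnits() on first use.
      for (unsigned i = 0, n = Ctx.getNumTypeUnits(); i != n; ++i) {
        DWARFTypeUnit *TU = Ctx.getTypeUnitAtIndex(i);
        TU->dump(OS); // dump() is declared in DWARFTypeUnit.h further down
      }
    }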
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARFDebugArangeSet.cpp
index 7dff9ff..229376e 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.cpp
+++ b/lib/DebugInfo/DWARFDebugArangeSet.cpp
@@ -20,32 +20,6 @@ void DWARFDebugArangeSet::clear() {
ArangeDescriptors.clear();
}
-void DWARFDebugArangeSet::compact() {
- if (ArangeDescriptors.empty())
- return;
-
- // Iterate through all arange descriptors and combine any ranges that
- // overlap or have matching boundaries. The ArangeDescriptors are assumed
- // to be in ascending order.
- uint32_t i = 0;
- while (i + 1 < ArangeDescriptors.size()) {
- if (ArangeDescriptors[i].getEndAddress() >= ArangeDescriptors[i+1].Address){
- // The current range ends at or exceeds the start of the next address
- // range. Compute the max end address between the two and use that to
- // make the new length.
- const uint64_t max_end_addr =
- std::max(ArangeDescriptors[i].getEndAddress(),
- ArangeDescriptors[i+1].getEndAddress());
- ArangeDescriptors[i].Length = max_end_addr - ArangeDescriptors[i].Address;
- // Now remove the next entry as it was just combined with the previous one
- ArangeDescriptors.erase(ArangeDescriptors.begin()+i+1);
- } else {
- // Discontiguous address range, just proceed to the next one.
- ++i;
- }
- }
-}
-
bool
DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
if (data.isValidOffset(*offset_ptr)) {
@@ -126,26 +100,3 @@ void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
<< format(" 0x%*.*" PRIx64 ")\n",
hex_width, hex_width, pos->getEndAddress());
}
-
-
-namespace {
- class DescriptorContainsAddress {
- const uint64_t Address;
- public:
- DescriptorContainsAddress(uint64_t address) : Address(address) {}
- bool operator()(const DWARFDebugArangeSet::Descriptor &desc) const {
- return Address >= desc.Address && Address < (desc.Address + desc.Length);
- }
- };
-}
-
-uint32_t DWARFDebugArangeSet::findAddress(uint64_t address) const {
- DescriptorConstIter end = ArangeDescriptors.end();
- DescriptorConstIter pos =
- std::find_if(ArangeDescriptors.begin(), end, // Range
- DescriptorContainsAddress(address)); // Predicate
- if (pos != end)
- return HeaderData.CuOffset;
-
- return -1U;
-}
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/lib/DebugInfo/DWARFDebugArangeSet.h
index d768676..49a7132 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.h
+++ b/lib/DebugInfo/DWARFDebugArangeSet.h
@@ -44,7 +44,6 @@ public:
private:
typedef std::vector<Descriptor> DescriptorColl;
- typedef DescriptorColl::iterator DescriptorIter;
typedef DescriptorColl::const_iterator DescriptorConstIter;
uint32_t Offset;
@@ -54,15 +53,11 @@ private:
public:
DWARFDebugArangeSet() { clear(); }
void clear();
- void compact();
bool extract(DataExtractor data, uint32_t *offset_ptr);
void dump(raw_ostream &OS) const;
uint32_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; }
- uint32_t getOffsetOfNextEntry() const { return Offset + HeaderData.Length + 4; }
- uint32_t findAddress(uint64_t address) const;
uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); }
- const struct Header &getHeader() const { return HeaderData; }
const Descriptor *getDescriptor(uint32_t i) const {
if (i < ArangeDescriptors.size())
return &ArangeDescriptors[i];
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index f79862d..591d4bd 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -16,128 +16,79 @@
#include <cassert>
using namespace llvm;
-// Compare function DWARFDebugAranges::Range structures
-static bool RangeLessThan(const DWARFDebugAranges::Range &range1,
- const DWARFDebugAranges::Range &range2) {
- return range1.LoPC < range2.LoPC;
-}
-
-namespace {
- class CountArangeDescriptors {
- public:
- CountArangeDescriptors(uint32_t &count_ref) : Count(count_ref) {}
- void operator()(const DWARFDebugArangeSet &Set) {
- Count += Set.getNumDescriptors();
- }
- uint32_t &Count;
- };
-
- class AddArangeDescriptors {
- public:
- AddArangeDescriptors(DWARFDebugAranges::RangeColl &Ranges,
- DWARFDebugAranges::ParsedCUOffsetColl &CUOffsets)
- : RangeCollection(Ranges),
- CUOffsetCollection(CUOffsets) {}
- void operator()(const DWARFDebugArangeSet &Set) {
- DWARFDebugAranges::Range Range;
- Range.Offset = Set.getCompileUnitDIEOffset();
- CUOffsetCollection.insert(Range.Offset);
-
- for (uint32_t i = 0, n = Set.getNumDescriptors(); i < n; ++i) {
- const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
- Set.getDescriptor(i);
- Range.LoPC = ArangeDescPtr->Address;
- Range.Length = ArangeDescPtr->Length;
-
- // Insert each item in increasing address order so binary searching
- // can later be done!
- DWARFDebugAranges::RangeColl::iterator InsertPos =
- std::lower_bound(RangeCollection.begin(), RangeCollection.end(),
- Range, RangeLessThan);
- RangeCollection.insert(InsertPos, Range);
- }
-
- }
- DWARFDebugAranges::RangeColl &RangeCollection;
- DWARFDebugAranges::ParsedCUOffsetColl &CUOffsetCollection;
- };
-}
-
-bool DWARFDebugAranges::extract(DataExtractor debug_aranges_data) {
- if (debug_aranges_data.isValidOffset(0)) {
- uint32_t offset = 0;
-
- typedef std::vector<DWARFDebugArangeSet> SetCollection;
- SetCollection sets;
-
- DWARFDebugArangeSet set;
- Range range;
- while (set.extract(debug_aranges_data, &offset))
- sets.push_back(set);
-
- uint32_t count = 0;
-
- std::for_each(sets.begin(), sets.end(), CountArangeDescriptors(count));
-
- if (count > 0) {
- Aranges.reserve(count);
- AddArangeDescriptors range_adder(Aranges, ParsedCUOffsets);
- std::for_each(sets.begin(), sets.end(), range_adder);
- }
+void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
+ if (!DebugArangesData.isValidOffset(0))
+ return;
+ uint32_t Offset = 0;
+ typedef std::vector<DWARFDebugArangeSet> RangeSetColl;
+ RangeSetColl Sets;
+ DWARFDebugArangeSet Set;
+ uint32_t TotalRanges = 0;
+
+ while (Set.extract(DebugArangesData, &Offset)) {
+ Sets.push_back(Set);
+ TotalRanges += Set.getNumDescriptors();
}
- return false;
-}
+ if (TotalRanges == 0)
+ return;
-bool DWARFDebugAranges::generate(DWARFContext *ctx) {
- if (ctx) {
- const uint32_t num_compile_units = ctx->getNumCompileUnits();
- for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) {
- if (DWARFCompileUnit *cu = ctx->getCompileUnitAtIndex(cu_idx)) {
- uint32_t CUOffset = cu->getOffset();
- if (ParsedCUOffsets.insert(CUOffset).second)
- cu->buildAddressRangeTable(this, true);
- }
+ Aranges.reserve(TotalRanges);
+ for (RangeSetColl::const_iterator I = Sets.begin(), E = Sets.end(); I != E;
+ ++I) {
+ uint32_t CUOffset = I->getCompileUnitDIEOffset();
+
+ for (uint32_t i = 0, n = I->getNumDescriptors(); i < n; ++i) {
+ const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
+ I->getDescriptor(i);
+ uint64_t LowPC = ArangeDescPtr->Address;
+ uint64_t HighPC = LowPC + ArangeDescPtr->Length;
+ appendRange(CUOffset, LowPC, HighPC);
}
}
- sort(true, /* overlap size */ 0);
- return !isEmpty();
}
-void DWARFDebugAranges::dump(raw_ostream &OS) const {
- const uint32_t num_ranges = getNumRanges();
- for (uint32_t i = 0; i < num_ranges; ++i) {
- const Range &range = Aranges[i];
- OS << format("0x%8.8x: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
- range.Offset, (uint64_t)range.LoPC, (uint64_t)range.HiPC());
+void DWARFDebugAranges::generate(DWARFContext *CTX) {
+ clear();
+ if (!CTX)
+ return;
+
+ // Extract aranges from .debug_aranges section.
+ DataExtractor ArangesData(CTX->getARangeSection(), CTX->isLittleEndian(), 0);
+ extract(ArangesData);
+
+ // Generate aranges from DIEs: even if .debug_aranges section is present,
+ // it may describe only a small subset of compilation units, so we need to
+ // manually build aranges for the rest of them.
+ for (uint32_t i = 0, n = CTX->getNumCompileUnits(); i < n; ++i) {
+ if (DWARFCompileUnit *CU = CTX->getCompileUnitAtIndex(i)) {
+ uint32_t CUOffset = CU->getOffset();
+ if (ParsedCUOffsets.insert(CUOffset).second)
+ CU->buildAddressRangeTable(this, true, CUOffset);
+ }
}
-}
-void DWARFDebugAranges::Range::dump(raw_ostream &OS) const {
- OS << format("{0x%8.8x}: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
- Offset, LoPC, HiPC());
+ sortAndMinimize();
}
-void DWARFDebugAranges::appendRange(uint32_t offset, uint64_t low_pc,
- uint64_t high_pc) {
+void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
+ uint64_t HighPC) {
if (!Aranges.empty()) {
- if (Aranges.back().Offset == offset && Aranges.back().HiPC() == low_pc) {
- Aranges.back().setHiPC(high_pc);
+ if (Aranges.back().CUOffset == CUOffset &&
+ Aranges.back().HighPC() == LowPC) {
+ Aranges.back().setHighPC(HighPC);
return;
}
}
- Aranges.push_back(Range(low_pc, high_pc, offset));
+ Aranges.push_back(Range(LowPC, HighPC, CUOffset));
}
-void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
+void DWARFDebugAranges::sortAndMinimize() {
const size_t orig_arange_size = Aranges.size();
// Size of one? If so, no sorting is needed
if (orig_arange_size <= 1)
return;
// Sort our address range entries
- std::stable_sort(Aranges.begin(), Aranges.end(), RangeLessThan);
-
- if (!minimize)
- return;
+ std::stable_sort(Aranges.begin(), Aranges.end());
// Most address ranges are contiguous from function to function
// so our new ranges will likely be smaller. We calculate the size
@@ -151,7 +102,7 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
// copy the new minimal stuff over to the new collection.
size_t minimal_size = 1;
for (size_t i = 1; i < orig_arange_size; ++i) {
- if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i], n))
+ if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i]))
++minimal_size;
}
@@ -166,14 +117,14 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
uint32_t j = 0;
minimal_aranges[j] = Aranges[0];
for (size_t i = 1; i < orig_arange_size; ++i) {
- if(Range::SortedOverlapCheck (minimal_aranges[j], Aranges[i], n)) {
- minimal_aranges[j].setHiPC (Aranges[i].HiPC());
+ if (Range::SortedOverlapCheck(minimal_aranges[j], Aranges[i])) {
+ minimal_aranges[j].setHighPC(Aranges[i].HighPC());
} else {
// Only increment j if we aren't merging
minimal_aranges[++j] = Aranges[i];
}
}
- assert (j+1 == minimal_size);
+ assert(j+1 == minimal_size);
// Now swap our new minimal aranges into place. The local
// minimal_aranges will then contain the old big collection
@@ -181,50 +132,21 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
minimal_aranges.swap(Aranges);
}
-uint32_t DWARFDebugAranges::findAddress(uint64_t address) const {
+uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const {
if (!Aranges.empty()) {
- Range range(address);
+ Range range(Address);
RangeCollIterator begin = Aranges.begin();
RangeCollIterator end = Aranges.end();
- RangeCollIterator pos = std::lower_bound(begin, end, range, RangeLessThan);
+ RangeCollIterator pos =
+ std::lower_bound(begin, end, range);
- if (pos != end && pos->LoPC <= address && address < pos->HiPC()) {
- return pos->Offset;
+ if (pos != end && pos->containsAddress(Address)) {
+ return pos->CUOffset;
} else if (pos != begin) {
--pos;
- if (pos->LoPC <= address && address < pos->HiPC())
- return (*pos).Offset;
+ if (pos->containsAddress(Address))
+ return pos->CUOffset;
}
}
return -1U;
}
-
-bool
-DWARFDebugAranges::allRangesAreContiguous(uint64_t &LoPC, uint64_t &HiPC) const{
- if (Aranges.empty())
- return false;
-
- uint64_t next_addr = 0;
- RangeCollIterator begin = Aranges.begin();
- for (RangeCollIterator pos = begin, end = Aranges.end(); pos != end;
- ++pos) {
- if (pos != begin && pos->LoPC != next_addr)
- return false;
- next_addr = pos->HiPC();
- }
- // We checked for empty at the start of function so front() will be valid.
- LoPC = Aranges.front().LoPC;
- // We checked for empty at the start of function so back() will be valid.
- HiPC = Aranges.back().HiPC();
- return true;
-}
-
-bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const {
- if (Aranges.empty())
- return false;
- // We checked for empty at the start of function so front() will be valid.
- LoPC = Aranges.front().LoPC;
- // We checked for empty at the start of function so back() will be valid.
- HiPC = Aranges.back().HiPC();
- return true;
-}
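The net effect of the DWARFDebugAranges.cpp rewrite is a two-step public flow: generate() clears state, ingests whatever .debug_aranges describes, builds ranges from CU DIEs for any compile units the section missed, then sorts and merges; findAddress() binary-searches the result. A minimal caller sketch, assuming Ctx is a valid DWARFContext pointer obtained elsewhere:

    #include "DWARFDebugAranges.h"
    using namespace llvm;

    static uint32_t cuOffsetForAddress(DWARFContext *Ctx, uint64_t PC) {
      DWARFDebugAranges Aranges;
      Aranges.generate(Ctx);           // .debug_aranges plus CU-derived ranges
      return Aranges.findAddress(PC);  // -1U when no range covers PC
    }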
diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h
index 1509ffa..35ad8e5 100644
--- a/lib/DebugInfo/DWARFDebugAranges.h
+++ b/lib/DebugInfo/DWARFDebugAranges.h
@@ -20,81 +20,61 @@ class DWARFContext;
class DWARFDebugAranges {
public:
+ void clear() {
+ Aranges.clear();
+ ParsedCUOffsets.clear();
+ }
+
+ void generate(DWARFContext *CTX);
+
+ // Use appendRange multiple times and then call sortAndMinimize.
+ void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC);
+
+ uint32_t findAddress(uint64_t Address) const;
+
+private:
+ void extract(DataExtractor DebugArangesData);
+ void sortAndMinimize();
+
struct Range {
- explicit Range(uint64_t lo = -1ULL, uint64_t hi = -1ULL,
- uint32_t off = -1U)
- : LoPC(lo), Length(hi-lo), Offset(off) {}
-
- void clear() {
- LoPC = -1ULL;
- Length = 0;
- Offset = -1U;
- }
+ explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL,
+ uint32_t CUOffset = -1U)
+ : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {}
- void setHiPC(uint64_t HiPC) {
- if (HiPC == -1ULL || HiPC <= LoPC)
+ void setHighPC(uint64_t HighPC) {
+ if (HighPC == -1ULL || HighPC <= LowPC)
Length = 0;
else
- Length = HiPC - LoPC;
+ Length = HighPC - LowPC;
}
- uint64_t HiPC() const {
+ uint64_t HighPC() const {
if (Length)
- return LoPC + Length;
+ return LowPC + Length;
return -1ULL;
}
- bool isValidRange() const { return Length > 0; }
+ bool containsAddress(uint64_t Address) const {
+ return LowPC <= Address && Address < HighPC();
+ }
- static bool SortedOverlapCheck(const Range &curr_range,
- const Range &next_range, uint32_t n) {
- if (curr_range.Offset != next_range.Offset)
- return false;
- return curr_range.HiPC() + n >= next_range.LoPC;
+ bool operator <(const Range &other) const {
+ return LowPC < other.LowPC;
}
- bool contains(const Range &range) const {
- return LoPC <= range.LoPC && range.HiPC() <= HiPC();
+ static bool SortedOverlapCheck(const Range &Left, const Range &Right) {
+ if (Left.CUOffset != Right.CUOffset)
+ return false;
+ return Left.HighPC() >= Right.LowPC;
}
- void dump(raw_ostream &OS) const;
- uint64_t LoPC; // Start of address range
- uint32_t Length; // End of address range (not including this address)
- uint32_t Offset; // Offset of the compile unit or die
+ uint64_t LowPC; // Start of address range.
+ uint32_t Length; // Length of the address range (the end address is not included).
+ uint32_t CUOffset; // Offset of the compile unit or die.
};
- void clear() {
- Aranges.clear();
- ParsedCUOffsets.clear();
- }
- bool allRangesAreContiguous(uint64_t& LoPC, uint64_t& HiPC) const;
- bool getMaxRange(uint64_t& LoPC, uint64_t& HiPC) const;
- bool extract(DataExtractor debug_aranges_data);
- bool generate(DWARFContext *ctx);
-
- // Use append range multiple times and then call sort
- void appendRange(uint32_t cu_offset, uint64_t low_pc, uint64_t high_pc);
- void sort(bool minimize, uint32_t n);
-
- const Range *rangeAtIndex(uint32_t idx) const {
- if (idx < Aranges.size())
- return &Aranges[idx];
- return NULL;
- }
- void dump(raw_ostream &OS) const;
- uint32_t findAddress(uint64_t address) const;
- bool isEmpty() const { return Aranges.empty(); }
- uint32_t getNumRanges() const { return Aranges.size(); }
-
- uint32_t offsetAtIndex(uint32_t idx) const {
- if (idx < Aranges.size())
- return Aranges[idx].Offset;
- return -1U;
- }
-
typedef std::vector<Range> RangeColl;
typedef RangeColl::const_iterator RangeCollIterator;
typedef DenseSet<uint32_t> ParsedCUOffsetColl;
-private:
RangeColl Aranges;
ParsedCUOffsetColl ParsedCUOffsets;
};
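The reworked Range type (now private to DWARFDebugAranges) models a half-open interval: Length stores HighPC - LowPC, containsAddress() tests LowPC <= Addr < HighPC(), and SortedOverlapCheck() only merges neighbours that come from the same CU and touch or overlap. A standalone sketch mirroring that arithmetic (not the private struct itself), for illustration only:

    #include <cstdint>

    struct Interval {
      uint64_t LowPC;
      uint32_t Length;                       // HighPC - LowPC
      uint64_t highPC() const { return LowPC + Length; }
      bool contains(uint64_t A) const { return LowPC <= A && A < highPC(); }
    };

    // Two sorted intervals from the same CU merge when Left.highPC() >= Right.LowPC,
    // e.g. [0x1000, 0x1040) and [0x1040, 0x1080) collapse into [0x1000, 0x1080).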
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 0c7b7e3..babfd2e 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -19,11 +19,10 @@
using namespace llvm;
using namespace dwarf;
-void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
- const DWARFCompileUnit *cu,
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, const DWARFUnit *u,
unsigned recurseDepth,
unsigned indent) const {
- DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+ DataExtractor debug_info_data = u->getDebugInfoExtractor();
uint32_t offset = Offset;
if (debug_info_data.isValidOffset(offset)) {
@@ -45,13 +44,13 @@ void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
for (uint32_t i = 0; i != numAttributes; ++i) {
uint16_t attr = AbbrevDecl->getAttrByIndex(i);
uint16_t form = AbbrevDecl->getFormByIndex(i);
- dumpAttribute(OS, cu, &offset, attr, form, indent);
+ dumpAttribute(OS, u, &offset, attr, form, indent);
}
const DWARFDebugInfoEntryMinimal *child = getFirstChild();
if (recurseDepth > 0 && child) {
while (child) {
- child->dump(OS, cu, recurseDepth-1, indent+2);
+ child->dump(OS, u, recurseDepth-1, indent+2);
child = child->getSibling();
}
}
@@ -66,12 +65,11 @@ void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
}
void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
- const DWARFCompileUnit *cu,
- uint32_t* offset_ptr,
- uint16_t attr,
- uint16_t form,
+ const DWARFUnit *u,
+ uint32_t *offset_ptr,
+ uint16_t attr, uint16_t form,
unsigned indent) const {
- OS << format("0x%8.8x: ", *offset_ptr);
+ OS << " ";
OS.indent(indent+2);
const char *attrString = AttributeString(attr);
if (attrString)
@@ -86,57 +84,20 @@ void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
DWARFFormValue formValue(form);
- if (!formValue.extractValue(cu->getDebugInfoExtractor(), offset_ptr, cu))
+ if (!formValue.extractValue(u->getDebugInfoExtractor(), offset_ptr, u))
return;
OS << "\t(";
- formValue.dump(OS, cu);
+ formValue.dump(OS, u);
OS << ")\n";
}
-bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *CU,
- const uint8_t *FixedFormSizes,
+bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFUnit *U,
uint32_t *OffsetPtr) {
Offset = *OffsetPtr;
- DataExtractor DebugInfoData = CU->getDebugInfoExtractor();
- uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
- if (0 == AbbrCode) {
- // NULL debug tag entry.
- AbbrevDecl = NULL;
- return true;
- }
- AbbrevDecl = CU->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
- assert(AbbrevDecl);
- assert(FixedFormSizes); // For best performance this should be specified!
-
- // Skip all data in the .debug_info for the attributes
- for (uint32_t i = 0, n = AbbrevDecl->getNumAttributes(); i < n; ++i) {
- uint16_t Form = AbbrevDecl->getFormByIndex(i);
-
- // FIXME: Currently we're checking if this is less than the last
- // entry in the fixed_form_sizes table, but this should be changed
- // to use dynamic dispatch.
- uint8_t FixedFormSize =
- (Form < DW_FORM_ref_sig8) ? FixedFormSizes[Form] : 0;
- if (FixedFormSize)
- *OffsetPtr += FixedFormSize;
- else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr,
- CU)) {
- // Restore the original offset.
- *OffsetPtr = Offset;
- return false;
- }
- }
- return true;
-}
-
-bool
-DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *CU,
- uint32_t *OffsetPtr) {
- DataExtractor DebugInfoData = CU->getDebugInfoExtractor();
- const uint32_t CUEndOffset = CU->getNextCompileUnitOffset();
- Offset = *OffsetPtr;
- if ((Offset >= CUEndOffset) || !DebugInfoData.isValidOffset(Offset))
+ DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+ uint32_t UEndOffset = U->getNextUnitOffset();
+ if (Offset >= UEndOffset || !DebugInfoData.isValidOffset(Offset))
return false;
uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
if (0 == AbbrCode) {
@@ -144,31 +105,25 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *CU,
AbbrevDecl = NULL;
return true;
}
- AbbrevDecl = CU->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
+ AbbrevDecl = U->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
if (0 == AbbrevDecl) {
// Restore the original offset.
*OffsetPtr = Offset;
return false;
}
- bool IsCompileUnitTag = (AbbrevDecl->getTag() == DW_TAG_compile_unit);
- if (IsCompileUnitTag)
- const_cast<DWARFCompileUnit*>(CU)->setBaseAddress(0);
+ ArrayRef<uint8_t> FixedFormSizes = DWARFFormValue::getFixedFormSizes(
+ U->getAddressByteSize(), U->getVersion());
+ assert(FixedFormSizes.size() > 0);
// Skip all data in the .debug_info for the attributes
for (uint32_t i = 0, n = AbbrevDecl->getNumAttributes(); i < n; ++i) {
- uint16_t Attr = AbbrevDecl->getAttrByIndex(i);
uint16_t Form = AbbrevDecl->getFormByIndex(i);
- if (IsCompileUnitTag &&
- ((Attr == DW_AT_entry_pc) || (Attr == DW_AT_low_pc))) {
- DWARFFormValue FormValue(Form);
- if (FormValue.extractValue(DebugInfoData, OffsetPtr, CU)) {
- if (Attr == DW_AT_low_pc || Attr == DW_AT_entry_pc)
- const_cast<DWARFCompileUnit*>(CU)
- ->setBaseAddress(FormValue.getUnsigned());
- }
- } else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr,
- CU)) {
+ uint8_t FixedFormSize =
+ (Form < FixedFormSizes.size()) ? FixedFormSizes[Form] : 0;
+ if (FixedFormSize)
+ *OffsetPtr += FixedFormSize;
+ else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, U)) {
// Restore the original offset.
*OffsetPtr = Offset;
return false;
@@ -187,190 +142,179 @@ bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
Tag == DW_TAG_inlined_subroutine;
}
-uint32_t
-DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
- const uint16_t attr,
- DWARFFormValue &form_value,
- uint32_t *end_attr_offset_ptr)
- const {
- if (AbbrevDecl) {
- uint32_t attr_idx = AbbrevDecl->findAttributeIndex(attr);
-
- if (attr_idx != -1U) {
- uint32_t offset = getOffset();
+bool DWARFDebugInfoEntryMinimal::getAttributeValue(
+ const DWARFUnit *U, const uint16_t Attr, DWARFFormValue &FormValue) const {
+ if (!AbbrevDecl)
+ return false;
- DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+ uint32_t AttrIdx = AbbrevDecl->findAttributeIndex(Attr);
+ if (AttrIdx == -1U)
+ return false;
- // Skip the abbreviation code so we are at the data for the attributes
- debug_info_data.getULEB128(&offset);
+ DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+ uint32_t DebugInfoOffset = getOffset();
- uint32_t idx = 0;
- while (idx < attr_idx)
- DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(idx++),
- debug_info_data, &offset, cu);
+ // Skip the abbreviation code so we are at the data for the attributes
+ DebugInfoData.getULEB128(&DebugInfoOffset);
- const uint32_t attr_offset = offset;
- form_value = DWARFFormValue(AbbrevDecl->getFormByIndex(idx));
- if (form_value.extractValue(debug_info_data, &offset, cu)) {
- if (end_attr_offset_ptr)
- *end_attr_offset_ptr = offset;
- return attr_offset;
- }
- }
+ // Skip preceding attribute values.
+ for (uint32_t i = 0; i < AttrIdx; ++i) {
+ DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(i),
+ DebugInfoData, &DebugInfoOffset, U);
}
- return 0;
+ FormValue = DWARFFormValue(AbbrevDecl->getFormByIndex(AttrIdx));
+ return FormValue.extractValue(DebugInfoData, &DebugInfoOffset, U);
}
-const char*
-DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- const char* fail_value)
- const {
- DWARFFormValue form_value;
- if (getAttributeValue(cu, attr, form_value)) {
- DataExtractor stringExtractor(cu->getStringSection(), false, 0);
- return form_value.getAsCString(&stringExtractor);
- }
- return fail_value;
+const char *DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
+ const DWARFUnit *U, const uint16_t Attr, const char *FailValue) const {
+ DWARFFormValue FormValue;
+ if (!getAttributeValue(U, Attr, FormValue))
+ return FailValue;
+ Optional<const char *> Result = FormValue.getAsCString(U);
+ return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsAddress(
+ const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+ DWARFFormValue FormValue;
+ if (!getAttributeValue(U, Attr, FormValue))
+ return FailValue;
+ Optional<uint64_t> Result = FormValue.getAsAddress(U);
+ return Result.hasValue() ? Result.getValue() : FailValue;
}
-uint64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- uint64_t fail_value) const {
- DWARFFormValue form_value;
- if (getAttributeValue(cu, attr, form_value))
- return form_value.getUnsigned();
- return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsignedConstant(
+ const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+ DWARFFormValue FormValue;
+ if (!getAttributeValue(U, Attr, FormValue))
+ return FailValue;
+ Optional<uint64_t> Result = FormValue.getAsUnsignedConstant();
+ return Result.hasValue() ? Result.getValue() : FailValue;
}
-int64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- int64_t fail_value) const {
- DWARFFormValue form_value;
- if (getAttributeValue(cu, attr, form_value))
- return form_value.getSigned();
- return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
+ const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+ DWARFFormValue FormValue;
+ if (!getAttributeValue(U, Attr, FormValue))
+ return FailValue;
+ Optional<uint64_t> Result = FormValue.getAsReference(U);
+ return Result.hasValue() ? Result.getValue() : FailValue;
}
-uint64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- uint64_t fail_value)
- const {
- DWARFFormValue form_value;
- if (getAttributeValue(cu, attr, form_value))
- return form_value.getReference(cu);
- return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset(
+ const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+ DWARFFormValue FormValue;
+ if (!getAttributeValue(U, Attr, FormValue))
+ return FailValue;
+ Optional<uint64_t> Result = FormValue.getAsSectionOffset();
+ return Result.hasValue() ? Result.getValue() : FailValue;
}
-bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFCompileUnit *CU,
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
uint64_t &LowPC,
uint64_t &HighPC) const {
- HighPC = -1ULL;
- LowPC = getAttributeValueAsUnsigned(CU, DW_AT_low_pc, -1ULL);
- if (LowPC != -1ULL)
- HighPC = getAttributeValueAsUnsigned(CU, DW_AT_high_pc, -1ULL);
+ LowPC = getAttributeValueAsAddress(U, DW_AT_low_pc, -1ULL);
+ if (LowPC == -1ULL)
+ return false;
+ HighPC = getAttributeValueAsAddress(U, DW_AT_high_pc, -1ULL);
+ if (HighPC == -1ULL) {
+ // Since DWARF4, DW_AT_high_pc may also be of class constant, in which case
+ // it represents function size.
+ HighPC = getAttributeValueAsUnsignedConstant(U, DW_AT_high_pc, -1ULL);
+ if (HighPC != -1ULL)
+ HighPC += LowPC;
+ }
return (HighPC != -1ULL);
}
-void
-DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *CU,
- DWARFDebugAranges *DebugAranges)
- const {
+void DWARFDebugInfoEntryMinimal::buildAddressRangeTable(
+ const DWARFUnit *U, DWARFDebugAranges *DebugAranges,
+ uint32_t UOffsetInAranges) const {
if (AbbrevDecl) {
if (isSubprogramDIE()) {
uint64_t LowPC, HighPC;
- if (getLowAndHighPC(CU, LowPC, HighPC)) {
- DebugAranges->appendRange(CU->getOffset(), LowPC, HighPC);
- }
+ if (getLowAndHighPC(U, LowPC, HighPC))
+ DebugAranges->appendRange(UOffsetInAranges, LowPC, HighPC);
// FIXME: try to append ranges from .debug_ranges section.
}
- const DWARFDebugInfoEntryMinimal *child = getFirstChild();
- while (child) {
- child->buildAddressRangeTable(CU, DebugAranges);
- child = child->getSibling();
+ const DWARFDebugInfoEntryMinimal *Child = getFirstChild();
+ while (Child) {
+ Child->buildAddressRangeTable(U, DebugAranges, UOffsetInAranges);
+ Child = Child->getSibling();
}
}
}
-bool
-DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
- const DWARFCompileUnit *CU,
- const uint64_t Address)
- const {
+bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
+ const DWARFUnit *U, const uint64_t Address) const {
if (isNULL())
return false;
uint64_t LowPC, HighPC;
- if (getLowAndHighPC(CU, LowPC, HighPC))
+ if (getLowAndHighPC(U, LowPC, HighPC))
return (LowPC <= Address && Address <= HighPC);
// Try to get address ranges from .debug_ranges section.
- uint32_t RangesOffset = getAttributeValueAsReference(CU, DW_AT_ranges, -1U);
+ uint32_t RangesOffset =
+ getAttributeValueAsSectionOffset(U, DW_AT_ranges, -1U);
if (RangesOffset != -1U) {
DWARFDebugRangeList RangeList;
- if (CU->extractRangeList(RangesOffset, RangeList))
- return RangeList.containsAddress(CU->getBaseAddress(), Address);
+ if (U->extractRangeList(RangesOffset, RangeList))
+ return RangeList.containsAddress(U->getBaseAddress(), Address);
}
return false;
}
-const char*
-DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFCompileUnit *CU)
- const {
+const char *
+DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U) const {
if (!isSubroutineDIE())
return 0;
// Try to get mangled name if possible.
if (const char *name =
- getAttributeValueAsString(CU, DW_AT_MIPS_linkage_name, 0))
+ getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, 0))
return name;
- if (const char *name = getAttributeValueAsString(CU, DW_AT_linkage_name, 0))
+ if (const char *name = getAttributeValueAsString(U, DW_AT_linkage_name, 0))
return name;
- if (const char *name = getAttributeValueAsString(CU, DW_AT_name, 0))
+ if (const char *name = getAttributeValueAsString(U, DW_AT_name, 0))
return name;
// Try to get name from specification DIE.
uint32_t spec_ref =
- getAttributeValueAsReference(CU, DW_AT_specification, -1U);
+ getAttributeValueAsReference(U, DW_AT_specification, -1U);
if (spec_ref != -1U) {
DWARFDebugInfoEntryMinimal spec_die;
- if (spec_die.extract(CU, &spec_ref)) {
- if (const char *name = spec_die.getSubroutineName(CU))
+ if (spec_die.extractFast(U, &spec_ref)) {
+ if (const char *name = spec_die.getSubroutineName(U))
return name;
}
}
// Try to get name from abstract origin DIE.
uint32_t abs_origin_ref =
- getAttributeValueAsReference(CU, DW_AT_abstract_origin, -1U);
+ getAttributeValueAsReference(U, DW_AT_abstract_origin, -1U);
if (abs_origin_ref != -1U) {
DWARFDebugInfoEntryMinimal abs_origin_die;
- if (abs_origin_die.extract(CU, &abs_origin_ref)) {
- if (const char *name = abs_origin_die.getSubroutineName(CU))
+ if (abs_origin_die.extractFast(U, &abs_origin_ref)) {
+ if (const char *name = abs_origin_die.getSubroutineName(U))
return name;
}
}
return 0;
}
-void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFCompileUnit *CU,
+void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFUnit *U,
uint32_t &CallFile,
uint32_t &CallLine,
uint32_t &CallColumn) const {
- CallFile = getAttributeValueAsUnsigned(CU, DW_AT_call_file, 0);
- CallLine = getAttributeValueAsUnsigned(CU, DW_AT_call_line, 0);
- CallColumn = getAttributeValueAsUnsigned(CU, DW_AT_call_column, 0);
+ CallFile = getAttributeValueAsUnsignedConstant(U, DW_AT_call_file, 0);
+ CallLine = getAttributeValueAsUnsignedConstant(U, DW_AT_call_line, 0);
+ CallColumn = getAttributeValueAsUnsignedConstant(U, DW_AT_call_column, 0);
}
DWARFDebugInfoEntryInlinedChain
DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
- const DWARFCompileUnit *CU, const uint64_t Address) const {
+ const DWARFUnit *U, const uint64_t Address) const {
DWARFDebugInfoEntryInlinedChain InlinedChain;
- InlinedChain.CU = CU;
+ InlinedChain.U = U;
if (isNULL())
return InlinedChain;
for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
@@ -382,7 +326,7 @@ DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
// Try to get child which also contains provided address.
const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
while (Child) {
- if (Child->addressRangeContainsAddress(CU, Address)) {
+ if (Child->addressRangeContainsAddress(U, Address)) {
// Assume there is only one such child.
break;
}
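One behavioural change in DWARFDebugInfoEntry.cpp deserves a note: since DWARF 4, DW_AT_high_pc may be encoded either as an address or as a constant offset from DW_AT_low_pc, and the rewritten getLowAndHighPC() tries the address form first, then falls back to the constant form and adds LowPC to it. A usage sketch, assuming DIE and U refer to an already-extracted DIE and its unit:

    uint64_t LowPC, HighPC;
    if (DIE->getLowAndHighPC(U, LowPC, HighPC)) {
      // HighPC is always an absolute address here; if the producer emitted
      // DW_AT_high_pc as a constant, LowPC was already added to it above.
    }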
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index a69911f..aa61056 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -18,6 +18,7 @@ namespace llvm {
class DWARFDebugAranges;
class DWARFCompileUnit;
+class DWARFUnit;
class DWARFContext;
class DWARFFormValue;
struct DWARFDebugInfoEntryInlinedChain;
@@ -39,23 +40,15 @@ public:
DWARFDebugInfoEntryMinimal()
: Offset(0), ParentIdx(0), SiblingIdx(0), AbbrevDecl(0) {}
- void dump(raw_ostream &OS, const DWARFCompileUnit *cu,
- unsigned recurseDepth, unsigned indent = 0) const;
- void dumpAttribute(raw_ostream &OS, const DWARFCompileUnit *cu,
- uint32_t *offset_ptr, uint16_t attr, uint16_t form,
- unsigned indent = 0) const;
+ void dump(raw_ostream &OS, const DWARFUnit *u, unsigned recurseDepth,
+ unsigned indent = 0) const;
+ void dumpAttribute(raw_ostream &OS, const DWARFUnit *u, uint32_t *offset_ptr,
+ uint16_t attr, uint16_t form, unsigned indent = 0) const;
- /// Extracts a debug info entry, which is a child of a given compile unit,
+ /// Extracts a debug info entry, which is a child of a given unit,
/// starting at a given offset. If DIE can't be extracted, returns false and
/// doesn't change OffsetPtr.
- bool extractFast(const DWARFCompileUnit *CU, const uint8_t *FixedFormSizes,
- uint32_t *OffsetPtr);
-
- /// Extract a debug info entry for a given compile unit from the
- /// .debug_info and .debug_abbrev data starting at the given offset.
- /// If compile unit can't be parsed, returns false and doesn't change
- /// OffsetPtr.
- bool extract(const DWARFCompileUnit *CU, uint32_t *OffsetPtr);
+ bool extractFast(const DWARFUnit *U, uint32_t *OffsetPtr);
uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
bool isNULL() const { return AbbrevDecl == 0; }
@@ -120,54 +113,54 @@ public:
return AbbrevDecl;
}
- uint32_t getAttributeValue(const DWARFCompileUnit *cu,
- const uint16_t attr, DWARFFormValue &formValue,
- uint32_t *end_attr_offset_ptr = 0) const;
+ bool getAttributeValue(const DWARFUnit *U, const uint16_t Attr,
+ DWARFFormValue &FormValue) const;
+
+ const char *getAttributeValueAsString(const DWARFUnit *U, const uint16_t Attr,
+ const char *FailValue) const;
- const char* getAttributeValueAsString(const DWARFCompileUnit* cu,
- const uint16_t attr,
- const char *fail_value) const;
+ uint64_t getAttributeValueAsAddress(const DWARFUnit *U, const uint16_t Attr,
+ uint64_t FailValue) const;
- uint64_t getAttributeValueAsUnsigned(const DWARFCompileUnit *cu,
- const uint16_t attr,
- uint64_t fail_value) const;
+ uint64_t getAttributeValueAsUnsignedConstant(const DWARFUnit *U,
+ const uint16_t Attr,
+ uint64_t FailValue) const;
- uint64_t getAttributeValueAsReference(const DWARFCompileUnit *cu,
- const uint16_t attr,
- uint64_t fail_value) const;
+ uint64_t getAttributeValueAsReference(const DWARFUnit *U, const uint16_t Attr,
+ uint64_t FailValue) const;
- int64_t getAttributeValueAsSigned(const DWARFCompileUnit* cu,
- const uint16_t attr,
- int64_t fail_value) const;
+ uint64_t getAttributeValueAsSectionOffset(const DWARFUnit *U,
+ const uint16_t Attr,
+ uint64_t FailValue) const;
/// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
/// Returns true if both attributes are present.
- bool getLowAndHighPC(const DWARFCompileUnit *CU,
- uint64_t &LowPC, uint64_t &HighPC) const;
+ bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC,
+ uint64_t &HighPC) const;
- void buildAddressRangeTable(const DWARFCompileUnit *CU,
- DWARFDebugAranges *DebugAranges) const;
+ void buildAddressRangeTable(const DWARFUnit *U,
+ DWARFDebugAranges *DebugAranges,
+ uint32_t CUOffsetInAranges) const;
- bool addressRangeContainsAddress(const DWARFCompileUnit *CU,
+ bool addressRangeContainsAddress(const DWARFUnit *U,
const uint64_t Address) const;
/// If a DIE represents a subprogram (or inlined subroutine),
/// returns its mangled name (or short name, if mangled is missing).
/// This name may be fetched from specification or abstract origin
/// for this subprogram. Returns null if no name is found.
- const char* getSubroutineName(const DWARFCompileUnit *CU) const;
+ const char *getSubroutineName(const DWARFUnit *U) const;
/// Retrieves values of DW_AT_call_file, DW_AT_call_line and
/// DW_AT_call_column from DIE (or zeroes if they are missing).
- void getCallerFrame(const DWARFCompileUnit *CU, uint32_t &CallFile,
+ void getCallerFrame(const DWARFUnit *U, uint32_t &CallFile,
uint32_t &CallLine, uint32_t &CallColumn) const;
/// Get inlined chain for a given address, rooted at the current DIE.
/// Returns empty chain if address is not contained in address range
/// of current DIE.
DWARFDebugInfoEntryInlinedChain
- getInlinedChainForAddress(const DWARFCompileUnit *CU,
- const uint64_t Address) const;
+ getInlinedChainForAddress(const DWARFUnit *U, const uint64_t Address) const;
};
/// DWARFDebugInfoEntryInlinedChain - represents a chain of inlined_subroutine
@@ -176,9 +169,9 @@ public:
/// (except the last DIE) in this chain is contained in address
/// range for next DIE in the chain.
struct DWARFDebugInfoEntryInlinedChain {
- DWARFDebugInfoEntryInlinedChain() : CU(0) {}
+ DWARFDebugInfoEntryInlinedChain() : U(0) {}
SmallVector<DWARFDebugInfoEntryMinimal, 4> DIEs;
- const DWARFCompileUnit *CU;
+ const DWARFUnit *U;
};
}
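On the header side, the old getAttributeValueAsUnsigned/getAttributeValueAsSigned pair gives way to form-class-aware accessors, each taking an explicit fail value that is returned when the attribute is absent or has an unexpected form class. A sketch of typical call sites, assuming Die and U are valid; DW_AT_decl_line is just an illustrative attribute:

    uint64_t Low  = Die.getAttributeValueAsAddress(U, dwarf::DW_AT_low_pc, -1ULL);
    uint64_t Line = Die.getAttributeValueAsUnsignedConstant(U, dwarf::DW_AT_decl_line, 0);
    uint64_t Spec = Die.getAttributeValueAsReference(U, dwarf::DW_AT_specification, -1ULL);
    uint64_t Rng  = Die.getAttributeValueAsSectionOffset(U, dwarf::DW_AT_ranges, -1ULL);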
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index 192381c..13d09dd 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -211,7 +211,7 @@ DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
if (*offset_ptr != end_prologue_offset) {
fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
- " have ended at 0x%8.8x but it ended ad 0x%8.8x\n",
+ " have ended at 0x%8.8x but it ended at 0x%8.8x\n",
prologue_offset, end_prologue_offset, *offset_ptr);
return false;
}
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index 1a1cf24..da71fb3 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -10,6 +10,8 @@
#include "llvm/DebugInfo/DWARFFormValue.h"
#include "DWARFCompileUnit.h"
#include "DWARFContext.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
@@ -19,64 +21,114 @@ using namespace llvm;
using namespace dwarf;
namespace {
-template <uint8_t AddrSize, uint8_t RefAddrSize> struct FixedFormSizes {
- static const uint8_t sizes[];
-};
+uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
+ // FIXME: Support DWARF64.
+ return (Version == 2) ? AddrSize : 4;
}
template <uint8_t AddrSize, uint8_t RefAddrSize>
-const uint8_t FixedFormSizes<AddrSize, RefAddrSize>::sizes[] = {
- 0, // 0x00 unused
- AddrSize, // 0x01 DW_FORM_addr
- 0, // 0x02 unused
- 0, // 0x03 DW_FORM_block2
- 0, // 0x04 DW_FORM_block4
- 2, // 0x05 DW_FORM_data2
- 4, // 0x06 DW_FORM_data4
- 8, // 0x07 DW_FORM_data8
- 0, // 0x08 DW_FORM_string
- 0, // 0x09 DW_FORM_block
- 0, // 0x0a DW_FORM_block1
- 1, // 0x0b DW_FORM_data1
- 1, // 0x0c DW_FORM_flag
- 0, // 0x0d DW_FORM_sdata
- 4, // 0x0e DW_FORM_strp
- 0, // 0x0f DW_FORM_udata
- RefAddrSize, // 0x10 DW_FORM_ref_addr
- 1, // 0x11 DW_FORM_ref1
- 2, // 0x12 DW_FORM_ref2
- 4, // 0x13 DW_FORM_ref4
- 8, // 0x14 DW_FORM_ref8
- 0, // 0x15 DW_FORM_ref_udata
- 0, // 0x16 DW_FORM_indirect
- 4, // 0x17 DW_FORM_sec_offset
- 0, // 0x18 DW_FORM_exprloc
- 0, // 0x19 DW_FORM_flag_present
- 8, // 0x20 DW_FORM_ref_sig8
-};
-
-static uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
- // FIXME: Support DWARF64.
- return (Version == 2) ? AddrSize : 4;
+ArrayRef<uint8_t> makeFixedFormSizesArrayRef() {
+ static const uint8_t sizes[] = {
+ 0, // 0x00 unused
+ AddrSize, // 0x01 DW_FORM_addr
+ 0, // 0x02 unused
+ 0, // 0x03 DW_FORM_block2
+ 0, // 0x04 DW_FORM_block4
+ 2, // 0x05 DW_FORM_data2
+ 4, // 0x06 DW_FORM_data4
+ 8, // 0x07 DW_FORM_data8
+ 0, // 0x08 DW_FORM_string
+ 0, // 0x09 DW_FORM_block
+ 0, // 0x0a DW_FORM_block1
+ 1, // 0x0b DW_FORM_data1
+ 1, // 0x0c DW_FORM_flag
+ 0, // 0x0d DW_FORM_sdata
+ 4, // 0x0e DW_FORM_strp
+ 0, // 0x0f DW_FORM_udata
+ RefAddrSize, // 0x10 DW_FORM_ref_addr
+ 1, // 0x11 DW_FORM_ref1
+ 2, // 0x12 DW_FORM_ref2
+ 4, // 0x13 DW_FORM_ref4
+ 8, // 0x14 DW_FORM_ref8
+ 0, // 0x15 DW_FORM_ref_udata
+ 0, // 0x16 DW_FORM_indirect
+ 4, // 0x17 DW_FORM_sec_offset
+ 0, // 0x18 DW_FORM_exprloc
+ 0, // 0x19 DW_FORM_flag_present
+ };
+ return makeArrayRef(sizes);
+}
}
-const uint8_t *
-DWARFFormValue::getFixedFormSizes(uint8_t AddrSize, uint16_t Version) {
+ArrayRef<uint8_t> DWARFFormValue::getFixedFormSizes(uint8_t AddrSize,
+ uint16_t Version) {
uint8_t RefAddrSize = getRefAddrSize(AddrSize, Version);
if (AddrSize == 4 && RefAddrSize == 4)
- return FixedFormSizes<4, 4>::sizes;
+ return makeFixedFormSizesArrayRef<4, 4>();
if (AddrSize == 4 && RefAddrSize == 8)
- return FixedFormSizes<4, 8>::sizes;
+ return makeFixedFormSizesArrayRef<4, 8>();
if (AddrSize == 8 && RefAddrSize == 4)
- return FixedFormSizes<8, 4>::sizes;
+ return makeFixedFormSizesArrayRef<8, 4>();
if (AddrSize == 8 && RefAddrSize == 8)
- return FixedFormSizes<8, 8>::sizes;
- return 0;
+ return makeFixedFormSizesArrayRef<8, 8>();
+ return None;
}
-bool
-DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
- const DWARFCompileUnit *cu) {
+static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
+ DWARFFormValue::FC_Unknown, // 0x0
+ DWARFFormValue::FC_Address, // 0x01 DW_FORM_addr
+ DWARFFormValue::FC_Unknown, // 0x02 unused
+ DWARFFormValue::FC_Block, // 0x03 DW_FORM_block2
+ DWARFFormValue::FC_Block, // 0x04 DW_FORM_block4
+ DWARFFormValue::FC_Constant, // 0x05 DW_FORM_data2
+ // --- These can be FC_SectionOffset in DWARF3 and below:
+ DWARFFormValue::FC_Constant, // 0x06 DW_FORM_data4
+ DWARFFormValue::FC_Constant, // 0x07 DW_FORM_data8
+ // ---
+ DWARFFormValue::FC_String, // 0x08 DW_FORM_string
+ DWARFFormValue::FC_Block, // 0x09 DW_FORM_block
+ DWARFFormValue::FC_Block, // 0x0a DW_FORM_block1
+ DWARFFormValue::FC_Constant, // 0x0b DW_FORM_data1
+ DWARFFormValue::FC_Flag, // 0x0c DW_FORM_flag
+ DWARFFormValue::FC_Constant, // 0x0d DW_FORM_sdata
+ DWARFFormValue::FC_String, // 0x0e DW_FORM_strp
+ DWARFFormValue::FC_Constant, // 0x0f DW_FORM_udata
+ DWARFFormValue::FC_Reference, // 0x10 DW_FORM_ref_addr
+ DWARFFormValue::FC_Reference, // 0x11 DW_FORM_ref1
+ DWARFFormValue::FC_Reference, // 0x12 DW_FORM_ref2
+ DWARFFormValue::FC_Reference, // 0x13 DW_FORM_ref4
+ DWARFFormValue::FC_Reference, // 0x14 DW_FORM_ref8
+ DWARFFormValue::FC_Reference, // 0x15 DW_FORM_ref_udata
+ DWARFFormValue::FC_Indirect, // 0x16 DW_FORM_indirect
+ DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
+ DWARFFormValue::FC_Exprloc, // 0x18 DW_FORM_exprloc
+ DWARFFormValue::FC_Flag, // 0x19 DW_FORM_flag_present
+};
+
+bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
+ // First, check DWARF4 form classes.
+ if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() &&
+ DWARF4FormClasses[Form] == FC)
+ return true;
+ // Check DW_FORM_ref_sig8 from DWARF4.
+ if (Form == DW_FORM_ref_sig8)
+ return (FC == FC_Reference);
+ // Check for some DWARF5 forms.
+ if (Form == DW_FORM_GNU_addr_index)
+ return (FC == FC_Address);
+ if (Form == DW_FORM_GNU_str_index)
+ return (FC == FC_String);
+ // In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
+ // Don't check for DWARF version here, as some producers may still do this
+ // by mistake.
+ if ((Form == DW_FORM_data4 || Form == DW_FORM_data8) &&
+ FC == FC_SectionOffset)
+ return true;
+ return false;
+}
+
+bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
+ const DWARFUnit *cu) {
bool indirect = false;
bool is_block = false;
Value.data = NULL;
@@ -156,10 +208,6 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
break;
case DW_FORM_string:
Value.cstr = data.getCStr(offset_ptr);
- // Set the string value to also be the data for inlined cstr form
- // values only so we can tell the differnence between DW_FORM_string
- // and DW_FORM_strp form values
- Value.data = (const uint8_t*)Value.cstr;
break;
case DW_FORM_indirect:
Form = data.getULEB128(offset_ptr);
@@ -183,8 +231,6 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
Value.uval = data.getU64(offset_ptr);
break;
case DW_FORM_GNU_addr_index:
- Value.uval = data.getULEB128(offset_ptr);
- break;
case DW_FORM_GNU_str_index:
Value.uval = data.getULEB128(offset_ptr);
break;
@@ -207,13 +253,13 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
bool
DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
- const DWARFCompileUnit *cu) const {
+ const DWARFUnit *cu) const {
return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
}
bool
DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
- uint32_t *offset_ptr, const DWARFCompileUnit *cu) {
+ uint32_t *offset_ptr, const DWARFUnit *cu) {
bool indirect = false;
do {
switch (form) {
@@ -313,21 +359,20 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
}
void
-DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
+DWARFFormValue::dump(raw_ostream &OS, const DWARFUnit *cu) const {
DataExtractor debug_str_data(cu->getStringSection(), true, 0);
DataExtractor debug_str_offset_data(cu->getStringOffsetSection(), true, 0);
- uint64_t uvalue = getUnsigned();
+ uint64_t uvalue = Value.uval;
bool cu_relative_offset = false;
switch (Form) {
case DW_FORM_addr: OS << format("0x%016" PRIx64, uvalue); break;
case DW_FORM_GNU_addr_index: {
- StringRef AddrOffsetSec = cu->getAddrOffsetSection();
OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
- if (AddrOffsetSec.size() != 0) {
- DataExtractor DA(AddrOffsetSec, true, cu->getAddressByteSize());
- OS << format("0x%016" PRIx64, getIndirectAddress(&DA, cu));
- } else
+ uint64_t Address;
+ if (cu->getAddrOffsetSectionItem(uvalue, Address))
+ OS << format("0x%016" PRIx64, Address);
+ else
OS << "<no .debug_addr section>";
break;
}
@@ -340,7 +385,7 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
case DW_FORM_data8: OS << format("0x%016" PRIx64, uvalue); break;
case DW_FORM_string:
OS << '"';
- OS.write_escaped(getAsCString(NULL));
+ OS.write_escaped(Value.cstr);
OS << '"';
break;
case DW_FORM_exprloc:
@@ -372,25 +417,24 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
}
break;
- case DW_FORM_sdata: OS << getSigned(); break;
- case DW_FORM_udata: OS << getUnsigned(); break;
+ case DW_FORM_sdata: OS << Value.sval; break;
+ case DW_FORM_udata: OS << Value.uval; break;
case DW_FORM_strp: {
OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
- const char* dbg_str = getAsCString(&debug_str_data);
- if (dbg_str) {
+ Optional<const char *> DbgStr = getAsCString(cu);
+ if (DbgStr.hasValue()) {
OS << '"';
- OS.write_escaped(dbg_str);
+ OS.write_escaped(DbgStr.getValue());
OS << '"';
}
break;
}
case DW_FORM_GNU_str_index: {
OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
- const char *dbg_str = getIndirectCString(&debug_str_data,
- &debug_str_offset_data);
- if (dbg_str) {
+ Optional<const char *> DbgStr = getAsCString(cu);
+ if (DbgStr.hasValue()) {
OS << '"';
- OS.write_escaped(dbg_str);
+ OS.write_escaped(DbgStr.getValue());
OS << '"';
}
break;
@@ -439,97 +483,67 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
OS << format(" => {0x%8.8" PRIx64 "}", uvalue + (cu ? cu->getOffset() : 0));
}
-const char*
-DWARFFormValue::getAsCString(const DataExtractor *debug_str_data_ptr) const {
- if (isInlinedCStr()) {
+Optional<const char *> DWARFFormValue::getAsCString(const DWARFUnit *U) const {
+ if (!isFormClass(FC_String))
+ return None;
+ if (Form == DW_FORM_string)
return Value.cstr;
- } else if (debug_str_data_ptr) {
- uint32_t offset = Value.uval;
- return debug_str_data_ptr->getCStr(&offset);
+ if (U == 0)
+ return None;
+ uint32_t Offset = Value.uval;
+ if (Form == DW_FORM_GNU_str_index) {
+ uint32_t StrOffset;
+ if (!U->getStringOffsetSectionItem(Offset, StrOffset))
+ return None;
+ Offset = StrOffset;
}
- return NULL;
-}
-
-const char*
-DWARFFormValue::getIndirectCString(const DataExtractor *DS,
- const DataExtractor *DSO) const {
- if (!DS || !DSO) return NULL;
-
- uint32_t offset = Value.uval * 4;
- uint32_t soffset = DSO->getU32(&offset);
- return DS->getCStr(&soffset);
-}
-
-uint64_t
-DWARFFormValue::getIndirectAddress(const DataExtractor *DA,
- const DWARFCompileUnit *cu) const {
- if (!DA) return 0;
-
- uint32_t offset = Value.uval * cu->getAddressByteSize();
- return DA->getAddress(&offset);
+ if (const char *Str = U->getStringExtractor().getCStr(&Offset)) {
+ return Str;
+ }
+ return None;
}
-uint64_t DWARFFormValue::getReference(const DWARFCompileUnit *cu) const {
- uint64_t die_offset = Value.uval;
- switch (Form) {
- case DW_FORM_ref1:
- case DW_FORM_ref2:
- case DW_FORM_ref4:
- case DW_FORM_ref8:
- case DW_FORM_ref_udata:
- die_offset += (cu ? cu->getOffset() : 0);
- break;
- default:
- break;
+Optional<uint64_t> DWARFFormValue::getAsAddress(const DWARFUnit *U) const {
+ if (!isFormClass(FC_Address))
+ return None;
+ if (Form == DW_FORM_GNU_addr_index) {
+ uint32_t Index = Value.uval;
+ uint64_t Result;
+ if (U == 0 || !U->getAddrOffsetSectionItem(Index, Result))
+ return None;
+ return Result;
}
-
- return die_offset;
+ return Value.uval;
}
-bool
-DWARFFormValue::resolveCompileUnitReferences(const DWARFCompileUnit *cu) {
+Optional<uint64_t> DWARFFormValue::getAsReference(const DWARFUnit *U) const {
+ if (!isFormClass(FC_Reference))
+ return None;
switch (Form) {
case DW_FORM_ref1:
case DW_FORM_ref2:
case DW_FORM_ref4:
case DW_FORM_ref8:
case DW_FORM_ref_udata:
- Value.uval += cu->getOffset();
- Form = DW_FORM_ref_addr;
- return true;
+ if (U == 0)
+ return None;
+ return Value.uval + U->getOffset();
+ case DW_FORM_ref_addr:
+ return Value.uval;
+ // FIXME: Add proper support for DW_FORM_ref_sig8
default:
- break;
+ return Value.uval;
}
- return false;
}
-const uint8_t *DWARFFormValue::BlockData() const {
- if (!isInlinedCStr())
- return Value.data;
- return NULL;
+Optional<uint64_t> DWARFFormValue::getAsSectionOffset() const {
+ if (!isFormClass(FC_SectionOffset))
+ return None;
+ return Value.uval;
}
-bool DWARFFormValue::isBlockForm(uint16_t form) {
- switch (form) {
- case DW_FORM_exprloc:
- case DW_FORM_block:
- case DW_FORM_block1:
- case DW_FORM_block2:
- case DW_FORM_block4:
- return true;
- }
- return false;
-}
-
-bool DWARFFormValue::isDataForm(uint16_t form) {
- switch (form) {
- case DW_FORM_sdata:
- case DW_FORM_udata:
- case DW_FORM_data1:
- case DW_FORM_data2:
- case DW_FORM_data4:
- case DW_FORM_data8:
- return true;
- }
- return false;
+Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
+ if (!isFormClass(FC_Constant) || Form == DW_FORM_sdata)
+ return None;
+ return Value.uval;
}
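The DWARFFormValue accessors now check the form class up front and report failure through Optional<> instead of sentinel return values, matching the hasValue()/getValue() pattern used by the callers above. A minimal sketch, assuming FV was extracted from unit U and handleDieOffset is a hypothetical caller-side helper:

    Optional<uint64_t> Ref = FV.getAsReference(U);
    if (Ref.hasValue())
      handleDieOffset(Ref.getValue()); // unit-relative forms already have U->getOffset() added
    Optional<const char *> Str = FV.getAsCString(U);
    if (Str.hasValue())
      outs() << Str.getValue() << "\n";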
diff --git a/lib/DebugInfo/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARFTypeUnit.cpp
new file mode 100644
index 0000000..303bf70
--- /dev/null
+++ b/lib/DebugInfo/DWARFTypeUnit.cpp
@@ -0,0 +1,39 @@
+//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFTypeUnit.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
+ uint32_t *offset_ptr) {
+ if (!DWARFUnit::extractImpl(debug_info, offset_ptr))
+ return false;
+ TypeHash = debug_info.getU64(offset_ptr);
+ TypeOffset = debug_info.getU32(offset_ptr);
+ return TypeOffset < getLength();
+}
+
+void DWARFTypeUnit::dump(raw_ostream &OS) {
+ OS << format("0x%08x", getOffset()) << ": Type Unit:"
+ << " length = " << format("0x%08x", getLength())
+ << " version = " << format("0x%04x", getVersion())
+ << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+ << " addr_size = " << format("0x%02x", getAddressByteSize())
+ << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+ << " type_offset = " << format("0x%04x", TypeOffset)
+ << " (next unit at " << format("0x%08x", getNextUnitOffset())
+ << ")\n";
+
+ const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+ assert(CU && "Null Compile Unit?");
+ CU->dump(OS, this, -1U);
+}
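The 12 extra bytes accounted for by DWARFTypeUnit::getSize() are the DWARF 4 type-unit header fields that follow the common unit header, exactly what extractImpl() reads after delegating to DWARFUnit::extractImpl(). For reference, the 32-bit DWARF layout is:

    // .debug_types unit header (32-bit DWARF):
    //   unit_length          4 bytes  \
    //   version              2 bytes   |  common header, DWARFUnit::extractImpl()
    //   debug_abbrev_offset  4 bytes   |
    //   address_size         1 byte   /
    //   type_signature       8 bytes  \   the additional 12 bytes covered by
    //   type_offset          4 bytes  /   DWARFTypeUnit::getSize()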
diff --git a/lib/DebugInfo/DWARFTypeUnit.h b/lib/DebugInfo/DWARFTypeUnit.h
new file mode 100644
index 0000000..7a0dab2
--- /dev/null
+++ b/lib/DebugInfo/DWARFTypeUnit.h
@@ -0,0 +1,35 @@
+//===-- DWARFTypeUnit.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+#define LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+
+#include "DWARFUnit.h"
+
+namespace llvm {
+
+class DWARFTypeUnit : public DWARFUnit {
+private:
+ uint64_t TypeHash;
+ uint32_t TypeOffset;
+public:
+ DWARFTypeUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+ StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+ const RelocAddrMap *M, bool LE)
+ : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
+ uint32_t getSize() const LLVM_OVERRIDE { return DWARFUnit::getSize() + 12; }
+ void dump(raw_ostream &OS);
+protected:
+ bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) LLVM_OVERRIDE;
+};
+
+}
+
+#endif
+
diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
new file mode 100644
index 0000000..5167eb9
--- /dev/null
+++ b/lib/DebugInfo/DWARFUnit.cpp
@@ -0,0 +1,365 @@
+//===-- DWARFUnit.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFUnit.h"
+#include "DWARFContext.h"
+#include "llvm/DebugInfo/DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Path.h"
+#include <cstdio>
+
+using namespace llvm;
+using namespace dwarf;
+
+DWARFUnit::DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+ StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+ const RelocAddrMap *M, bool LE)
+ : Abbrev(DA), InfoSection(IS), AbbrevSection(AS), RangeSection(RS),
+ StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+ RelocMap(M), isLittleEndian(LE) {
+ clear();
+}
+
+DWARFUnit::~DWARFUnit() {
+}
+
+bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
+ uint64_t &Result) const {
+ uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
+ if (AddrOffsetSection.size() < Offset + AddrSize)
+ return false;
+ DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
+ Result = DA.getAddress(&Offset);
+ return true;
+}
+
+bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
+ uint32_t &Result) const {
+ // FIXME: string offset section entries are 8-byte for DWARF64.
+ const uint32_t ItemSize = 4;
+ uint32_t Offset = Index * ItemSize;
+ if (StringOffsetSection.size() < Offset + ItemSize)
+ return false;
+ DataExtractor DA(StringOffsetSection, isLittleEndian, 0);
+ Result = DA.getU32(&Offset);
+ return true;
+}
+
+bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
+ Length = debug_info.getU32(offset_ptr);
+ Version = debug_info.getU16(offset_ptr);
+ uint64_t abbrOffset = debug_info.getU32(offset_ptr);
+ AddrSize = debug_info.getU8(offset_ptr);
+
+ bool lengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
+ bool versionOK = DWARFContext::isSupportedVersion(Version);
+ bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
+ bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+ if (!lengthOK || !versionOK || !addrSizeOK || !abbrOffsetOK)
+ return false;
+
+ Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
+ return true;
+}
+
+bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
+ clear();
+
+ Offset = *offset_ptr;
+
+ if (debug_info.isValidOffset(*offset_ptr)) {
+ if (extractImpl(debug_info, offset_ptr))
+ return true;
+
+ // reset the offset to where we tried to parse from if anything went wrong
+ *offset_ptr = Offset;
+ }
+
+ return false;
+}
+
+bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const {
+ // Require that compile unit is extracted.
+ assert(DieArray.size() > 0);
+ DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
+ uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
+ return RangeList.extract(RangesData, &ActualRangeListOffset);
+}
+
+void DWARFUnit::clear() {
+ Offset = 0;
+ Length = 0;
+ Version = 0;
+ Abbrevs = 0;
+ AddrSize = 0;
+ BaseAddr = 0;
+ RangeSectionBase = 0;
+ AddrOffsetSectionBase = 0;
+ clearDIEs(false);
+ DWO.reset();
+}
+
+const char *DWARFUnit::getCompilationDir() {
+ extractDIEsIfNeeded(true);
+ if (DieArray.empty())
+ return 0;
+ return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+}
+
+uint64_t DWARFUnit::getDWOId() {
+ extractDIEsIfNeeded(true);
+ const uint64_t FailValue = -1ULL;
+ if (DieArray.empty())
+ return FailValue;
+ return DieArray[0]
+ .getAttributeValueAsUnsignedConstant(this, DW_AT_GNU_dwo_id, FailValue);
+}
+
+void DWARFUnit::setDIERelations() {
+ if (DieArray.empty())
+ return;
+ DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
+ DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
+ DWARFDebugInfoEntryMinimal *curr_die;
+  // The loop below purposely skips the last element in the array so that
+  // there is always a valid next item to look at.
+ for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
+ // Since our loop doesn't include the last element, we can always
+ // safely access the next die in the array.
+ DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
+
+ const DWARFAbbreviationDeclaration *curr_die_abbrev =
+ curr_die->getAbbreviationDeclarationPtr();
+
+ if (curr_die_abbrev) {
+ // Normal DIE
+ if (curr_die_abbrev->hasChildren())
+ next_die->setParent(curr_die);
+ else
+ curr_die->setSibling(next_die);
+ } else {
+ // NULL DIE that terminates a sibling chain
+ DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
+ if (parent)
+ parent->setSibling(next_die);
+ }
+ }
+
+ // Since we skipped the last element, we need to fix it up!
+ if (die_array_begin < die_array_end)
+ curr_die->setParent(die_array_begin);
+}
+
+void DWARFUnit::extractDIEsToVector(
+ bool AppendCUDie, bool AppendNonCUDies,
+ std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
+ if (!AppendCUDie && !AppendNonCUDies)
+ return;
+
+ // Set the offset to that of the first DIE and calculate the start of the
+ // next compilation unit header.
+ uint32_t Offset = getFirstDIEOffset();
+ uint32_t NextCUOffset = getNextUnitOffset();
+ DWARFDebugInfoEntryMinimal DIE;
+ uint32_t Depth = 0;
+ bool IsCUDie = true;
+
+ while (Offset < NextCUOffset && DIE.extractFast(this, &Offset)) {
+ if (IsCUDie) {
+ if (AppendCUDie)
+ Dies.push_back(DIE);
+ if (!AppendNonCUDies)
+ break;
+      // The average DIE entry has been observed to take around 14-20 bytes,
+      // so pre-reserve the needed memory for our DIE entries
+      // accordingly.
+ Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
+ IsCUDie = false;
+ } else {
+ Dies.push_back(DIE);
+ }
+
+ const DWARFAbbreviationDeclaration *AbbrDecl =
+ DIE.getAbbreviationDeclarationPtr();
+ if (AbbrDecl) {
+ // Normal DIE
+ if (AbbrDecl->hasChildren())
+ ++Depth;
+ } else {
+ // NULL DIE.
+ if (Depth > 0)
+ --Depth;
+ if (Depth == 0)
+ break; // We are done with this compile unit!
+ }
+ }
+
+ // Give a little bit of info if we encounter corrupt DWARF (our offset
+ // should always terminate at or before the start of the next compilation
+ // unit header).
+ if (Offset > NextCUOffset)
+ fprintf(stderr, "warning: DWARF compile unit extends beyond its "
+                    "bounds cu 0x%8.8x at 0x%8.8x\n", getOffset(), Offset);
+}
+
+size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
+ if ((CUDieOnly && DieArray.size() > 0) ||
+ DieArray.size() > 1)
+ return 0; // Already parsed.
+
+ bool HasCUDie = DieArray.size() > 0;
+ extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
+
+ if (DieArray.empty())
+ return 0;
+
+ // If CU DIE was just parsed, copy several attribute values from it.
+ if (!HasCUDie) {
+ uint64_t BaseAddr =
+ DieArray[0].getAttributeValueAsAddress(this, DW_AT_low_pc, -1ULL);
+ if (BaseAddr == -1ULL)
+ BaseAddr = DieArray[0].getAttributeValueAsAddress(this, DW_AT_entry_pc, 0);
+ setBaseAddress(BaseAddr);
+ AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+ this, DW_AT_GNU_addr_base, 0);
+ RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+ this, DW_AT_GNU_ranges_base, 0);
+ }
+
+ setDIERelations();
+ return DieArray.size();
+}
+
+DWARFUnit::DWOHolder::DWOHolder(object::ObjectFile *DWOFile)
+ : DWOFile(DWOFile),
+ DWOContext(cast<DWARFContext>(DIContext::getDWARFContext(DWOFile))),
+ DWOU(0) {
+ if (DWOContext->getNumDWOCompileUnits() > 0)
+ DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
+}
+
+bool DWARFUnit::parseDWO() {
+ if (DWO.get() != 0)
+ return false;
+ extractDIEsIfNeeded(true);
+ if (DieArray.empty())
+ return false;
+ const char *DWOFileName =
+ DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, 0);
+ if (DWOFileName == 0)
+ return false;
+ const char *CompilationDir =
+ DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+ SmallString<16> AbsolutePath;
+ if (sys::path::is_relative(DWOFileName) && CompilationDir != 0) {
+ sys::path::append(AbsolutePath, CompilationDir);
+ }
+ sys::path::append(AbsolutePath, DWOFileName);
+ object::ObjectFile *DWOFile =
+ object::ObjectFile::createObjectFile(AbsolutePath);
+ if (!DWOFile)
+ return false;
+ // Reset DWOHolder.
+ DWO.reset(new DWOHolder(DWOFile));
+ DWARFUnit *DWOCU = DWO->getUnit();
+ // Verify that compile unit in .dwo file is valid.
+ if (DWOCU == 0 || DWOCU->getDWOId() != getDWOId()) {
+ DWO.reset();
+ return false;
+ }
+ // Share .debug_addr and .debug_ranges section with compile unit in .dwo
+ DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
+ DWOCU->setRangesSection(RangeSection, RangeSectionBase);
+ return true;
+}
+
+void DWARFUnit::clearDIEs(bool KeepCUDie) {
+ if (DieArray.size() > (unsigned)KeepCUDie) {
+    // std::vector never actually shrinks its allocation: when clear() or
+    // erase() are called, or the vector is resized to a smaller size, the
+    // reported size gets smaller but the allocated memory remains intact
+    // (call capacity() to see this). To really free it, we create a
+    // temporary vector and swap the contents, which only swaps the
+    // internal pointers; when the temporary goes out of scope, it
+    // destroys the old contents and releases the storage.
+ std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
+ DieArray.swap(TmpArray);
+ // Save at least the compile unit DIE
+ if (KeepCUDie)
+ DieArray.push_back(TmpArray.front());
+ }
+}
+
+void
+DWARFUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+ bool clear_dies_if_already_not_parsed,
+ uint32_t CUOffsetInAranges) {
+  // This function is usually called when there is no .debug_aranges section
+  // and an accurate compile-unit-level set of address ranges needs to be
+  // built. If the DIEs had not been parsed yet, we do not want the DIEs of
+  // every compile unit to stay loaded just for this, so we may end up
+  // parsing the DWARF here and then throwing it all away again to keep
+  // memory usage down.
+ const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
+ clear_dies_if_already_not_parsed;
+ DieArray[0].buildAddressRangeTable(this, debug_aranges, CUOffsetInAranges);
+ bool DWOCreated = parseDWO();
+ if (DWO.get()) {
+ // If there is a .dwo file for this compile unit, then skeleton CU DIE
+ // doesn't have children, and we should instead build address range table
+ // from DIEs in the .debug_info.dwo section of .dwo file.
+ DWO->getUnit()->buildAddressRangeTable(
+ debug_aranges, clear_dies_if_already_not_parsed, CUOffsetInAranges);
+ }
+ if (DWOCreated && clear_dies_if_already_not_parsed)
+ DWO.reset();
+
+  // Keep memory usage down by clearing the DIEs if this call to the
+  // function caused them to be parsed.
+ if (clear_dies)
+ clearDIEs(true);
+}
+
+const DWARFDebugInfoEntryMinimal *
+DWARFUnit::getSubprogramForAddress(uint64_t Address) {
+ extractDIEsIfNeeded(false);
+ for (size_t i = 0, n = DieArray.size(); i != n; i++)
+ if (DieArray[i].isSubprogramDIE() &&
+ DieArray[i].addressRangeContainsAddress(this, Address)) {
+ return &DieArray[i];
+ }
+ return 0;
+}
+
+DWARFDebugInfoEntryInlinedChain
+DWARFUnit::getInlinedChainForAddress(uint64_t Address) {
+ // First, find a subprogram that contains the given address (the root
+ // of inlined chain).
+ const DWARFUnit *ChainCU = 0;
+ const DWARFDebugInfoEntryMinimal *SubprogramDIE =
+ getSubprogramForAddress(Address);
+ if (SubprogramDIE) {
+ ChainCU = this;
+ } else {
+ // Try to look for subprogram DIEs in the DWO file.
+ parseDWO();
+ if (DWO.get()) {
+ SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
+ if (SubprogramDIE)
+ ChainCU = DWO->getUnit();
+ }
+ }
+
+ // Get inlined chain rooted at this subprogram DIE.
+ if (!SubprogramDIE)
+ return DWARFDebugInfoEntryInlinedChain();
+ return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address);
+}
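
clearDIEs() above relies on the classic swap-with-a-temporary trick, since clearing a std::vector does not release its allocation. A standalone sketch of the effect, using a plain vector of int in place of the DIE array:

// Standalone sketch: clear() keeps the vector's capacity, while swapping
// with a temporary hands the storage to an object that is destroyed soon
// after, releasing the memory.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> DieArray(100000, 0);
  DieArray.clear();
  std::printf("after clear(): size=%lu capacity=%lu\n",
              (unsigned long)DieArray.size(),
              (unsigned long)DieArray.capacity());

  std::vector<int> Tmp;
  DieArray.swap(Tmp);  // Tmp now owns the big allocation and will free it
                       // when it is destroyed at the end of main().
  std::printf("after swap():  size=%lu capacity=%lu\n",
              (unsigned long)DieArray.size(),
              (unsigned long)DieArray.capacity());
  return 0;
}
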
diff --git a/lib/DebugInfo/DWARFUnit.h b/lib/DebugInfo/DWARFUnit.h
new file mode 100644
index 0000000..bd768a6
--- /dev/null
+++ b/lib/DebugInfo/DWARFUnit.h
@@ -0,0 +1,168 @@
+//===-- DWARFUnit.h ---------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFUNIT_H
+#define LLVM_DEBUGINFO_DWARFUNIT_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "DWARFDebugAbbrev.h"
+#include "DWARFDebugInfoEntry.h"
+#include "DWARFDebugRangeList.h"
+#include "DWARFRelocMap.h"
+#include <vector>
+
+namespace llvm {
+
+namespace object {
+class ObjectFile;
+}
+
+class DWARFDebugAbbrev;
+class StringRef;
+class raw_ostream;
+
+class DWARFUnit {
+ const DWARFDebugAbbrev *Abbrev;
+ StringRef InfoSection;
+ StringRef AbbrevSection;
+ StringRef RangeSection;
+ uint32_t RangeSectionBase;
+ StringRef StringSection;
+ StringRef StringOffsetSection;
+ StringRef AddrOffsetSection;
+ uint32_t AddrOffsetSectionBase;
+ const RelocAddrMap *RelocMap;
+ bool isLittleEndian;
+
+ uint32_t Offset;
+ uint32_t Length;
+ uint16_t Version;
+ const DWARFAbbreviationDeclarationSet *Abbrevs;
+ uint8_t AddrSize;
+ uint64_t BaseAddr;
+ // The compile unit debug information entry items.
+ std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+
+ class DWOHolder {
+ OwningPtr<object::ObjectFile> DWOFile;
+ OwningPtr<DWARFContext> DWOContext;
+ DWARFUnit *DWOU;
+ public:
+ DWOHolder(object::ObjectFile *DWOFile);
+ DWARFUnit *getUnit() const { return DWOU; }
+ };
+ OwningPtr<DWOHolder> DWO;
+
+protected:
+ virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
+
+public:
+
+ DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+ StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+ const RelocAddrMap *M, bool LE);
+
+ virtual ~DWARFUnit();
+
+ StringRef getStringSection() const { return StringSection; }
+ StringRef getStringOffsetSection() const { return StringOffsetSection; }
+ void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
+ AddrOffsetSection = AOS;
+ AddrOffsetSectionBase = Base;
+ }
+ void setRangesSection(StringRef RS, uint32_t Base) {
+ RangeSection = RS;
+ RangeSectionBase = Base;
+ }
+
+ bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+ // FIXME: Result should be uint64_t in DWARF64.
+ bool getStringOffsetSectionItem(uint32_t Index, uint32_t &Result) const;
+
+ DataExtractor getDebugInfoExtractor() const {
+ return DataExtractor(InfoSection, isLittleEndian, AddrSize);
+ }
+ DataExtractor getStringExtractor() const {
+ return DataExtractor(StringSection, false, 0);
+ }
+
+ const RelocAddrMap *getRelocMap() const { return RelocMap; }
+
+ bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
+
+ /// extractRangeList - extracts the range list referenced by this compile
+ /// unit from .debug_ranges section. Returns true on success.
+ /// Requires that compile unit is already extracted.
+ bool extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const;
+ void clear();
+ uint32_t getOffset() const { return Offset; }
+ /// Size in bytes of the compile unit header.
+ virtual uint32_t getSize() const { return 11; }
+ uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
+ uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
+ /// Size in bytes of the .debug_info data associated with this compile unit.
+ size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
+ uint32_t getLength() const { return Length; }
+ uint16_t getVersion() const { return Version; }
+ const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
+ return Abbrevs;
+ }
+ uint8_t getAddressByteSize() const { return AddrSize; }
+ uint64_t getBaseAddress() const { return BaseAddr; }
+
+ void setBaseAddress(uint64_t base_addr) {
+ BaseAddr = base_addr;
+ }
+
+ const DWARFDebugInfoEntryMinimal *
+ getCompileUnitDIE(bool extract_cu_die_only = true) {
+ extractDIEsIfNeeded(extract_cu_die_only);
+ return DieArray.empty() ? NULL : &DieArray[0];
+ }
+
+ const char *getCompilationDir();
+ uint64_t getDWOId();
+
+ void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+ bool clear_dies_if_already_not_parsed,
+ uint32_t CUOffsetInAranges);
+
+ /// getInlinedChainForAddress - fetches inlined chain for a given address.
+ /// Returns empty chain if there is no subprogram containing address. The
+ /// chain is valid as long as parsed compile unit DIEs are not cleared.
+ DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
+
+private:
+ /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
+ /// hasn't already been done. Returns the number of DIEs parsed at this call.
+ size_t extractDIEsIfNeeded(bool CUDieOnly);
+ /// extractDIEsToVector - Appends all parsed DIEs to a vector.
+ void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
+ std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
+ /// setDIERelations - We read in all of the DIE entries into our flat list
+ /// of DIE entries and now we need to go back through all of them and set the
+ /// parent, sibling and child pointers for quick DIE navigation.
+ void setDIERelations();
+ /// clearDIEs - Clear parsed DIEs to keep memory usage low.
+ void clearDIEs(bool KeepCUDie);
+
+ /// parseDWO - Parses .dwo file for current compile unit. Returns true if
+ /// it was actually constructed.
+ bool parseDWO();
+
+ /// getSubprogramForAddress - Returns subprogram DIE with address range
+ /// encompassing the provided address. The pointer is alive as long as parsed
+ /// compile unit DIEs are not cleared.
+ const DWARFDebugInfoEntryMinimal *getSubprogramForAddress(uint64_t Address);
+};
+
+}
+
+#endif
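
As a worked example of the header arithmetic declared above (getSize(), getFirstDIEOffset(), getNextUnitOffset(), getDebugInfoSize()): in DWARF32 the initial length field is itself 4 bytes and is not counted in Length, so the next unit starts at Offset + Length + 4 and the DIE data spans Length + 4 - getSize() bytes. The numbers below are hypothetical.

// Standalone worked example of the compile unit header arithmetic.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t HeaderSize = 11;  // 4 length + 2 version + 4 abbrev offset + 1 addr size
  const uint32_t Offset = 0x100;   // where this unit starts in .debug_info
  const uint32_t Length = 0x54;    // value of the unit's length field

  uint32_t FirstDIEOffset = Offset + HeaderSize;
  uint32_t NextUnitOffset = Offset + Length + 4;
  uint32_t DebugInfoSize  = Length + 4 - HeaderSize;

  // The DIE data is exactly what lies between the header and the next unit.
  assert(NextUnitOffset - FirstDIEOffset == DebugInfoSize);
  std::printf("first DIE at 0x%x, next unit at 0x%x, %u bytes of DIE data\n",
              (unsigned)FirstDIEOffset, (unsigned)NextUnitOffset,
              (unsigned)DebugInfoSize);
  return 0;
}
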
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index c463e9f..2a610d5 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "jit"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ExecutionEngine/GenericValue.h"
@@ -39,6 +40,11 @@ using namespace llvm;
STATISTIC(NumInitBytes, "Number of bytes of global vars initialized");
STATISTIC(NumGlobals , "Number of global vars initialized");
+// Pin the vtable to this file.
+void ObjectCache::anchor() {}
+void ObjectBuffer::anchor() {}
+void ObjectBufferStream::anchor() {}
+
ExecutionEngine *(*ExecutionEngine::JITCtor)(
Module *M,
std::string *ErrorStr,
@@ -56,9 +62,7 @@ ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
ExecutionEngine::ExecutionEngine(Module *M)
: EEState(*this),
- LazyFunctionCreator(0),
- ExceptionTableRegister(0),
- ExceptionTableDeregister(0) {
+ LazyFunctionCreator(0) {
CompilingLazily = false;
GVCompilationDisabled = false;
SymbolSearchingDisabled = false;
@@ -72,16 +76,6 @@ ExecutionEngine::~ExecutionEngine() {
delete Modules[i];
}
-void ExecutionEngine::DeregisterAllTables() {
- if (ExceptionTableDeregister) {
- DenseMap<const Function*, void*>::iterator it = AllExceptionTables.begin();
- DenseMap<const Function*, void*>::iterator ite = AllExceptionTables.end();
- for (; it != ite; ++it)
- ExceptionTableDeregister(it->second);
- AllExceptionTables.clear();
- }
-}
-
namespace {
/// \brief Helper class which uses a value handler to automatically delete the
/// memory block when the GlobalVariable is destroyed.
@@ -556,6 +550,24 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
// with the correct bit width.
Result.IntVal = APInt(C->getType()->getPrimitiveSizeInBits(), 0);
break;
+ case Type::StructTyID: {
+ // if the whole struct is 'undef' just reserve memory for the value.
+ if(StructType *STy = dyn_cast<StructType>(C->getType())) {
+ unsigned int elemNum = STy->getNumElements();
+ Result.AggregateVal.resize(elemNum);
+ for (unsigned int i = 0; i < elemNum; ++i) {
+ Type *ElemTy = STy->getElementType(i);
+ if (ElemTy->isIntegerTy())
+ Result.AggregateVal[i].IntVal =
+ APInt(ElemTy->getPrimitiveSizeInBits(), 0);
+ else if (ElemTy->isAggregateType()) {
+ const Constant *ElemUndef = UndefValue::get(ElemTy);
+ Result.AggregateVal[i] = getConstantValue(ElemUndef);
+ }
+ }
+ }
+ }
+ break;
case Type::VectorTyID:
// if the whole vector is 'undef' just reserve memory for the value.
const VectorType* VTy = dyn_cast<VectorType>(C->getType());
@@ -564,7 +576,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
Result.AggregateVal.resize(elemNum);
if (ElemTy->isIntegerTy())
for (unsigned int i = 0; i < elemNum; ++i)
- Result.AggregateVal[i].IntVal =
+ Result.AggregateVal[i].IntVal =
APInt(ElemTy->getPrimitiveSizeInBits(), 0);
break;
}
@@ -1283,6 +1295,10 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
if (GA == 0) {
// If it's not already specified, allocate memory for the global.
GA = getMemoryForGV(GV);
+
+ // If we failed to allocate memory for this global, return.
+ if (GA == 0) return;
+
addGlobalMapping(GV, GA);
}
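
The new StructTyID case above zero-fills an undef struct by recursing into nested aggregates. Below is a standalone sketch of that recursion; Value here is a stand-in for GenericValue (which likewise stores aggregate elements in a vector of itself), and the struct layout built in main() is hypothetical.

// Standalone sketch: integer elements become zero, nested aggregates recurse.
#include <cstdio>
#include <vector>

struct Value {
  long Int;
  std::vector<Value> Aggregate;
  Value() : Int(0) {}
};

// Zero-initialize a struct whose element i is a plain integer when
// ElemCounts[i] == 0, and a nested struct with that many integer fields
// otherwise.
static Value zeroInitStruct(const std::vector<unsigned> &ElemCounts) {
  Value V;
  V.Aggregate.resize(ElemCounts.size());
  for (size_t i = 0; i != ElemCounts.size(); ++i) {
    if (ElemCounts[i] == 0) {
      V.Aggregate[i].Int = 0;                  // integer element
    } else {
      std::vector<unsigned> Inner(ElemCounts[i], 0);
      V.Aggregate[i] = zeroInitStruct(Inner);  // nested aggregate: recurse
    }
  }
  return V;
}

int main() {
  // Hypothetical layout: { i32, { i32, i32 } }
  std::vector<unsigned> Counts;
  Counts.push_back(0);  // plain integer
  Counts.push_back(2);  // nested struct with two integers
  Value V = zeroInitStruct(Counts);
  std::printf("outer has %lu elements, nested has %lu\n",
              (unsigned long)V.Aggregate.size(),
              (unsigned long)V.Aggregate[1].Aggregate.size());
  return 0;
}
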
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 88e73bf..2d34eea 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -339,14 +339,10 @@ void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global) {
namespace {
struct SimpleBindingMMFunctions {
- uint8_t *(*AllocateCodeSection)(void *Opaque,
- uintptr_t Size, unsigned Alignment,
- unsigned SectionID);
- uint8_t *(*AllocateDataSection)(void *Opaque,
- uintptr_t Size, unsigned Alignment,
- unsigned SectionID, LLVMBool IsReadOnly);
- LLVMBool (*FinalizeMemory)(void *Opaque, char **ErrMsg);
- void (*Destroy)(void *Opaque);
+ LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection;
+ LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection;
+ LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory;
+ LLVMMemoryManagerDestroyCallback Destroy;
};
class SimpleBindingMemoryManager : public RTDyldMemoryManager {
@@ -355,12 +351,13 @@ public:
void *Opaque);
virtual ~SimpleBindingMemoryManager();
- virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID);
+ virtual uint8_t *allocateCodeSection(
+ uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName);
- virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID,
- bool isReadOnly);
+ virtual uint8_t *allocateDataSection(
+ uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName, bool isReadOnly);
virtual bool finalizeMemory(std::string *ErrMsg);
@@ -388,13 +385,17 @@ SimpleBindingMemoryManager::~SimpleBindingMemoryManager() {
}
uint8_t *SimpleBindingMemoryManager::allocateCodeSection(
- uintptr_t Size, unsigned Alignment, unsigned SectionID) {
- return Functions.AllocateCodeSection(Opaque, Size, Alignment, SectionID);
+ uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName) {
+ return Functions.AllocateCodeSection(Opaque, Size, Alignment, SectionID,
+ SectionName.str().c_str());
}
uint8_t *SimpleBindingMemoryManager::allocateDataSection(
- uintptr_t Size, unsigned Alignment, unsigned SectionID, bool isReadOnly) {
+ uintptr_t Size, unsigned Alignment, unsigned SectionID,
+ StringRef SectionName, bool isReadOnly) {
return Functions.AllocateDataSection(Opaque, Size, Alignment, SectionID,
+ SectionName.str().c_str(),
isReadOnly);
}
@@ -415,14 +416,10 @@ bool SimpleBindingMemoryManager::finalizeMemory(std::string *ErrMsg) {
LLVMMCJITMemoryManagerRef LLVMCreateSimpleMCJITMemoryManager(
void *Opaque,
- uint8_t *(*AllocateCodeSection)(void *Opaque,
- uintptr_t Size, unsigned Alignment,
- unsigned SectionID),
- uint8_t *(*AllocateDataSection)(void *Opaque,
- uintptr_t Size, unsigned Alignment,
- unsigned SectionID, LLVMBool IsReadOnly),
- LLVMBool (*FinalizeMemory)(void *Opaque, char **ErrMsg),
- void (*Destroy)(void *Opaque)) {
+ LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection,
+ LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection,
+ LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory,
+ LLVMMemoryManagerDestroyCallback Destroy) {
if (!AllocateCodeSection || !AllocateDataSection || !FinalizeMemory ||
!Destroy)
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
index 3d9ff53..777d0f1 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
@@ -61,7 +61,7 @@ public:
GetNewMethodIDFunc(GetNewMethodIDImpl) {
}
- // Sends an event anncouncing that a function has been emitted
+ // Sends an event announcing that a function has been emitted
// return values are event-specific. See Intel documentation for details.
int iJIT_NotifyEvent(iJIT_JVM_EVENT EventType, void *EventSpecificData) {
if (!NotifyEventFunc)
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index fc3d579..5de0659 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -786,20 +786,31 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) {
}
static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2,
- GenericValue Src3) {
- return Src1.IntVal == 0 ? Src3 : Src2;
+ GenericValue Src3, const Type *Ty) {
+ GenericValue Dest;
+ if(Ty->isVectorTy()) {
+ assert(Src1.AggregateVal.size() == Src2.AggregateVal.size());
+ assert(Src2.AggregateVal.size() == Src3.AggregateVal.size());
+ Dest.AggregateVal.resize( Src1.AggregateVal.size() );
+ for (size_t i = 0; i < Src1.AggregateVal.size(); ++i)
+ Dest.AggregateVal[i] = (Src1.AggregateVal[i].IntVal == 0) ?
+ Src3.AggregateVal[i] : Src2.AggregateVal[i];
+ } else {
+ Dest = (Src1.IntVal == 0) ? Src3 : Src2;
+ }
+ return Dest;
}
void Interpreter::visitSelectInst(SelectInst &I) {
ExecutionContext &SF = ECStack.back();
+ const Type * Ty = I.getOperand(0)->getType();
GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
- GenericValue R = executeSelectInst(Src1, Src2, Src3);
+ GenericValue R = executeSelectInst(Src1, Src2, Src3, Ty);
SetValue(&I, R, SF);
}
-
//===----------------------------------------------------------------------===//
// Terminator Instruction Implementations
//===----------------------------------------------------------------------===//
@@ -887,40 +898,11 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
// Check to see if any of the cases match...
BasicBlock *Dest = 0;
for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
- IntegersSubset& Case = i.getCaseValueEx();
- if (Case.isSingleNumber()) {
- // FIXME: Currently work with ConstantInt based numbers.
- const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt();
- GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
- if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
- Dest = cast<BasicBlock>(i.getCaseSuccessor());
- break;
- }
+ GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
+ if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
+ Dest = cast<BasicBlock>(i.getCaseSuccessor());
+ break;
}
- if (Case.isSingleNumbersOnly()) {
- for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
- // FIXME: Currently work with ConstantInt based numbers.
- const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt();
- GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
- if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
- Dest = cast<BasicBlock>(i.getCaseSuccessor());
- break;
- }
- }
- } else
- for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
- IntegersSubset::Range r = Case.getItem(n);
- // FIXME: Currently work with ConstantInt based numbers.
- const ConstantInt *LowCI = r.getLow().toConstantInt();
- const ConstantInt *HighCI = r.getHigh().toConstantInt();
- GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF);
- GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF);
- if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 &&
- executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) {
- Dest = cast<BasicBlock>(i.getCaseSuccessor());
- break;
- }
- }
}
if (!Dest) Dest = I.getDefaultDest(); // No cases matched: use default
SwitchToNewBasicBlock(Dest, SF);
@@ -1793,10 +1775,204 @@ void Interpreter::visitExtractElementInst(ExtractElementInst &I) {
SetValue(&I, Dest, SF);
}
+void Interpreter::visitInsertElementInst(InsertElementInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ Type *Ty = I.getType();
+
+ if(!(Ty->isVectorTy()) )
+ llvm_unreachable("Unhandled dest type for insertelement instruction");
+
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+ GenericValue Dest;
+
+ Type *TyContained = Ty->getContainedType(0);
+
+ const unsigned indx = unsigned(Src3.IntVal.getZExtValue());
+ Dest.AggregateVal = Src1.AggregateVal;
+
+ if(Src1.AggregateVal.size() <= indx)
+ llvm_unreachable("Invalid index in insertelement instruction");
+ switch (TyContained->getTypeID()) {
+ default:
+ llvm_unreachable("Unhandled dest type for insertelement instruction");
+ case Type::IntegerTyID:
+ Dest.AggregateVal[indx].IntVal = Src2.IntVal;
+ break;
+ case Type::FloatTyID:
+ Dest.AggregateVal[indx].FloatVal = Src2.FloatVal;
+ break;
+ case Type::DoubleTyID:
+ Dest.AggregateVal[indx].DoubleVal = Src2.DoubleVal;
+ break;
+ }
+ SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
+ ExecutionContext &SF = ECStack.back();
+
+ Type *Ty = I.getType();
+ if(!(Ty->isVectorTy()))
+ llvm_unreachable("Unhandled dest type for shufflevector instruction");
+
+ GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+ GenericValue Dest;
+
+ // There is no need to check types of src1 and src2, because the compiled
+ // bytecode can't contain different types for src1 and src2 for a
+ // shufflevector instruction.
+
+ Type *TyContained = Ty->getContainedType(0);
+ unsigned src1Size = (unsigned)Src1.AggregateVal.size();
+ unsigned src2Size = (unsigned)Src2.AggregateVal.size();
+ unsigned src3Size = (unsigned)Src3.AggregateVal.size();
+
+ Dest.AggregateVal.resize(src3Size);
+
+ switch (TyContained->getTypeID()) {
+ default:
+ llvm_unreachable("Unhandled dest type for insertelement instruction");
+ break;
+ case Type::IntegerTyID:
+ for( unsigned i=0; i<src3Size; i++) {
+ unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+ if(j < src1Size)
+ Dest.AggregateVal[i].IntVal = Src1.AggregateVal[j].IntVal;
+ else if(j < src1Size + src2Size)
+ Dest.AggregateVal[i].IntVal = Src2.AggregateVal[j-src1Size].IntVal;
+ else
+          // A selector index must not exceed the combined length of the first
+          // and second operands, and well-formed IR should never contain
+          //   %tmp = shufflevector <2 x i32> <i32 3, i32 4>, <2 x i32> undef,
+          //                        <2 x i32> < i32 0, i32 5 >,
+          // where i32 5 is out of range, but keep this as an extra check here:
+ llvm_unreachable("Invalid mask in shufflevector instruction");
+ }
+ break;
+ case Type::FloatTyID:
+ for( unsigned i=0; i<src3Size; i++) {
+ unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+ if(j < src1Size)
+ Dest.AggregateVal[i].FloatVal = Src1.AggregateVal[j].FloatVal;
+ else if(j < src1Size + src2Size)
+ Dest.AggregateVal[i].FloatVal = Src2.AggregateVal[j-src1Size].FloatVal;
+ else
+ llvm_unreachable("Invalid mask in shufflevector instruction");
+ }
+ break;
+ case Type::DoubleTyID:
+ for( unsigned i=0; i<src3Size; i++) {
+ unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+ if(j < src1Size)
+ Dest.AggregateVal[i].DoubleVal = Src1.AggregateVal[j].DoubleVal;
+ else if(j < src1Size + src2Size)
+ Dest.AggregateVal[i].DoubleVal =
+ Src2.AggregateVal[j-src1Size].DoubleVal;
+ else
+ llvm_unreachable("Invalid mask in shufflevector instruction");
+ }
+ break;
+ }
+ SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitExtractValueInst(ExtractValueInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ Value *Agg = I.getAggregateOperand();
+ GenericValue Dest;
+ GenericValue Src = getOperandValue(Agg, SF);
+
+ ExtractValueInst::idx_iterator IdxBegin = I.idx_begin();
+ unsigned Num = I.getNumIndices();
+ GenericValue *pSrc = &Src;
+
+ for (unsigned i = 0 ; i < Num; ++i) {
+ pSrc = &pSrc->AggregateVal[*IdxBegin];
+ ++IdxBegin;
+ }
+
+ Type *IndexedType = ExtractValueInst::getIndexedType(Agg->getType(), I.getIndices());
+ switch (IndexedType->getTypeID()) {
+ default:
+ llvm_unreachable("Unhandled dest type for extractelement instruction");
+ break;
+ case Type::IntegerTyID:
+ Dest.IntVal = pSrc->IntVal;
+ break;
+ case Type::FloatTyID:
+ Dest.FloatVal = pSrc->FloatVal;
+ break;
+ case Type::DoubleTyID:
+ Dest.DoubleVal = pSrc->DoubleVal;
+ break;
+ case Type::ArrayTyID:
+ case Type::StructTyID:
+ case Type::VectorTyID:
+ Dest.AggregateVal = pSrc->AggregateVal;
+ break;
+ case Type::PointerTyID:
+ Dest.PointerVal = pSrc->PointerVal;
+ break;
+ }
+
+ SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitInsertValueInst(InsertValueInst &I) {
+
+ ExecutionContext &SF = ECStack.back();
+ Value *Agg = I.getAggregateOperand();
+
+ GenericValue Src1 = getOperandValue(Agg, SF);
+ GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+ GenericValue Dest = Src1; // Dest is a slightly changed Src1
+
+ ExtractValueInst::idx_iterator IdxBegin = I.idx_begin();
+ unsigned Num = I.getNumIndices();
+
+ GenericValue *pDest = &Dest;
+ for (unsigned i = 0 ; i < Num; ++i) {
+ pDest = &pDest->AggregateVal[*IdxBegin];
+ ++IdxBegin;
+ }
+ // pDest points to the target value in the Dest now
+
+ Type *IndexedType = ExtractValueInst::getIndexedType(Agg->getType(), I.getIndices());
+
+ switch (IndexedType->getTypeID()) {
+ default:
+ llvm_unreachable("Unhandled dest type for insertelement instruction");
+ break;
+ case Type::IntegerTyID:
+ pDest->IntVal = Src2.IntVal;
+ break;
+ case Type::FloatTyID:
+ pDest->FloatVal = Src2.FloatVal;
+ break;
+ case Type::DoubleTyID:
+ pDest->DoubleVal = Src2.DoubleVal;
+ break;
+ case Type::ArrayTyID:
+ case Type::StructTyID:
+ case Type::VectorTyID:
+ pDest->AggregateVal = Src2.AggregateVal;
+ break;
+ case Type::PointerTyID:
+ pDest->PointerVal = Src2.PointerVal;
+ break;
+ }
+
+ SetValue(&I, Dest, SF);
+}
+
GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
ExecutionContext &SF) {
switch (CE->getOpcode()) {
- case Instruction::Trunc:
+ case Instruction::Trunc:
return executeTruncInst(CE->getOperand(0), CE->getType(), SF);
case Instruction::ZExt:
return executeZExtInst(CE->getOperand(0), CE->getType(), SF);
@@ -1832,7 +2008,8 @@ GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
case Instruction::Select:
return executeSelectInst(getOperandValue(CE->getOperand(0), SF),
getOperandValue(CE->getOperand(1), SF),
- getOperandValue(CE->getOperand(2), SF));
+ getOperandValue(CE->getOperand(2), SF),
+ CE->getOperand(0)->getType());
default :
break;
}
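
The interpreter changes above implement shufflevector by treating each mask entry as an index into the two source vectors laid end to end: index j selects element j of the first source when j is less than its length, and element j minus that length of the second source otherwise. A standalone sketch of that mapping over plain std::vector inputs (the values are hypothetical):

// Standalone sketch of the shufflevector index mapping.
#include <cstdio>
#include <vector>

static std::vector<int> shuffle(const std::vector<int> &A,
                                const std::vector<int> &B,
                                const std::vector<unsigned> &Mask) {
  std::vector<int> Out(Mask.size());
  for (size_t i = 0; i != Mask.size(); ++i) {
    unsigned j = Mask[i];
    if (j < A.size())
      Out[i] = A[j];                       // element from the first source
    else if (j < A.size() + B.size())
      Out[i] = B[j - A.size()];            // element from the second source
    else
      return std::vector<int>();           // out-of-range selector: reject
  }
  return Out;
}

int main() {
  std::vector<int> A, B;
  A.push_back(10); A.push_back(20);
  B.push_back(30); B.push_back(40);
  std::vector<unsigned> Mask;
  Mask.push_back(0); Mask.push_back(3);    // picks A[0] and B[1]
  std::vector<int> R = shuffle(A, B, Mask);
  std::printf("%d %d\n", R[0], R[1]);      // prints: 10 40
  return 0;
}
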
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index bef4bbf..a03c7f5 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -406,6 +406,7 @@ GenericValue lle_X_sprintf(FunctionType *FT,
break;
}
}
+ return GV;
}
// int printf(const char *, ...) - a very rough implementation to make output
@@ -434,7 +435,7 @@ GenericValue lle_X_sscanf(FunctionType *FT,
GenericValue GV;
GV.IntVal = APInt(32, sscanf(Args[0], Args[1], Args[2], Args[3], Args[4],
- Args[5], Args[6], Args[7], Args[8], Args[9]));
+ Args[5], Args[6], Args[7], Args[8], Args[9]));
return GV;
}
@@ -450,7 +451,7 @@ GenericValue lle_X_scanf(FunctionType *FT,
GenericValue GV;
GV.IntVal = APInt(32, scanf( Args[0], Args[1], Args[2], Args[3], Args[4],
- Args[5], Args[6], Args[7], Args[8], Args[9]));
+ Args[5], Args[6], Args[7], Args[8], Args[9]));
return GV;
}
@@ -470,6 +471,30 @@ GenericValue lle_X_fprintf(FunctionType *FT,
return GV;
}
+static GenericValue lle_X_memset(FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ int val = (int)Args[1].IntVal.getSExtValue();
+ size_t len = (size_t)Args[2].IntVal.getZExtValue();
+ memset((void *)GVTOP(Args[0]), val, len);
+ // llvm.memset.* returns void, lle_X_* returns GenericValue,
+ // so here we return GenericValue with IntVal set to zero
+ GenericValue GV;
+ GV.IntVal = 0;
+ return GV;
+}
+
+static GenericValue lle_X_memcpy(FunctionType *FT,
+ const std::vector<GenericValue> &Args) {
+ memcpy(GVTOP(Args[0]), GVTOP(Args[1]),
+ (size_t)(Args[2].IntVal.getLimitedValue()));
+
+ // llvm.memcpy* returns void, lle_X_* returns GenericValue,
+ // so here we return GenericValue with IntVal set to zero
+ GenericValue GV;
+ GV.IntVal = 0;
+ return GV;
+}
+
void Interpreter::initializeExternalFunctions() {
sys::ScopedLock Writer(*FunctionsLock);
FuncNames["lle_X_atexit"] = lle_X_atexit;
@@ -481,4 +506,6 @@ void Interpreter::initializeExternalFunctions() {
FuncNames["lle_X_sscanf"] = lle_X_sscanf;
FuncNames["lle_X_scanf"] = lle_X_scanf;
FuncNames["lle_X_fprintf"] = lle_X_fprintf;
+ FuncNames["lle_X_memset"] = lle_X_memset;
+ FuncNames["lle_X_memcpy"] = lle_X_memcpy;
}
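
The new lle_X_memset and lle_X_memcpy wrappers above are registered in a name-to-function table, the same dispatch scheme the interpreter uses for all external calls. Below is a standalone sketch of that table; the wrapper signature and the names are illustrative, not the interpreter's actual types.

// Standalone sketch of a name-to-wrapper dispatch table.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

typedef long (*ExtFn)(const std::vector<long> &Args);

static long ext_add(const std::vector<long> &Args) { return Args[0] + Args[1]; }
static long ext_neg(const std::vector<long> &Args) { return -Args[0]; }

int main() {
  std::map<std::string, ExtFn> FuncNames;
  // Registration mirrors the initializeExternalFunctions() pattern above.
  FuncNames["lle_X_add"] = ext_add;
  FuncNames["lle_X_neg"] = ext_neg;

  std::vector<long> Args;
  Args.push_back(2); Args.push_back(3);
  std::map<std::string, ExtFn>::iterator It = FuncNames.find("lle_X_add");
  if (It != FuncNames.end())
    std::printf("result = %ld\n", It->second(Args));  // prints: result = 5
  return 0;
}
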
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
index 2952d7e..98269ef 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -179,6 +179,12 @@ public:
void visitVAArgInst(VAArgInst &I);
void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+
+ void visitExtractValueInst(ExtractValueInst &I);
+ void visitInsertValueInst(InsertValueInst &I);
+
void visitInstruction(Instruction &I) {
errs() << I << "\n";
llvm_unreachable("Instruction not interpretable yet!");
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 53ea0a2..246a675 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -67,140 +67,6 @@ static struct RegisterJIT {
extern "C" void LLVMLinkInJIT() {
}
-// Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
- !defined(__USING_SJLJ_EXCEPTIONS__))
-#define HAVE_EHTABLE_SUPPORT 1
-#else
-#define HAVE_EHTABLE_SUPPORT 0
-#endif
-
-#if HAVE_EHTABLE_SUPPORT
-
-// libgcc defines the __register_frame function to dynamically register new
-// dwarf frames for exception handling. This functionality is not portable
-// across compilers and is only provided by GCC. We use the __register_frame
-// function here so that code generated by the JIT cooperates with the unwinding
-// runtime of libgcc. When JITting with exception handling enable, LLVM
-// generates dwarf frames and registers it to libgcc with __register_frame.
-//
-// The __register_frame function works with Linux.
-//
-// Unfortunately, this functionality seems to be in libgcc after the unwinding
-// library of libgcc for darwin was written. The code for darwin overwrites the
-// value updated by __register_frame with a value fetched with "keymgr".
-// "keymgr" is an obsolete functionality, which should be rewritten some day.
-// In the meantime, since "keymgr" is on all libgccs shipped with apple-gcc, we
-// need a workaround in LLVM which uses the "keymgr" to dynamically modify the
-// values of an opaque key, used by libgcc to find dwarf tables.
-
-extern "C" void __register_frame(void*);
-extern "C" void __deregister_frame(void*);
-
-#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED <= 1050
-# define USE_KEYMGR 1
-#else
-# define USE_KEYMGR 0
-#endif
-
-#if USE_KEYMGR
-
-namespace {
-
-// LibgccObject - This is the structure defined in libgcc. There is no #include
-// provided for this structure, so we also define it here. libgcc calls it
-// "struct object". The structure is undocumented in libgcc.
-struct LibgccObject {
- void *unused1;
- void *unused2;
- void *unused3;
-
- /// frame - Pointer to the exception table.
- void *frame;
-
- /// encoding - The encoding of the object?
- union {
- struct {
- unsigned long sorted : 1;
- unsigned long from_array : 1;
- unsigned long mixed_encoding : 1;
- unsigned long encoding : 8;
- unsigned long count : 21;
- } b;
- size_t i;
- } encoding;
-
- /// fde_end - libgcc defines this field only if some macro is defined. We
- /// include this field even if it may not there, to make libgcc happy.
- char *fde_end;
-
- /// next - At least we know it's a chained list!
- struct LibgccObject *next;
-};
-
-// "kemgr" stuff. Apparently, all frame tables are stored there.
-extern "C" void _keymgr_set_and_unlock_processwide_ptr(int, void *);
-extern "C" void *_keymgr_get_and_lock_processwide_ptr(int);
-#define KEYMGR_GCC3_DW2_OBJ_LIST 302 /* Dwarf2 object list */
-
-/// LibgccObjectInfo - libgcc defines this struct as km_object_info. It
-/// probably contains all dwarf tables that are loaded.
-struct LibgccObjectInfo {
-
- /// seenObjects - LibgccObjects already parsed by the unwinding runtime.
- ///
- struct LibgccObject* seenObjects;
-
- /// unseenObjects - LibgccObjects not parsed yet by the unwinding runtime.
- ///
- struct LibgccObject* unseenObjects;
-
- unsigned unused[2];
-};
-
-/// darwin_register_frame - Since __register_frame does not work with darwin's
-/// libgcc,we provide our own function, which "tricks" libgcc by modifying the
-/// "Dwarf2 object list" key.
-void DarwinRegisterFrame(void* FrameBegin) {
- // Get the key.
- LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
- _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
- assert(LOI && "This should be preallocated by the runtime");
-
- // Allocate a new LibgccObject to represent this frame. Deallocation of this
- // object may be impossible: since darwin code in libgcc was written after
- // the ability to dynamically register frames, things may crash if we
- // deallocate it.
- struct LibgccObject* ob = (struct LibgccObject*)
- malloc(sizeof(struct LibgccObject));
-
- // Do like libgcc for the values of the field.
- ob->unused1 = (void *)-1;
- ob->unused2 = 0;
- ob->unused3 = 0;
- ob->frame = FrameBegin;
- ob->encoding.i = 0;
- ob->encoding.b.encoding = llvm::dwarf::DW_EH_PE_omit;
-
- // Put the info on both places, as libgcc uses the first or the second
- // field. Note that we rely on having two pointers here. If fde_end was a
- // char, things would get complicated.
- ob->fde_end = (char*)LOI->unseenObjects;
- ob->next = LOI->unseenObjects;
-
- // Update the key's unseenObjects list.
- LOI->unseenObjects = ob;
-
- // Finally update the "key". Apparently, libgcc requires it.
- _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST,
- LOI);
-
-}
-
-}
-#endif // __APPLE__
-#endif // HAVE_EHTABLE_SUPPORT
-
/// createJIT - This is the factory method for creating a JIT for the current
/// machine, it does not fall back to the interpreter. This takes ownership
/// of the module.
@@ -293,33 +159,11 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
report_fatal_error("Target does not support machine code emission!");
}
- // Register routine for informing unwinding runtime about new EH frames
-#if HAVE_EHTABLE_SUPPORT
-#if USE_KEYMGR
- struct LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
- _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
-
- // The key is created on demand, and libgcc creates it the first time an
- // exception occurs. Since we need the key to register frames, we create
- // it now.
- if (!LOI)
- LOI = (LibgccObjectInfo*)calloc(sizeof(struct LibgccObjectInfo), 1);
- _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, LOI);
- InstallExceptionTableRegister(DarwinRegisterFrame);
- // Not sure about how to deregister on Darwin.
-#else
- InstallExceptionTableRegister(__register_frame);
- InstallExceptionTableDeregister(__deregister_frame);
-#endif // __APPLE__
-#endif // HAVE_EHTABLE_SUPPORT
-
// Initialize passes.
PM.doInitialization();
}
JIT::~JIT() {
- // Unregister all exception tables registered by this JIT.
- DeregisterAllTables();
// Cleanup.
AllJits->Remove(this);
delete jitstate;
diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
index 94db245..f58d31b 100644
--- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
+++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -464,7 +464,7 @@ namespace {
/// allocateCodeSection - Allocate memory for a code section.
uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID) {
+ unsigned SectionID, StringRef SectionName) {
// Grow the required block size to account for the block header
Size += sizeof(*CurBlock);
@@ -510,7 +510,8 @@ namespace {
/// allocateDataSection - Allocate memory for a data section.
uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID, bool IsReadOnly) {
+ unsigned SectionID, StringRef SectionName,
+ bool IsReadOnly) {
return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
}
@@ -793,7 +794,7 @@ static void runAtExitHandlers() {
// not inlined, and hiding their real definitions in a separate archive file
// that the dynamic linker can't see. For more info, search for
// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-#if defined(__linux__)
+#if defined(__linux__) && defined(__GLIBC__)
/* stat functions are redirecting to __xstat with a version number. On x86-64
* linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
* available as an exported symbol, so we have to add it explicitly.
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 09dd924..195c458 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -14,10 +14,12 @@
#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/PassManager.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/ErrorHandling.h"
@@ -53,37 +55,63 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
bool AllocateGVsWithCode)
- : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(MM), Dyld(MM),
- IsLoaded(false), M(m), ObjCache(0) {
+ : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(this, MM), Dyld(&MemMgr),
+ ObjCache(0) {
+ OwnedModules.addModule(m);
setDataLayout(TM->getDataLayout());
}
MCJIT::~MCJIT() {
- if (LoadedObject)
- NotifyFreeingObject(*LoadedObject.get());
- delete MemMgr;
+ MutexGuard locked(lock);
+ // FIXME: We are managing our modules, so we do not want the base class
+ // ExecutionEngine to manage them as well. To avoid double destruction
+ // of the first (and only) module added in ExecutionEngine constructor
+ // we remove it from EE and will destruct it ourselves.
+ //
+  // It may make sense to move our module manager (based on SmallPtrSet) back
+ // into EE if the JIT and Interpreter can live with it.
+ // If so, additional functions: addModule, removeModule, FindFunctionNamed,
+ // runStaticConstructorsDestructors could be moved back to EE as well.
+ //
+ Modules.clear();
+ Dyld.deregisterEHFrames();
+
+ LoadedObjectMap::iterator it, end = LoadedObjects.end();
+ for (it = LoadedObjects.begin(); it != end; ++it) {
+ ObjectImage *Obj = it->second;
+ if (Obj) {
+ NotifyFreeingObject(*Obj);
+ delete Obj;
+ }
+ }
+ LoadedObjects.clear();
delete TM;
}
+void MCJIT::addModule(Module *M) {
+ MutexGuard locked(lock);
+ OwnedModules.addModule(M);
+}
+
+bool MCJIT::removeModule(Module *M) {
+ MutexGuard locked(lock);
+ return OwnedModules.removeModule(M);
+}
+
+
+
void MCJIT::setObjectCache(ObjectCache* NewCache) {
+ MutexGuard locked(lock);
ObjCache = NewCache;
}
-ObjectBufferStream* MCJIT::emitObject(Module *m) {
- /// Currently, MCJIT only supports a single module and the module passed to
- /// this function call is expected to be the contained module. The module
- /// is passed as a parameter here to prepare for multiple module support in
- /// the future.
- assert(M == m);
-
- // Get a thread lock to make sure we aren't trying to compile multiple times
+ObjectBufferStream* MCJIT::emitObject(Module *M) {
MutexGuard locked(lock);
- // FIXME: Track compilation state on a per-module basis when multiple modules
- // are supported.
- // Re-compilation is not supported
- assert(!IsLoaded);
+ // This must be a module which has already been added but not loaded to this
+ // MCJIT instance, since these conditions are tested by our caller,
+ // generateCodeForModule.
PassManager PM;
@@ -99,7 +127,7 @@ ObjectBufferStream* MCJIT::emitObject(Module *m) {
}
// Initialize passes.
- PM.run(*m);
+ PM.run(*M);
// Flush the output buffer to get the generated code into memory
CompiledObject->flush();
@@ -109,21 +137,22 @@ ObjectBufferStream* MCJIT::emitObject(Module *m) {
// MemoryBuffer is a thin wrapper around the actual memory, so it's OK
// to create a temporary object here and delete it after the call.
OwningPtr<MemoryBuffer> MB(CompiledObject->getMemBuffer());
- ObjCache->notifyObjectCompiled(m, MB.get());
+ ObjCache->notifyObjectCompiled(M, MB.get());
}
return CompiledObject.take();
}
-void MCJIT::loadObject(Module *M) {
-
+void MCJIT::generateCodeForModule(Module *M) {
// Get a thread lock to make sure we aren't trying to load multiple times
MutexGuard locked(lock);
- // FIXME: Track compilation state on a per-module basis when multiple modules
- // are supported.
+ // This must be a module which has already been added to this MCJIT instance.
+ assert(OwnedModules.ownsModule(M) &&
+ "MCJIT::generateCodeForModule: Unknown module.");
+
// Re-compilation is not supported
- if (IsLoaded)
+ if (OwnedModules.hasModuleBeenLoaded(M))
return;
OwningPtr<ObjectBuffer> ObjectToLoad;
@@ -141,59 +170,137 @@ void MCJIT::loadObject(Module *M) {
}
// Load the object into the dynamic linker.
- // handing off ownership of the buffer
- LoadedObject.reset(Dyld.loadObject(ObjectToLoad.take()));
+ // MCJIT now owns the ObjectImage pointer (via its LoadedObjects map).
+ ObjectImage *LoadedObject = Dyld.loadObject(ObjectToLoad.take());
+ LoadedObjects[M] = LoadedObject;
if (!LoadedObject)
report_fatal_error(Dyld.getErrorString());
- // Resolve any relocations.
- Dyld.resolveRelocations();
-
// FIXME: Make this optional, maybe even move it to a JIT event listener
LoadedObject->registerWithDebugger();
NotifyObjectEmitted(*LoadedObject);
- // FIXME: Add support for per-module compilation state
- IsLoaded = true;
+ OwnedModules.markModuleAsLoaded(M);
}
-// FIXME: Add a parameter to identify which object is being finalized when
-// MCJIT supports multiple modules.
-// FIXME: Provide a way to separate code emission, relocations and page
-// protection in the interface.
+void MCJIT::finalizeLoadedModules() {
+ MutexGuard locked(lock);
+
+ // Resolve any outstanding relocations.
+ Dyld.resolveRelocations();
+
+ OwnedModules.markAllLoadedModulesAsFinalized();
+
+ // Register EH frame data for any module we own which has been loaded
+ Dyld.registerEHFrames();
+
+ // Set page permissions.
+ MemMgr.finalizeMemory();
+}
+
+// FIXME: Rename this.
void MCJIT::finalizeObject() {
- // If the module hasn't been compiled, just do that.
- if (!IsLoaded) {
- // If the call to Dyld.resolveRelocations() is removed from loadObject()
- // we'll need to do that here.
- loadObject(M);
- } else {
- // Resolve any relocations.
- Dyld.resolveRelocations();
+ MutexGuard locked(lock);
+
+ for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
+ E = OwnedModules.end_added();
+ I != E; ++I) {
+ Module *M = *I;
+ generateCodeForModule(M);
}
- StringRef EHData = Dyld.getEHFrameSection();
- if (!EHData.empty())
- MemMgr->registerEHFrames(EHData);
+ finalizeLoadedModules();
+}
- // Set page permissions.
- MemMgr->finalizeMemory();
+void MCJIT::finalizeModule(Module *M) {
+ MutexGuard locked(lock);
+
+ // This must be a module which has already been added to this MCJIT instance.
+ assert(OwnedModules.ownsModule(M) && "MCJIT::finalizeModule: Unknown module.");
+
+ // If the module hasn't been compiled, just do that.
+ if (!OwnedModules.hasModuleBeenLoaded(M))
+ generateCodeForModule(M);
+
+ finalizeLoadedModules();
}
void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
report_fatal_error("not yet implemented");
}
-void *MCJIT::getPointerToFunction(Function *F) {
- // FIXME: This should really return a uint64_t since it's a pointer in the
- // target address space, not our local address space. That's part of the
- // ExecutionEngine interface, though. Fix that when the old JIT finally
- // dies.
+uint64_t MCJIT::getExistingSymbolAddress(const std::string &Name) {
+ // Check with the RuntimeDyld to see if we already have this symbol.
+ if (Name[0] == '\1')
+ return Dyld.getSymbolLoadAddress(Name.substr(1));
+ return Dyld.getSymbolLoadAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+ + Name));
+}
+
+Module *MCJIT::findModuleForSymbol(const std::string &Name,
+ bool CheckFunctionsOnly) {
+ MutexGuard locked(lock);
+
+ // If it hasn't already been generated, see if it's in one of our modules.
+ for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
+ E = OwnedModules.end_added();
+ I != E; ++I) {
+ Module *M = *I;
+ Function *F = M->getFunction(Name);
+ if (F && !F->isDeclaration())
+ return M;
+ if (!CheckFunctionsOnly) {
+ GlobalVariable *G = M->getGlobalVariable(Name);
+ if (G && !G->isDeclaration())
+ return M;
+ // FIXME: Do we need to worry about global aliases?
+ }
+ }
+ // We didn't find the symbol in any of our modules.
+ return NULL;
+}
+
+uint64_t MCJIT::getSymbolAddress(const std::string &Name,
+ bool CheckFunctionsOnly)
+{
+ MutexGuard locked(lock);
+
+ // First, check to see if we already have this symbol.
+ uint64_t Addr = getExistingSymbolAddress(Name);
+ if (Addr)
+ return Addr;
+
+ // If it hasn't already been generated, see if it's in one of our modules.
+ Module *M = findModuleForSymbol(Name, CheckFunctionsOnly);
+ if (!M)
+ return 0;
+
+ generateCodeForModule(M);
+
+ // Check the RuntimeDyld table again, it should be there now.
+ return getExistingSymbolAddress(Name);
+}
- // FIXME: Add support for per-module compilation state
- if (!IsLoaded)
- loadObject(M);
+uint64_t MCJIT::getGlobalValueAddress(const std::string &Name) {
+ MutexGuard locked(lock);
+ uint64_t Result = getSymbolAddress(Name, false);
+ if (Result != 0)
+ finalizeLoadedModules();
+ return Result;
+}
+
+uint64_t MCJIT::getFunctionAddress(const std::string &Name) {
+ MutexGuard locked(lock);
+ uint64_t Result = getSymbolAddress(Name, true);
+ if (Result != 0)
+ finalizeLoadedModules();
+ return Result;
+}
+
+// Deprecated. Use getFunctionAddress instead.
+void *MCJIT::getPointerToFunction(Function *F) {
+ MutexGuard locked(lock);
if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
bool AbortOnFailure = !F->hasExternalWeakLinkage();
@@ -202,6 +309,16 @@ void *MCJIT::getPointerToFunction(Function *F) {
return Addr;
}
+ Module *M = F->getParent();
+ bool HasBeenAddedButNotLoaded = OwnedModules.hasModuleBeenAddedButNotLoaded(M);
+
+ // Make sure the relevant module has been compiled and loaded.
+ if (HasBeenAddedButNotLoaded)
+ generateCodeForModule(M);
+ else if (!OwnedModules.hasModuleBeenLoaded(M))
+ // If this function doesn't belong to one of our modules, we're done.
+ return NULL;
+
// FIXME: Should the Dyld be retaining module information? Probably not.
// FIXME: Should we be using the mangler for this? Probably.
//
@@ -222,6 +339,45 @@ void MCJIT::freeMachineCodeForFunction(Function *F) {
report_fatal_error("not yet implemented");
}
+void MCJIT::runStaticConstructorsDestructorsInModulePtrSet(
+ bool isDtors, ModulePtrSet::iterator I, ModulePtrSet::iterator E) {
+ for (; I != E; ++I) {
+ ExecutionEngine::runStaticConstructorsDestructors(*I, isDtors);
+ }
+}
+
+void MCJIT::runStaticConstructorsDestructors(bool isDtors) {
+ // Execute global ctors/dtors for each module in the program.
+ runStaticConstructorsDestructorsInModulePtrSet(
+ isDtors, OwnedModules.begin_added(), OwnedModules.end_added());
+ runStaticConstructorsDestructorsInModulePtrSet(
+ isDtors, OwnedModules.begin_loaded(), OwnedModules.end_loaded());
+ runStaticConstructorsDestructorsInModulePtrSet(
+ isDtors, OwnedModules.begin_finalized(), OwnedModules.end_finalized());
+}
+
+Function *MCJIT::FindFunctionNamedInModulePtrSet(const char *FnName,
+ ModulePtrSet::iterator I,
+ ModulePtrSet::iterator E) {
+ for (; I != E; ++I) {
+ if (Function *F = (*I)->getFunction(FnName))
+ return F;
+ }
+ return 0;
+}
+
+Function *MCJIT::FindFunctionNamed(const char *FnName) {
+ Function *F = FindFunctionNamedInModulePtrSet(
+ FnName, OwnedModules.begin_added(), OwnedModules.end_added());
+ if (!F)
+ F = FindFunctionNamedInModulePtrSet(FnName, OwnedModules.begin_loaded(),
+ OwnedModules.end_loaded());
+ if (!F)
+ F = FindFunctionNamedInModulePtrSet(FnName, OwnedModules.begin_finalized(),
+ OwnedModules.end_finalized());
+ return F;
+}
+
GenericValue MCJIT::runFunction(Function *F,
const std::vector<GenericValue> &ArgValues) {
assert(F && "Function *F was null at entry to run()");
@@ -324,12 +480,8 @@ GenericValue MCJIT::runFunction(Function *F,
void *MCJIT::getPointerToNamedFunction(const std::string &Name,
bool AbortOnFailure) {
- // FIXME: Add support for per-module compilation state
- if (!IsLoaded)
- loadObject(M);
-
- if (!isSymbolSearchingDisabled() && MemMgr) {
- void *ptr = MemMgr->getPointerToNamedFunction(Name, false);
+ if (!isSymbolSearchingDisabled()) {
+ void *ptr = MemMgr.getPointerToNamedFunction(Name, false);
if (ptr)
return ptr;
}
@@ -365,6 +517,7 @@ void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
}
void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
MutexGuard locked(lock);
+ MemMgr.notifyObjectLoaded(this, &Obj);
for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
EventListeners[I]->NotifyObjectEmitted(Obj);
}
@@ -375,3 +528,14 @@ void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
EventListeners[I]->NotifyFreeingObject(Obj);
}
}
+
+uint64_t LinkingMemoryManager::getSymbolAddress(const std::string &Name) {
+ uint64_t Result = ParentEngine->getSymbolAddress(Name, false);
+ // If the symbol wasn't found and it begins with an underscore, try again
+ // without the underscore.
+ if (!Result && Name[0] == '_')
+ Result = ParentEngine->getSymbolAddress(Name.substr(1), false);
+ if (Result)
+ return Result;
+ return ClientMM->getSymbolAddress(Name);
+}
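
One detail worth noting as an aside: the fallback above exists because IR-level names and assembler-level symbols differ on targets with a global prefix, for example Mach-O's leading underscore. A minimal sketch of the two directions, assuming a TargetMachine is at hand, mirroring getExistingSymbolAddress above:

#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"
#include <string>

// Sketch: getExistingSymbolAddress() prepends the prefix before asking
// RuntimeDyld; the fallback above strips a leading '_' when a pre-prefixed
// name fails to resolve against the engine's own modules.
static std::string irNameToSymbol(const llvm::TargetMachine &TM,
                                  const std::string &IRName) {
  return TM.getMCAsmInfo()->getGlobalPrefix() + IRName;  // e.g. foo -> _foo
}

static std::string symbolToIRName(const std::string &Sym) {
  return (!Sym.empty() && Sym[0] == '_') ? Sym.substr(1) : Sym;
}
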
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index a899d4f..86b478b 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -10,48 +10,235 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_H
#define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/Module.h"
namespace llvm {
+class MCJIT;
-class ObjectImage;
+// This is a helper class that the MCJIT execution engine uses for linking
+// functions across modules that it owns. It aggregates the memory manager
+// that is passed in to the MCJIT constructor and defers most functionality
+// to that object.
+class LinkingMemoryManager : public RTDyldMemoryManager {
+public:
+ LinkingMemoryManager(MCJIT *Parent, RTDyldMemoryManager *MM)
+ : ParentEngine(Parent), ClientMM(MM) {}
+
+ virtual uint64_t getSymbolAddress(const std::string &Name);
+
+ // Functions deferred to client memory manager
+ virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID, StringRef SectionName) {
+ return ClientMM->allocateCodeSection(Size, Alignment, SectionID, SectionName);
+ }
+
+ virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID, StringRef SectionName,
+ bool IsReadOnly) {
+ return ClientMM->allocateDataSection(Size, Alignment,
+ SectionID, SectionName, IsReadOnly);
+ }
+
+ virtual void notifyObjectLoaded(ExecutionEngine *EE,
+ const ObjectImage *Obj) {
+ ClientMM->notifyObjectLoaded(EE, Obj);
+ }
+
+ virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+ ClientMM->registerEHFrames(Addr, LoadAddr, Size);
+ }
+
+ virtual void deregisterEHFrames(uint8_t *Addr,
+ uint64_t LoadAddr,
+ size_t Size) {
+ ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+ }
+
+ virtual bool finalizeMemory(std::string *ErrMsg = 0) {
+ return ClientMM->finalizeMemory(ErrMsg);
+ }
-// FIXME: This makes all kinds of horrible assumptions for the time being,
-// like only having one module, not needing to worry about multi-threading,
-// blah blah. Purely in get-it-up-and-limping mode for now.
+private:
+ MCJIT *ParentEngine;
+ OwningPtr<RTDyldMemoryManager> ClientMM;
+};
+
+// About Module states: added->loaded->finalized.
+//
+// The purpose of the "added" state is to have modules on standby (added=known
+// but not compiled). The idea is that you can add a module to provide function
+// definitions, but if nothing in that module is ever referenced from a module
+// in which code is executed, the added module never gets compiled. This is a
+// form of lazy compilation.
+//
+// The purpose of the "loaded" state (loaded=compiled and required sections
+// copied into local memory, but not yet ready for execution) is to provide an
+// intermediate point at which clients can remap section addresses with
+// MCJIT::mapSectionAddress (in preparation for later copying to a new location
+// or to an external process) before relocations and page permissions are applied.
+//
+// It might not be obvious at first glance, but the "remote-mcjit" case in the
+// lli tool does this. In that case, the intermediate action is taken by the
+// RemoteMemoryManager in response to the notifyObjectLoaded function being
+// called.
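
A sketch of the remap workflow this state exists for (illustration only, not part of the patch), assuming a module owned by the engine and client-chosen section addresses, and using the MCJIT entry points declared below:

#include "MCJIT.h"
#include <stdint.h>

// Sketch: JIT owns M; LocalAddr is the host-side address of a section reported
// via notifyObjectLoaded, and TargetAddr is where the client wants it to run
// (for example in a remote process). Both addresses are hypothetical.
static void loadRemapFinalize(llvm::MCJIT &JIT, llvm::Module *M,
                              const void *LocalAddr, uint64_t TargetAddr) {
  JIT.generateCodeForModule(M);                  // added -> loaded
  JIT.mapSectionAddress(LocalAddr, TargetAddr);  // remap before relocation
  JIT.finalizeModule(M);                         // apply relocations and page
                                                 // permissions: loaded -> finalized
}
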
class MCJIT : public ExecutionEngine {
MCJIT(Module *M, TargetMachine *tm, RTDyldMemoryManager *MemMgr,
bool AllocateGVsWithCode);
+ typedef llvm::SmallPtrSet<Module *, 4> ModulePtrSet;
+
+ class OwningModuleContainer {
+ public:
+ OwningModuleContainer() {
+ }
+ ~OwningModuleContainer() {
+ freeModulePtrSet(AddedModules);
+ freeModulePtrSet(LoadedModules);
+ freeModulePtrSet(FinalizedModules);
+ }
+
+ ModulePtrSet::iterator begin_added() { return AddedModules.begin(); }
+ ModulePtrSet::iterator end_added() { return AddedModules.end(); }
+
+ ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); }
+ ModulePtrSet::iterator end_loaded() { return LoadedModules.end(); }
+
+ ModulePtrSet::iterator begin_finalized() { return FinalizedModules.begin(); }
+ ModulePtrSet::iterator end_finalized() { return FinalizedModules.end(); }
+
+ void addModule(Module *M) {
+ AddedModules.insert(M);
+ }
+
+ bool removeModule(Module *M) {
+ return AddedModules.erase(M) || LoadedModules.erase(M) ||
+ FinalizedModules.erase(M);
+ }
+
+ bool hasModuleBeenAddedButNotLoaded(Module *M) {
+ return AddedModules.count(M) != 0;
+ }
+
+ bool hasModuleBeenLoaded(Module *M) {
+ // If the module is in either the "loaded" or "finalized" set, it
+ // has been loaded.
+ return (LoadedModules.count(M) != 0) || (FinalizedModules.count(M) != 0);
+ }
+
+ bool hasModuleBeenFinalized(Module *M) {
+ return FinalizedModules.count(M) != 0;
+ }
+
+ bool ownsModule(Module* M) {
+ return (AddedModules.count(M) != 0) || (LoadedModules.count(M) != 0) ||
+ (FinalizedModules.count(M) != 0);
+ }
+
+ void markModuleAsLoaded(Module *M) {
+ // This checks against logic errors in the MCJIT implementation.
+ // This function should never be called with either a Module that MCJIT
+ // does not own or a Module that has already been loaded and/or finalized.
+ assert(AddedModules.count(M) &&
+ "markModuleAsLoaded: Module not found in AddedModules");
+
+ // Remove the module from the "Added" set.
+ AddedModules.erase(M);
+
+ // Add the Module to the "Loaded" set.
+ LoadedModules.insert(M);
+ }
+
+ void markModuleAsFinalized(Module *M) {
+ // This checks against logic errors in the MCJIT implementation.
+ // This function should never be called with either a Module that MCJIT
+ // does not own, a Module that has not been loaded or a Module that has
+ // already been finalized.
+ assert(LoadedModules.count(M) &&
+ "markModuleAsFinalized: Module not found in LoadedModules");
+
+ // Remove the module from the "Loaded" set.
+ LoadedModules.erase(M);
+
+ // Add the Module to the "Finalized" set.
+ FinalizedModules.insert(M);
+ }
+
+ void markAllLoadedModulesAsFinalized() {
+ for (ModulePtrSet::iterator I = LoadedModules.begin(),
+ E = LoadedModules.end();
+ I != E; ++I) {
+ Module *M = *I;
+ FinalizedModules.insert(M);
+ }
+ LoadedModules.clear();
+ }
+
+ private:
+ ModulePtrSet AddedModules;
+ ModulePtrSet LoadedModules;
+ ModulePtrSet FinalizedModules;
+
+ void freeModulePtrSet(ModulePtrSet& MPS) {
+ // Go through the module set and delete everything.
+ for (ModulePtrSet::iterator I = MPS.begin(), E = MPS.end(); I != E; ++I) {
+ Module *M = *I;
+ delete M;
+ }
+ MPS.clear();
+ }
+ };
+
TargetMachine *TM;
MCContext *Ctx;
- RTDyldMemoryManager *MemMgr;
+ LinkingMemoryManager MemMgr;
RuntimeDyld Dyld;
SmallVector<JITEventListener*, 2> EventListeners;
- // FIXME: Add support for multiple modules
- bool IsLoaded;
- Module *M;
- OwningPtr<ObjectImage> LoadedObject;
+ OwningModuleContainer OwnedModules;
+
+ typedef DenseMap<Module *, ObjectImage *> LoadedObjectMap;
+ LoadedObjectMap LoadedObjects;
// An optional ObjectCache to be notified of compiled objects and used to
// perform lookup of pre-compiled code to avoid re-compilation.
ObjectCache *ObjCache;
+ Function *FindFunctionNamedInModulePtrSet(const char *FnName,
+ ModulePtrSet::iterator I,
+ ModulePtrSet::iterator E);
+
+ void runStaticConstructorsDestructorsInModulePtrSet(bool isDtors,
+ ModulePtrSet::iterator I,
+ ModulePtrSet::iterator E);
+
public:
~MCJIT();
/// @name ExecutionEngine interface implementation
/// @{
+ virtual void addModule(Module *M);
+ virtual bool removeModule(Module *M);
+
+ /// FindFunctionNamed - Search all of the active modules to find the one that
+ /// defines FnName. This is a very slow operation and shouldn't be used for
+ /// general code.
+ virtual Function *FindFunctionNamed(const char *FnName);
/// Sets the object manager that MCJIT should use to avoid compilation.
virtual void setObjectCache(ObjectCache *manager);
+ virtual void generateCodeForModule(Module *M);
+
/// finalizeObject - ensure the module is fully processed and is usable.
///
/// It is the user-level function for completing the process of making the
@@ -59,7 +246,17 @@ public:
/// object have been relocated using mapSectionAddress. When this method is
/// called the MCJIT execution engine will reapply relocations for a loaded
/// object.
+ /// It is OK to finalize a set of modules, add more modules and finalize again.
+ // FIXME: Do we really need both of these?
virtual void finalizeObject();
+ virtual void finalizeModule(Module *);
+ void finalizeLoadedModules();
+
+ /// runStaticConstructorsDestructors - This method is used to execute all of
+ /// the static constructors or destructors for a program.
+ ///
+ /// \param isDtors - Run the destructors instead of constructors.
+ void runStaticConstructorsDestructors(bool isDtors);
virtual void *getPointerToBasicBlock(BasicBlock *BB);
@@ -91,10 +288,15 @@ public:
uint64_t TargetAddress) {
Dyld.mapSectionAddress(LocalAddress, TargetAddress);
}
-
virtual void RegisterJITEventListener(JITEventListener *L);
virtual void UnregisterJITEventListener(JITEventListener *L);
+ // If successful, these functions will implicitly finalize all loaded objects.
+ // To get a function address within MCJIT without causing a finalize, use
+ // getSymbolAddress.
+ virtual uint64_t getGlobalValueAddress(const std::string &Name);
+ virtual uint64_t getFunctionAddress(const std::string &Name);
+
/// @}
/// @name (Private) Registration Interfaces
/// @{
@@ -111,6 +313,11 @@ public:
// @}
+ // This is not directly exposed via the ExecutionEngine API, but it is
+ // used by the LinkingMemoryManager.
+ uint64_t getSymbolAddress(const std::string &Name,
+ bool CheckFunctionsOnly);
+
protected:
/// emitObject -- Generate a JITed object in memory from the specified module
/// Currently, MCJIT only supports a single module and the module passed to
@@ -119,10 +326,12 @@ protected:
/// the future.
ObjectBufferStream* emitObject(Module *M);
- void loadObject(Module *M);
-
void NotifyObjectEmitted(const ObjectImage& Obj);
void NotifyFreeingObject(const ObjectImage& Obj);
+
+ uint64_t getExistingSymbolAddress(const std::string &Name);
+ Module *findModuleForSymbol(const std::string &Name,
+ bool CheckFunctionsOnly);
};
} // End llvm namespace
diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
index 650832e..cf90e77 100644
--- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
+++ b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
@@ -19,9 +19,10 @@
namespace llvm {
uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
- unsigned Alignment,
- unsigned SectionID,
- bool IsReadOnly) {
+ unsigned Alignment,
+ unsigned SectionID,
+ StringRef SectionName,
+ bool IsReadOnly) {
if (IsReadOnly)
return allocateSection(RODataMem, Size, Alignment);
return allocateSection(RWDataMem, Size, Alignment);
@@ -29,7 +30,8 @@ uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
uint8_t *SectionMemoryManager::allocateCodeSection(uintptr_t Size,
unsigned Alignment,
- unsigned SectionID) {
+ unsigned SectionID,
+ StringRef SectionName) {
return allocateSection(CodeMem, Size, Alignment);
}
@@ -105,6 +107,9 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
// FIXME: Should in-progress permissions be reverted if an error occurs?
error_code ec;
+ // Don't allow free memory blocks to be used after setting protection flags.
+ CodeMem.FreeMem.clear();
+
// Make code memory executable.
ec = applyMemoryGroupPermissions(CodeMem,
sys::Memory::MF_READ | sys::Memory::MF_EXEC);
@@ -115,6 +120,9 @@ bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
return true;
}
+ // Don't allow free memory blocks to be used after setting protection flags.
+ RODataMem.FreeMem.clear();
+
// Make read-only data memory read-only.
ec = applyMemoryGroupPermissions(RODataMem,
sys::Memory::MF_READ | sys::Memory::MF_EXEC);
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 38867ec..f11df82 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -20,7 +20,9 @@
#include "llvm/IR/Function.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/ExecutionEngine/OProfileWrapper.h"
+#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Errno.h"
@@ -52,6 +54,10 @@ public:
const JITEvent_EmittedFunctionDetails &Details);
virtual void NotifyFreeingMachineCode(void *OldPtr);
+
+ virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+
+ virtual void NotifyFreeingObject(const ObjectImage &Obj);
};
void OProfileJITEventListener::initialize() {
@@ -159,6 +165,66 @@ void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
}
}
+void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+ if (!Wrapper.isAgentAvailable()) {
+ return;
+ }
+
+ // Use symbol info to iterate functions in the object.
+ error_code ec;
+ for (object::symbol_iterator I = Obj.begin_symbols(),
+ E = Obj.end_symbols();
+ I != E && !ec;
+ I.increment(ec)) {
+ object::SymbolRef::Type SymType;
+ if (I->getType(SymType)) continue;
+ if (SymType == object::SymbolRef::ST_Function) {
+ StringRef Name;
+ uint64_t Addr;
+ uint64_t Size;
+ if (I->getName(Name)) continue;
+ if (I->getAddress(Addr)) continue;
+ if (I->getSize(Size)) continue;
+
+ if (Wrapper.op_write_native_code(Name.data(), Addr, (void*)Addr, Size)
+ == -1) {
+ DEBUG(dbgs() << "Failed to tell OProfile about native function "
+ << Name << " at ["
+ << (void*)Addr << "-" << ((char*)Addr + Size) << "]\n");
+ continue;
+ }
+ // TODO: support line number info (similar to IntelJITEventListener.cpp)
+ }
+ }
+}
+
+void OProfileJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+ if (!Wrapper.isAgentAvailable()) {
+ return;
+ }
+
+ // Use symbol info to iterate functions in the object.
+ error_code ec;
+ for (object::symbol_iterator I = Obj.begin_symbols(),
+ E = Obj.end_symbols();
+ I != E && !ec;
+ I.increment(ec)) {
+ object::SymbolRef::Type SymType;
+ if (I->getType(SymType)) continue;
+ if (SymType == object::SymbolRef::ST_Function) {
+ uint64_t Addr;
+ if (I->getAddress(Addr)) continue;
+
+ if (Wrapper.op_unload_native_code(Addr) == -1) {
+ DEBUG(dbgs()
+ << "Failed to tell OProfile about unload of native function at "
+ << (void*)Addr << "\n");
+ continue;
+ }
+ }
+ }
+}
+
} // anonymous namespace.
namespace llvm {
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
index 7c0d395..61d8dc2 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
@@ -13,22 +13,20 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ExecutionEngine/OProfileWrapper.h"
-
#define DEBUG_TYPE "oprofile-wrapper"
+#include "llvm/ExecutionEngine/OProfileWrapper.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/MutexGuard.h"
#include "llvm/ADT/SmallString.h"
-
-#include <sstream>
#include <cstring>
-#include <stddef.h>
#include <dirent.h>
-#include <sys/stat.h>
#include <fcntl.h>
+#include <sstream>
+#include <stddef.h>
+#include <sys/stat.h>
#include <unistd.h>
namespace {
@@ -143,6 +141,10 @@ bool OProfileWrapper::checkForOProfileProcEntry() {
close(CmdLineFD);
ssize_t Idx = 0;
+ if (ExeName[0] != '/') {
+ BaseName = ExeName;
+ }
+
// Find the terminator for the first string
while (Idx < NumRead-1 && ExeName[Idx] != 0) {
Idx++;
@@ -161,7 +163,8 @@ bool OProfileWrapper::checkForOProfileProcEntry() {
}
// Test this to see if it is the oprofile daemon
- if (BaseName != 0 && !strcmp("oprofiled", BaseName)) {
+ if (BaseName != 0 && (!strcmp("oprofiled", BaseName) ||
+ !strcmp("operf", BaseName))) {
// If it is, we're done
closedir(ProcDir);
return true;
diff --git a/lib/ExecutionEngine/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
index 4e76457..26e1fdd 100644
--- a/lib/ExecutionEngine/RTDyldMemoryManager.cpp
+++ b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
@@ -33,8 +33,8 @@ namespace llvm {
RTDyldMemoryManager::~RTDyldMemoryManager() {}
// Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
- !defined(__USING_SJLJ_EXCEPTIONS__))
+#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \
+ !defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__))
#define HAVE_EHTABLE_SUPPORT 1
#else
#define HAVE_EHTABLE_SUPPORT 0
@@ -42,35 +42,180 @@ RTDyldMemoryManager::~RTDyldMemoryManager() {}
#if HAVE_EHTABLE_SUPPORT
extern "C" void __register_frame(void*);
+extern "C" void __deregister_frame(void*);
+#else
+// The building compiler does not have __(de)register_frame but
+// it may be found at runtime in a dynamically-loaded library.
+// For example, this happens when building LLVM with Visual C++
+// but using the MinGW runtime.
+void __register_frame(void *p) {
+ static bool Searched = false;
+ static void *rf = 0;
+
+ if (!Searched) {
+ Searched = true;
+ rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+ "__register_frame");
+ }
+ if (rf)
+ ((void (*)(void *))rf)(p);
+}
+
+void __deregister_frame(void *p) {
+ static bool Searched = false;
+ static void *df = 0;
+
+ if (!Searched) {
+ Searched = true;
+ df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+ "__deregister_frame");
+ }
+ if (df)
+ ((void (*)(void *))df)(p);
+}
+#endif
+
+#ifdef __APPLE__
-static const char *processFDE(const char *Entry) {
+static const char *processFDE(const char *Entry, bool isDeregister) {
const char *P = Entry;
uint32_t Length = *((const uint32_t *)P);
P += 4;
uint32_t Offset = *((const uint32_t *)P);
- if (Offset != 0)
- __register_frame(const_cast<char *>(Entry));
+ if (Offset != 0) {
+ if (isDeregister)
+ __deregister_frame(const_cast<char *>(Entry));
+ else
+ __register_frame(const_cast<char *>(Entry));
+ }
return P + Length;
}
-#endif
-void RTDyldMemoryManager::registerEHFrames(StringRef SectionData) {
-#if HAVE_EHTABLE_SUPPORT
- const char *P = SectionData.data();
- const char *End = SectionData.data() + SectionData.size();
+// This implementation handles frame registration for local targets.
+// Memory managers for remote targets should re-implement this function
+// and use the LoadAddr parameter.
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+ uint64_t LoadAddr,
+ size_t Size) {
+ // On OS X, __register_frame takes a single FDE as an argument.
+ // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html
+ const char *P = (const char *)Addr;
+ const char *End = P + Size;
do {
- P = processFDE(P);
+ P = processFDE(P, false);
} while(P != End);
-#endif
}
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+ uint64_t LoadAddr,
+ size_t Size) {
+ const char *P = (const char *)Addr;
+ const char *End = P + Size;
+ do {
+ P = processFDE(P, true);
+ } while(P != End);
+}
+
+#else
+
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+ uint64_t LoadAddr,
+ size_t Size) {
+ // On Linux __register_frame takes a single argument:
+ // a pointer to the start of the .eh_frame section.
+
+ // How can it find the end? Because crtendS.o is linked
+ // in and it has an .eh_frame section that ends with four zero bytes.
+ __register_frame(Addr);
+}
+
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+ uint64_t LoadAddr,
+ size_t Size) {
+ __deregister_frame(Addr);
+}
+
+#endif
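
As the comment above the OS X variant notes, memory managers for remote targets are expected to override these hooks rather than register frames in the host process. A rough sketch of such an override built on SectionMemoryManager (illustration only; sendEHFrameToTarget is a hypothetical transport):

#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include <stdint.h>
#include <utility>
#include <vector>

// Sketch: instead of calling __register_frame locally, record the target-side
// (load address, size) pairs so they can be shipped to the remote process.
class RemoteEHFrameMemoryManager : public llvm::SectionMemoryManager {
public:
  virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
    // sendEHFrameToTarget(Addr, LoadAddr, Size);  // hypothetical transport
    PendingFrames.push_back(std::make_pair(LoadAddr, Size));
  }
  virtual void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
    // Ask the target to drop its registration; nothing to undo locally.
    (void)Addr; (void)LoadAddr; (void)Size;
  }
private:
  std::vector<std::pair<uint64_t, size_t> > PendingFrames;
};
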
+
static int jit_noop() {
return 0;
}
-void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
- bool AbortOnFailure) {
-#if defined(__linux__)
+// ARM math functions are statically linked on Android from libgcc.a, but not
+// available at runtime for dynamic linking. On Linux these are usually placed
+// in libgcc_s.so, so they can be found by normal dynamic lookup.
+#if defined(__BIONIC__) && defined(__arm__)
+// List of functions which are statically linked on Android and can be generated
+// by LLVM. This is done as a nested macro which is used once to declare the
+// imported functions with ARM_MATH_DECL and once to compare them to the
+// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test
+// assumes that all functions start with __aeabi_ and getSymbolAddress must be
+// modified if that changes.
+#define ARM_MATH_IMPORTS(PP) \
+ PP(__aeabi_d2f) \
+ PP(__aeabi_d2iz) \
+ PP(__aeabi_d2lz) \
+ PP(__aeabi_d2uiz) \
+ PP(__aeabi_d2ulz) \
+ PP(__aeabi_dadd) \
+ PP(__aeabi_dcmpeq) \
+ PP(__aeabi_dcmpge) \
+ PP(__aeabi_dcmpgt) \
+ PP(__aeabi_dcmple) \
+ PP(__aeabi_dcmplt) \
+ PP(__aeabi_dcmpun) \
+ PP(__aeabi_ddiv) \
+ PP(__aeabi_dmul) \
+ PP(__aeabi_dsub) \
+ PP(__aeabi_f2d) \
+ PP(__aeabi_f2iz) \
+ PP(__aeabi_f2lz) \
+ PP(__aeabi_f2uiz) \
+ PP(__aeabi_f2ulz) \
+ PP(__aeabi_fadd) \
+ PP(__aeabi_fcmpeq) \
+ PP(__aeabi_fcmpge) \
+ PP(__aeabi_fcmpgt) \
+ PP(__aeabi_fcmple) \
+ PP(__aeabi_fcmplt) \
+ PP(__aeabi_fcmpun) \
+ PP(__aeabi_fdiv) \
+ PP(__aeabi_fmul) \
+ PP(__aeabi_fsub) \
+ PP(__aeabi_i2d) \
+ PP(__aeabi_i2f) \
+ PP(__aeabi_idiv) \
+ PP(__aeabi_idivmod) \
+ PP(__aeabi_l2d) \
+ PP(__aeabi_l2f) \
+ PP(__aeabi_lasr) \
+ PP(__aeabi_ldivmod) \
+ PP(__aeabi_llsl) \
+ PP(__aeabi_llsr) \
+ PP(__aeabi_lmul) \
+ PP(__aeabi_ui2d) \
+ PP(__aeabi_ui2f) \
+ PP(__aeabi_uidiv) \
+ PP(__aeabi_uidivmod) \
+ PP(__aeabi_ul2d) \
+ PP(__aeabi_ul2f) \
+ PP(__aeabi_uldivmod)
+
+// Declare statically linked math functions on ARM. The function declarations
+// here do not have the correct prototypes for each function in
+// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are
+// needed. In particular the __aeabi_*divmod functions do not have calling
+// conventions which match any C prototype.
+#define ARM_MATH_DECL(name) extern "C" void name();
+ARM_MATH_IMPORTS(ARM_MATH_DECL)
+#undef ARM_MATH_DECL
+#endif
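
ARM_MATH_IMPORTS is an X-macro: one list of names, expanded once to declare the symbols and once to match the requested name. A tiny self-contained illustration of the technique (the EXAMPLE_* names are hypothetical; in the real list the symbols are provided by libgcc.a at link time):

#include <string.h>
#include <stdint.h>

// One list of names, used by several expansions.
#define EXAMPLE_IMPORTS(PP) \
  PP(example_add)           \
  PP(example_sub)

// Expansion 1: declare each listed symbol (prototypes don't matter, only the
// addresses do, exactly as with ARM_MATH_DECL).
#define EXAMPLE_DECL(name) extern "C" void name();
EXAMPLE_IMPORTS(EXAMPLE_DECL)
#undef EXAMPLE_DECL

// Expansion 2: compare a requested name against every listed symbol, as
// ARM_MATH_CHECK does inside getSymbolAddress.
static uint64_t lookupExample(const char *Name) {
#define EXAMPLE_CHECK(fn) if (!strcmp(Name, #fn)) return (uint64_t)&fn;
  EXAMPLE_IMPORTS(EXAMPLE_CHECK)
#undef EXAMPLE_CHECK
  return 0;
}
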
+
+uint64_t RTDyldMemoryManager::getSymbolAddress(const std::string &Name) {
+ // This implementation assumes that the host program is the target.
+ // Clients generating code for a remote target should implement their own
+ // memory manager.
+#if defined(__linux__) && defined(__GLIBC__)
//===--------------------------------------------------------------------===//
// Function stubs that are invoked instead of certain library calls
//
@@ -80,15 +225,26 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
// not inlined, and hiding their real definitions in a separate archive file
// that the dynamic linker can't see. For more info, search for
// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
- if (Name == "stat") return (void*)(intptr_t)&stat;
- if (Name == "fstat") return (void*)(intptr_t)&fstat;
- if (Name == "lstat") return (void*)(intptr_t)&lstat;
- if (Name == "stat64") return (void*)(intptr_t)&stat64;
- if (Name == "fstat64") return (void*)(intptr_t)&fstat64;
- if (Name == "lstat64") return (void*)(intptr_t)&lstat64;
- if (Name == "atexit") return (void*)(intptr_t)&atexit;
- if (Name == "mknod") return (void*)(intptr_t)&mknod;
-#endif // __linux__
+ if (Name == "stat") return (uint64_t)&stat;
+ if (Name == "fstat") return (uint64_t)&fstat;
+ if (Name == "lstat") return (uint64_t)&lstat;
+ if (Name == "stat64") return (uint64_t)&stat64;
+ if (Name == "fstat64") return (uint64_t)&fstat64;
+ if (Name == "lstat64") return (uint64_t)&lstat64;
+ if (Name == "atexit") return (uint64_t)&atexit;
+ if (Name == "mknod") return (uint64_t)&mknod;
+#endif // __linux__ && __GLIBC__
+
+ // See ARM_MATH_IMPORTS definition for explanation
+#if defined(__BIONIC__) && defined(__arm__)
+ if (Name.compare(0, 8, "__aeabi_") == 0) {
+ // Check if the user has requested any of the functions listed in
+ // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol.
+#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn;
+ ARM_MATH_IMPORTS(ARM_MATH_CHECK)
+#undef ARM_MATH_CHECK
+ }
+#endif
// We should not invoke parent's ctors/dtors from generated main()!
// On Mingw and Cygwin, the symbol __main is resolved to
@@ -96,23 +252,31 @@ void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
// (and register wrong callee's dtors with atexit(3)).
// We expect ExecutionEngine::runStaticConstructorsDestructors()
// is called before ExecutionEngine::runFunctionAsMain() is called.
- if (Name == "__main") return (void*)(intptr_t)&jit_noop;
+ if (Name == "__main") return (uint64_t)&jit_noop;
const char *NameStr = Name.c_str();
void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
- if (Ptr) return Ptr;
+ if (Ptr)
+ return (uint64_t)Ptr;
// If it wasn't found and if it starts with an underscore ('_') character,
// try again without the underscore.
if (NameStr[0] == '_') {
Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
- if (Ptr) return Ptr;
+ if (Ptr)
+ return (uint64_t)Ptr;
}
+ return 0;
+}
- if (AbortOnFailure)
+void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure) {
+ uint64_t Addr = getSymbolAddress(Name);
+
+ if (!Addr && AbortOnFailure)
report_fatal_error("Program used external function '" + Name +
"' which could not be resolved!");
- return 0;
+ return (void*)Addr;
}
} // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
index 69e9dbe..6a514ea 100644
--- a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
+++ b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
@@ -16,6 +16,7 @@ namespace llvm {
/// Global access point for the JIT debugging interface.
class JITRegistrar {
+ virtual void anchor();
public:
/// Instantiates the JIT service.
JITRegistrar() {}
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
index 89350cc..9cbde5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
@@ -23,6 +23,7 @@ namespace llvm {
class ObjectImageCommon : public ObjectImage {
ObjectImageCommon(); // = delete
ObjectImageCommon(const ObjectImageCommon &other); // = delete
+ virtual void anchor();
protected:
object::ObjectFile *ObjFile;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 943622f..161135a 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -13,12 +13,14 @@
#define DEBUG_TYPE "dyld"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "JITRegistrar.h"
#include "ObjectImageCommon.h"
#include "RuntimeDyldELF.h"
#include "RuntimeDyldImpl.h"
#include "RuntimeDyldMachO.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MutexGuard.h"
#include "llvm/Object/ELF.h"
using namespace llvm;
@@ -27,30 +29,44 @@ using namespace llvm::object;
// Empty out-of-line virtual destructor as the key function.
RuntimeDyldImpl::~RuntimeDyldImpl() {}
+// Pin the JITRegistrar's and ObjectImage*'s vtables to this file.
+void JITRegistrar::anchor() {}
+void ObjectImage::anchor() {}
+void ObjectImageCommon::anchor() {}
+
namespace llvm {
-StringRef RuntimeDyldImpl::getEHFrameSection() {
- return StringRef();
+void RuntimeDyldImpl::registerEHFrames() {
+}
+
+void RuntimeDyldImpl::deregisterEHFrames() {
}
// Resolve the relocations for all symbols we currently know about.
void RuntimeDyldImpl::resolveRelocations() {
+ MutexGuard locked(lock);
+
// First, resolve relocations associated with external symbols.
resolveExternalSymbols();
// Just iterate over the sections we have and resolve all the relocations
// in them. Gross overkill, but it gets the job done.
for (int i = 0, e = Sections.size(); i != e; ++i) {
+ // The Section here (Sections[i]) refers to the section in which the
+ // symbol for the relocation is located. The SectionID in the relocation
+ // entry provides the section to which the relocation will be applied.
uint64_t Addr = Sections[i].LoadAddress;
DEBUG(dbgs() << "Resolving relocations Section #" << i
<< "\t" << format("%p", (uint8_t *)Addr)
<< "\n");
resolveRelocationList(Relocations[i], Addr);
+ Relocations.erase(i);
}
}
void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
uint64_t TargetAddress) {
+ MutexGuard locked(lock);
for (unsigned i = 0, e = Sections.size(); i != e; ++i) {
if (Sections[i].Address == LocalAddress) {
reassignSectionAddress(i, TargetAddress);
@@ -67,11 +83,15 @@ ObjectImage *RuntimeDyldImpl::createObjectImage(ObjectBuffer *InputBuffer) {
}
ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
+ MutexGuard locked(lock);
+
OwningPtr<ObjectImage> obj(createObjectImage(InputBuffer));
if (!obj)
report_fatal_error("Unable to create object image from memory buffer!");
+ // Save information about our target
Arch = (Triple::ArchType)obj->getArch();
+ IsTargetLittleEndian = obj->getObjectFile()->isLittleEndian();
// Symbols found in this object
StringMap<SymbolLoc> LocalSymbols;
@@ -166,6 +186,9 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
}
}
+ // Give the subclasses a chance to tie up any loose ends.
+ finalizeLoad(LocalSections);
+
return obj.take();
}
@@ -175,8 +198,8 @@ void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
SymbolTableMap &SymbolTable) {
// Allocate memory for the section
unsigned SectionID = Sections.size();
- uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*),
- SectionID, false);
+ uint8_t *Addr = MemMgr->allocateDataSection(
+ TotalSize, sizeof(void*), SectionID, StringRef(), false);
if (!Addr)
report_fatal_error("Unable to allocate memory for common symbols!");
uint64_t Offset = 0;
@@ -247,6 +270,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
bool IsZeroInit;
bool IsReadOnly;
uint64_t DataSize;
+ unsigned PaddingSize = 0;
StringRef Name;
Check(Section.isRequiredForExecution(IsRequired));
Check(Section.isVirtual(IsVirtual));
@@ -261,6 +285,12 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
StubBufSize += StubAlignment - EndAlignment;
}
+ // The .eh_frame section (at least on Linux) needs an extra four bytes of
+ // zero padding at the end. For MachO objects, this section has a
+ // slightly different name, so this won't have any effect for MachO objects.
+ if (Name == ".eh_frame")
+ PaddingSize = 4;
+
unsigned Allocate;
unsigned SectionID = Sections.size();
uint8_t *Addr;
@@ -269,10 +299,11 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
// Some sections, such as debug info, don't need to be loaded for execution.
// Leave those where they are.
if (IsRequired) {
- Allocate = DataSize + StubBufSize;
+ Allocate = DataSize + PaddingSize + StubBufSize;
Addr = IsCode
- ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID)
- : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, IsReadOnly);
+ ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID, Name)
+ : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, Name,
+ IsReadOnly);
if (!Addr)
report_fatal_error("Unable to allocate section memory!");
@@ -286,6 +317,13 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
else
memcpy(Addr, pData, DataSize);
+ // Fill in any extra bytes we allocated for padding
+ if (PaddingSize != 0) {
+ memset(Addr + DataSize, 0, PaddingSize);
+ // Update the DataSize variable so that the stub offset is set correctly.
+ DataSize += PaddingSize;
+ }
+
DEBUG(dbgs() << "emitSection SectionID: " << SectionID
<< " Name: " << Name
<< " obj addr: " << format("%p", pData)
@@ -421,6 +459,10 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
writeInt16BE(Addr+6, 0x07F1); // brc 15,%r1
// 8-byte address stored at Addr + 8
return Addr;
+ } else if (Arch == Triple::x86_64) {
+ *Addr = 0xFF; // jmp
+ *(Addr+1) = 0x25; // rip
+ // 32-bit PC-relative address of the GOT entry will be stored at Addr+2
}
return Addr;
}
@@ -454,30 +496,52 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
}
void RuntimeDyldImpl::resolveExternalSymbols() {
- StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(),
- e = ExternalSymbolRelocations.end();
- for (; i != e; i++) {
+ while(!ExternalSymbolRelocations.empty()) {
+ StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin();
+
StringRef Name = i->first();
- RelocationList &Relocs = i->second;
- SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
- if (Loc == GlobalSymbolTable.end()) {
- if (Name.size() == 0) {
- // This is an absolute symbol, use an address of zero.
- DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
- resolveRelocationList(Relocs, 0);
+ if (Name.size() == 0) {
+ // This is an absolute symbol, use an address of zero.
+ DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
+ RelocationList &Relocs = i->second;
+ resolveRelocationList(Relocs, 0);
+ } else {
+ uint64_t Addr = 0;
+ SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
+ if (Loc == GlobalSymbolTable.end()) {
+ // This is an external symbol, try to get its address from
+ // MemoryManager.
+ Addr = MemMgr->getSymbolAddress(Name.data());
+ // The call to getSymbolAddress may have caused additional modules to
+ // be loaded, which may have added new entries to the
+ // ExternalSymbolRelocations map. Consequently, we need to update our
+ // iterator. This is also why retrieval of the relocation list
+ // associated with this symbol is deferred until below this point.
+ // New entries may have been added to the relocation list.
+ i = ExternalSymbolRelocations.find(Name);
} else {
- // This is an external symbol, try to get its address from
- // MemoryManager.
- uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(),
- true);
- DEBUG(dbgs() << "Resolving relocations Name: " << Name
- << "\t" << format("%p", Addr)
- << "\n");
- resolveRelocationList(Relocs, (uintptr_t)Addr);
+ // We found the symbol in our global table. It was probably in a
+ // Module that we loaded previously.
+ SymbolLoc SymLoc = Loc->second;
+ Addr = getSectionLoadAddress(SymLoc.first) + SymLoc.second;
}
- } else {
- report_fatal_error("Expected external symbol");
+
+ // FIXME: Implement error handling that doesn't kill the host program!
+ if (!Addr)
+ report_fatal_error("Program used external function '" + Name +
+ "' which could not be resolved!");
+
+ updateGOTEntries(Name, Addr);
+ DEBUG(dbgs() << "Resolving relocations Name: " << Name
+ << "\t" << format("0x%lx", Addr)
+ << "\n");
+ // This list may have been updated when we called getSymbolAddress, so
+ // don't change this code to get the list earlier.
+ RelocationList &Relocs = i->second;
+ resolveRelocationList(Relocs, Addr);
}
+
+ ExternalSymbolRelocations.erase(i);
}
}
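
The shape of this loop is worth spelling out: the call into the memory manager can load further modules and grow (or, for the StringMap, rehash and thus invalidate) the relocation table, so the iterator and the relocation list are both re-fetched after the call. A stand-alone sketch of the same discipline with a plain std::map and a hypothetical resolve():

#include <map>
#include <string>
#include <vector>
#include <stdint.h>

typedef std::map<std::string, std::vector<uint64_t> > WorkMap;

// Hypothetical stand-in for MemMgr->getSymbolAddress(): resolving one name may
// discover new work and insert additional entries into the map.
static uint64_t resolve(const std::string &Name, WorkMap &Work) {
  if (Name == "uses_helper")
    Work["helper"];                       // a new entry appears as a side effect
  return 0x1000;
}

static void drain(WorkMap &Work) {
  while (!Work.empty()) {
    WorkMap::iterator I = Work.begin();
    std::string Name = I->first;

    uint64_t Addr = resolve(Name, Work);  // may grow Work; with a StringMap it
    I = Work.find(Name);                  // could also invalidate I, so re-find
                                          // before touching the entry's list
    std::vector<uint64_t> &Items = I->second;
    for (size_t i = 0; i != Items.size(); ++i)
      Items[i] = Addr;                    // stand-in for resolveRelocationList
    Work.erase(I);
  }
}
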
@@ -526,8 +590,10 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
case sys::fs::file_magic::bitcode:
case sys::fs::file_magic::archive:
case sys::fs::file_magic::coff_object:
+ case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable:
case sys::fs::file_magic::macho_universal_binary:
+ case sys::fs::file_magic::windows_resource:
report_fatal_error("Incompatible object format!");
}
} else {
@@ -539,10 +605,14 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
}
void *RuntimeDyld::getSymbolAddress(StringRef Name) {
+ if (!Dyld)
+ return NULL;
return Dyld->getSymbolAddress(Name);
}
uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+ if (!Dyld)
+ return 0;
return Dyld->getSymbolLoadAddress(Name);
}
@@ -564,8 +634,14 @@ StringRef RuntimeDyld::getErrorString() {
return Dyld->getErrorString();
}
-StringRef RuntimeDyld::getEHFrameSection() {
- return Dyld->getEHFrameSection();
+void RuntimeDyld::registerEHFrames() {
+ if (Dyld)
+ Dyld->registerEHFrames();
+}
+
+void RuntimeDyld::deregisterEHFrames() {
+ if (Dyld)
+ Dyld->deregisterEHFrames();
}
} // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index cd99c3c..f2c69fc 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -22,7 +22,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/ELF.h"
using namespace llvm;
@@ -151,12 +151,31 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef,
namespace llvm {
-StringRef RuntimeDyldELF::getEHFrameSection() {
- for (int i = 0, e = Sections.size(); i != e; ++i) {
- if (Sections[i].Name == ".eh_frame")
- return StringRef((const char*)Sections[i].Address, Sections[i].Size);
+void RuntimeDyldELF::registerEHFrames() {
+ if (!MemMgr)
+ return;
+ for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+ SID EHFrameSID = UnregisteredEHFrameSections[i];
+ uint8_t *EHFrameAddr = Sections[EHFrameSID].Address;
+ uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress;
+ size_t EHFrameSize = Sections[EHFrameSID].Size;
+ MemMgr->registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
+ RegisteredEHFrameSections.push_back(EHFrameSID);
}
- return StringRef();
+ UnregisteredEHFrameSections.clear();
+}
+
+void RuntimeDyldELF::deregisterEHFrames() {
+ if (!MemMgr)
+ return;
+ for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
+ SID EHFrameSID = RegisteredEHFrameSections[i];
+ uint8_t *EHFrameAddr = Sections[EHFrameSID].Address;
+ uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress;
+ size_t EHFrameSize = Sections[EHFrameSID].Size;
+ MemMgr->deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
+ }
+ RegisteredEHFrameSections.clear();
}
ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
@@ -202,7 +221,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
uint64_t Offset,
uint64_t Value,
uint32_t Type,
- int64_t Addend) {
+ int64_t Addend,
+ uint64_t SymOffset) {
switch (Type) {
default:
llvm_unreachable("Relocation type not implemented yet!");
@@ -227,6 +247,21 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
<< " at " << format("%p\n",Target));
break;
}
+ case ELF::R_X86_64_GOTPCREL: {
+ // findGOTEntry returns the 'G + GOT' part of the relocation calculation
+ // based on the load/target address of the GOT (not the current/local addr).
+ uint64_t GOTAddr = findGOTEntry(Value, SymOffset);
+ uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+ uint64_t FinalAddress = Section.LoadAddress + Offset;
+ // The processRelocationRef method combines the symbol offset and the addend
+ // and in most cases that's what we want. For this relocation type, we need
+ // the raw addend, so we subtract the symbol offset to get it.
+ int64_t RealOffset = GOTAddr + Addend - SymOffset - FinalAddress;
+ assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
+ int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
+ *Target = TruncOffset;
+ break;
+ }
case ELF::R_X86_64_PC32: {
// Get the placeholder value from the generated object since
// a previous relocation attempt may have overwritten the loaded version
@@ -240,6 +275,16 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
*Target = TruncOffset;
break;
}
+ case ELF::R_X86_64_PC64: {
+ // Get the placeholder value from the generated object since
+ // a previous relocation attempt may have overwritten the loaded version
+ uint64_t *Placeholder = reinterpret_cast<uint64_t*>(Section.ObjAddress
+ + Offset);
+ uint64_t *Target = reinterpret_cast<uint64_t*>(Section.Address + Offset);
+ uint64_t FinalAddress = Section.LoadAddress + Offset;
+ *Target = *Placeholder + Value + Addend - FinalAddress;
+ break;
+ }
}
}
@@ -304,7 +349,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
}
case ELF::R_AARCH64_PREL32: {
uint64_t Result = Value + Addend - FinalAddress;
- assert(static_cast<int64_t>(Result) >= INT32_MIN &&
+ assert(static_cast<int64_t>(Result) >= INT32_MIN &&
static_cast<int64_t>(Result) <= UINT32_MAX);
*TargetPtr = static_cast<uint32_t>(Result & 0xffffffffU);
break;
@@ -316,7 +361,7 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
uint64_t BranchImm = Value + Addend - FinalAddress;
// "Check that -2^27 <= result < 2^27".
- assert(-(1LL << 27) <= static_cast<int64_t>(BranchImm) &&
+ assert(-(1LL << 27) <= static_cast<int64_t>(BranchImm) &&
static_cast<int64_t>(BranchImm) < (1LL << 27));
// AArch64 code is emitted with .rela relocations. The data already in any
@@ -341,7 +386,6 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
case ELF::R_AARCH64_MOVW_UABS_G2_NC: {
uint64_t Result = Value + Addend;
-
// AArch64 code is emitted with .rela relocations. The data already in any
// bits affected by the relocation on entry is garbage.
*TargetPtr &= 0xffe0001fU;
@@ -585,7 +629,7 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
// Finally compares the Symbol value and the target symbol offset
// to check if this .opd entry refers to the symbol the relocation
// points to.
- if (Rel.Addend != (intptr_t)TargetSymbolOffset)
+ if (Rel.Addend != (int64_t)TargetSymbolOffset)
continue;
section_iterator tsi(Obj.end_sections());
@@ -735,20 +779,42 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
}
}
+// The target location for the relocation is described by RE.SectionID and
+// RE.Offset. RE.SectionID can be used to find the SectionEntry. Each
+// SectionEntry has three members describing its location.
+// SectionEntry::Address is the address at which the section has been loaded
+// into memory in the current (host) process. SectionEntry::LoadAddress is the
+// address that the section will have in the target process.
+// SectionEntry::ObjAddress is the address of the bits for this section in the
+// original emitted object image (also in the current address space).
+//
+// Relocations will be applied as if the section were loaded at
+// SectionEntry::LoadAddress, but they will be applied at an address based
+// on SectionEntry::Address. SectionEntry::ObjAddress will be used to refer to
+// Target memory contents if they are required for value calculations.
+//
+// The Value parameter here is the load address of the symbol for the
+// relocation to be applied. For relocations which refer to symbols in the
+// current object Value will be the LoadAddress of the section in which
+// the symbol resides (RE.Addend provides additional information about the
+// symbol location). For external symbols, Value will be the address of the
+// symbol in the target address space.
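
A worked sketch of that model with made-up numbers, for a simple PC-relative fixup such as R_X86_64_PC32: the bytes are patched through Section.Address (host memory), but the displacement is computed against Section.LoadAddress, so it is correct once the section runs at its target address.

#include <stdint.h>
#include <string.h>

// All addresses below are hypothetical.
static void workedPC32Example() {
  uint8_t SectionBytes[64] = {0};
  uint8_t *HostAddr = SectionBytes;          // SectionEntry::Address
  uint64_t TargetLoadAddr = 0x40001000ULL;   // SectionEntry::LoadAddress
  uint64_t Offset = 0x10;                    // RelocationEntry::Offset
  uint64_t Value = 0x40002000ULL;            // load address of the symbol
  int64_t Addend = -4;                       // RelocationEntry::Addend

  uint64_t FinalAddress = TargetLoadAddr + Offset;          // 0x40001010
  int32_t Disp = (int32_t)(Value + Addend - FinalAddress);  // 0xFEC
  memcpy(HostAddr + Offset, &Disp, sizeof(Disp));           // patch the host copy
}
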
void RuntimeDyldELF::resolveRelocation(const RelocationEntry &RE,
- uint64_t Value) {
+ uint64_t Value) {
const SectionEntry &Section = Sections[RE.SectionID];
- return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend);
+ return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend,
+ RE.SymOffset);
}
void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
uint64_t Offset,
uint64_t Value,
uint32_t Type,
- int64_t Addend) {
+ int64_t Addend,
+ uint64_t SymOffset) {
switch (Arch) {
case Triple::x86_64:
- resolveX86_64Relocation(Section, Offset, Value, Type, Addend);
+ resolveX86_64Relocation(Section, Offset, Value, Type, Addend, SymOffset);
break;
case Triple::x86:
resolveX86Relocation(Section, Offset,
@@ -811,6 +877,7 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
}
if (lsi != Symbols.end()) {
Value.SectionID = lsi->second.first;
+ Value.Offset = lsi->second.second;
Value.Addend = lsi->second.second + Addend;
} else {
// Search for the symbol in the global symbol table
@@ -819,6 +886,7 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
gsi = GlobalSymbolTable.find(TargetName.data());
if (gsi != GlobalSymbolTable.end()) {
Value.SectionID = gsi->second.first;
+ Value.Offset = gsi->second.second;
Value.Addend = gsi->second.second + Addend;
} else {
switch (SymType) {
@@ -841,9 +909,17 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
Value.Addend = Addend;
break;
}
+ case SymbolRef::ST_Data:
case SymbolRef::ST_Unknown: {
Value.SymbolName = TargetName.data();
Value.Addend = Addend;
+
+ // Absolute relocations will have a zero symbol ID (STN_UNDEF), which
+ // will manifest here as a NULL symbol name.
+ // We can set this as a valid (but empty) symbol name, and rely
+ // on addRelocationForSymbol to handle this.
+ if (!Value.SymbolName)
+ Value.SymbolName = "";
break;
}
default:
@@ -955,8 +1031,8 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
// Look up for existing stub.
StubMap::const_iterator i = Stubs.find(Value);
if (i != Stubs.end()) {
- resolveRelocation(Section, Offset,
- (uint64_t)Section.Address + i->second, RelType, 0);
+ RelocationEntry RE(SectionID, Offset, RelType, i->second);
+ addRelocationForSection(RE, SectionID);
DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
@@ -981,9 +1057,8 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
addRelocationForSection(RELo, Value.SectionID);
}
- resolveRelocation(Section, Offset,
- (uint64_t)Section.Address + Section.StubOffset,
- RelType, 0);
+ RelocationEntry RE(SectionID, Offset, RelType, Section.StubOffset);
+ addRelocationForSection(RE, SectionID);
Section.StubOffset += getMaxStubSize();
}
} else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
@@ -1069,7 +1144,10 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
// Extra check to avoid relocation against empty symbols (usually
// the R_PPC64_TOC).
- if (Value.SymbolName && !TargetName.empty())
+ if (SymType != SymbolRef::ST_Unknown && TargetName.empty())
+ Value.SymbolName = NULL;
+
+ if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
else
addRelocationForSection(RE, Value.SectionID);
@@ -1121,8 +1199,67 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
ELF::R_390_PC32DBL, Addend);
else
resolveRelocation(Section, Offset, StubAddress, RelType, Addend);
+ } else if (Arch == Triple::x86_64 && RelType == ELF::R_X86_64_PLT32) {
+ // The way the PLT relocations normally work is that the linker allocates the
+ // PLT and this relocation makes a PC-relative call into the PLT. The PLT
+ // entry will then jump to an address provided by the GOT. On first call, the
+ // GOT address will point back into PLT code that resolves the symbol. After
+ // the first call, the GOT entry points to the actual function.
+ //
+ // For local functions we're ignoring all of that here and just replacing
+ // the PLT32 relocation type with PC32, which will translate the relocation
+ // into a PC-relative call directly to the function. For external symbols we
+ // can't be sure the function will be within 2^32 bytes of the call site, so
+ // we need to create a stub, which calls into the GOT. This case is
+ // equivalent to the usual PLT implementation except that we use the stub
+ // mechanism in RuntimeDyld (which puts stubs at the end of the section)
+ // rather than allocating a PLT section.
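
To make the stub mechanics concrete (editorial sketch with an assumed displacement): the stub emitted by createStubFunction for x86_64 is the two-byte FF 25 opcode, jmp *disp32(%rip), and the GOTPCREL relocation added at StubOffset + 2 with addend -4 makes that disp32 land exactly on the GOT slot.

#include <stdint.h>
#include <string.h>

// Sketch: encode the 6-byte x86_64 stub and patch in a hypothetical 32-bit
// displacement to the GOT slot holding the callee's address.
static void encodeX86_64Stub(uint8_t *StubAddr, int32_t DispToGOTSlot) {
  StubAddr[0] = 0xFF;                      // jmp
  StubAddr[1] = 0x25;                      // RIP-relative indirect form
  memcpy(StubAddr + 2, &DispToGOTSlot, 4); // disp32, relative to StubAddr + 6
  // At run time the CPU loads the target address from
  // *(StubAddr + 6 + DispToGOTSlot) -- i.e. from the GOT entry -- and jumps
  // there, which is what the GOTPCREL fixup with addend -4 arranges.
}
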
+ if (Value.SymbolName) {
+ // This is a call to an external function.
+ // Look for an existing stub.
+ SectionEntry &Section = Sections[SectionID];
+ StubMap::const_iterator i = Stubs.find(Value);
+ uintptr_t StubAddress;
+ if (i != Stubs.end()) {
+ StubAddress = uintptr_t(Section.Address) + i->second;
+ DEBUG(dbgs() << " Stub function found\n");
+ } else {
+ // Create a new stub function (equivalent to a PLT entry).
+ DEBUG(dbgs() << " Create a new stub function\n");
+
+ uintptr_t BaseAddress = uintptr_t(Section.Address);
+ uintptr_t StubAlignment = getStubAlignment();
+ StubAddress = (BaseAddress + Section.StubOffset +
+ StubAlignment - 1) & -StubAlignment;
+ unsigned StubOffset = StubAddress - BaseAddress;
+ Stubs[Value] = StubOffset;
+ createStubFunction((uint8_t *)StubAddress);
+
+ // Create a GOT entry for the external function.
+ GOTEntries.push_back(Value);
+
+ // Make our stub function a relative call to the GOT entry.
+ RelocationEntry RE(SectionID, StubOffset + 2,
+ ELF::R_X86_64_GOTPCREL, -4);
+ addRelocationForSymbol(RE, Value.SymbolName);
+
+ // Bump our stub offset counter
+ Section.StubOffset = StubOffset + getMaxStubSize();
+ }
+
+ // Make the target call a call into the stub table.
+ resolveRelocation(Section, Offset, StubAddress,
+ ELF::R_X86_64_PC32, Addend);
+ } else {
+ RelocationEntry RE(SectionID, Offset, ELF::R_X86_64_PC32, Value.Addend,
+ Value.Offset);
+ addRelocationForSection(RE, Value.SectionID);
+ }
} else {
- RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
+ if (Arch == Triple::x86_64 && RelType == ELF::R_X86_64_GOTPCREL) {
+ GOTEntries.push_back(Value);
+ }
+ RelocationEntry RE(SectionID, Offset, RelType, Value.Addend, Value.Offset);
if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
else
@@ -1130,6 +1267,137 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
}
}
+void RuntimeDyldELF::updateGOTEntries(StringRef Name, uint64_t Addr) {
+
+ SmallVectorImpl<std::pair<SID, GOTRelocations> >::iterator it;
+ SmallVectorImpl<std::pair<SID, GOTRelocations> >::iterator end = GOTs.end();
+
+ for (it = GOTs.begin(); it != end; ++it) {
+ GOTRelocations &GOTEntries = it->second;
+ for (int i = 0, e = GOTEntries.size(); i != e; ++i) {
+ if (GOTEntries[i].SymbolName != 0 && GOTEntries[i].SymbolName == Name) {
+ GOTEntries[i].Offset = Addr;
+ }
+ }
+ }
+}
+
+size_t RuntimeDyldELF::getGOTEntrySize() {
+ // We don't use the GOT in all of these cases, but it's essentially free
+ // to put them all here.
+ size_t Result = 0;
+ switch (Arch) {
+ case Triple::x86_64:
+ case Triple::aarch64:
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ case Triple::systemz:
+ Result = sizeof(uint64_t);
+ break;
+ case Triple::x86:
+ case Triple::arm:
+ case Triple::thumb:
+ case Triple::mips:
+ case Triple::mipsel:
+ Result = sizeof(uint32_t);
+ break;
+ default: llvm_unreachable("Unsupported CPU type!");
+ }
+ return Result;
+}
+
+uint64_t RuntimeDyldELF::findGOTEntry(uint64_t LoadAddress,
+ uint64_t Offset) {
+
+ const size_t GOTEntrySize = getGOTEntrySize();
+
+ SmallVectorImpl<std::pair<SID, GOTRelocations> >::const_iterator it;
+ SmallVectorImpl<std::pair<SID, GOTRelocations> >::const_iterator end = GOTs.end();
+
+ int GOTIndex = -1;
+ for (it = GOTs.begin(); it != end; ++it) {
+ SID GOTSectionID = it->first;
+ const GOTRelocations &GOTEntries = it->second;
+
+ // Find the matching entry in our vector.
+ uint64_t SymbolOffset = 0;
+ for (int i = 0, e = GOTEntries.size(); i != e; ++i) {
+ if (GOTEntries[i].SymbolName == 0) {
+ if (getSectionLoadAddress(GOTEntries[i].SectionID) == LoadAddress &&
+ GOTEntries[i].Offset == Offset) {
+ GOTIndex = i;
+ SymbolOffset = GOTEntries[i].Offset;
+ break;
+ }
+ } else {
+ // GOT entries for external symbols use the addend as the address when
+ // the external symbol has been resolved.
+ if (GOTEntries[i].Offset == LoadAddress) {
+ GOTIndex = i;
+ // Don't use the Addend here. The relocation handler will use it.
+ break;
+ }
+ }
+ }
+
+ if (GOTIndex != -1) {
+ if (GOTEntrySize == sizeof(uint64_t)) {
+ uint64_t *LocalGOTAddr = (uint64_t*)getSectionAddress(GOTSectionID);
+ // Fill in this entry with the address of the symbol being referenced.
+ LocalGOTAddr[GOTIndex] = LoadAddress + SymbolOffset;
+ } else {
+ uint32_t *LocalGOTAddr = (uint32_t*)getSectionAddress(GOTSectionID);
+ // Fill in this entry with the address of the symbol being referenced.
+ LocalGOTAddr[GOTIndex] = (uint32_t)(LoadAddress + SymbolOffset);
+ }
+
+ // Calculate the load address of this entry
+ return getSectionLoadAddress(GOTSectionID) + (GOTIndex * GOTEntrySize);
+ }
+ }
+
+ assert(GOTIndex != -1 && "Unable to find requested GOT entry.");
+ return 0;
+}
+
+void RuntimeDyldELF::finalizeLoad(ObjSectionToIDMap &SectionMap) {
+ // If necessary, allocate the global offset table
+ if (MemMgr) {
+ // Allocate the GOT if necessary
+ size_t numGOTEntries = GOTEntries.size();
+ if (numGOTEntries != 0) {
+ // Allocate memory for the section
+ unsigned SectionID = Sections.size();
+ size_t TotalSize = numGOTEntries * getGOTEntrySize();
+ uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, getGOTEntrySize(),
+ SectionID, ".got", false);
+ if (!Addr)
+ report_fatal_error("Unable to allocate memory for GOT!");
+
+ GOTs.push_back(std::make_pair(SectionID, GOTEntries));
+ Sections.push_back(SectionEntry(".got", Addr, TotalSize, 0));
+ // For now, initialize all GOT entries to zero. We'll fill them in as
+ // needed when GOT-based relocations are applied.
+ memset(Addr, 0, TotalSize);
+ }
+ }
+ else {
+ report_fatal_error("Unable to allocate memory for GOT!");
+ }
+
+ // Look for and record the EH frame section.
+ ObjSectionToIDMap::iterator i, e;
+ for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+ const SectionRef &Section = i->first;
+ StringRef Name;
+ Section.getName(Name);
+ if (Name == ".eh_frame") {
+ UnregisteredEHFrameSections.push_back(i->second);
+ break;
+ }
+ }
+}
+
bool RuntimeDyldELF::isCompatibleFormat(const ObjectBuffer *Buffer) const {
if (Buffer->getBufferSize() < strlen(ELF::ElfMagic))
return false;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 794c7ec..3adf827 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -15,6 +15,7 @@
#define LLVM_RUNTIME_DYLD_ELF_H
#include "RuntimeDyldImpl.h"
+#include "llvm/ADT/DenseMap.h"
using namespace llvm;
@@ -35,13 +36,15 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
uint64_t Offset,
uint64_t Value,
uint32_t Type,
- int64_t Addend);
+ int64_t Addend,
+ uint64_t SymOffset=0);
void resolveX86_64Relocation(const SectionEntry &Section,
uint64_t Offset,
uint64_t Value,
uint32_t Type,
- int64_t Addend);
+ int64_t Addend,
+ uint64_t SymOffset);
void resolveX86Relocation(const SectionEntry &Section,
uint64_t Offset,
@@ -79,13 +82,55 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
uint32_t Type,
int64_t Addend);
+ unsigned getMaxStubSize() {
+ if (Arch == Triple::aarch64)
+ return 20; // movz; movk; movk; movk; br
+ if (Arch == Triple::arm || Arch == Triple::thumb)
+ return 8; // 32-bit instruction and 32-bit address
+ else if (Arch == Triple::mipsel || Arch == Triple::mips)
+ return 16;
+ else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le)
+ return 44;
+ else if (Arch == Triple::x86_64)
+ return 6; // 2-byte jmp instruction + 32-bit relative address
+ else if (Arch == Triple::systemz)
+ return 16;
+ else
+ return 0;
+ }
+
+ unsigned getStubAlignment() {
+ if (Arch == Triple::systemz)
+ return 8;
+ else
+ return 1;
+ }
+
uint64_t findPPC64TOC() const;
void findOPDEntrySection(ObjectImage &Obj,
ObjSectionToIDMap &LocalSections,
RelocationValueRef &Rel);
+ uint64_t findGOTEntry(uint64_t LoadAddr, uint64_t Offset);
+ size_t getGOTEntrySize();
+
+ virtual void updateGOTEntries(StringRef Name, uint64_t Addr);
+
+  // Relocation entries for symbols whose position-independent offset is
+ // updated in a global offset table.
+ typedef SmallVector<RelocationValueRef, 2> GOTRelocations;
+ GOTRelocations GOTEntries; // List of entries requiring finalization.
+ SmallVector<std::pair<SID, GOTRelocations>, 8> GOTs; // Allocated tables.
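+  // Rough life cycle (a sketch of what the code in RuntimeDyldELF.cpp does):
+  // GOT-based relocations accumulate RelocationValueRef records in GOTEntries
+  // while an object is processed; finalizeLoad() then allocates a ".got" data
+  // section of numGOTEntries * getGOTEntrySize() bytes and remembers the
+  // (SectionID, entries) pairing in GOTs; findGOTEntry() later fills in the
+  // matching slot and returns its load address for the relocation to use.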
+
+ // When a module is loaded we save the SectionID of the EH frame section
+ // in a table until we receive a request to register all unregistered
+ // EH frame sections with the memory manager.
+ SmallVector<SID, 2> UnregisteredEHFrameSections;
+ SmallVector<SID, 2> RegisteredEHFrameSections;
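+  // Expected flow, roughly: finalizeLoad() records the .eh_frame SectionID in
+  // UnregisteredEHFrameSections after an object is loaded; registerEHFrames()
+  // later hands each recorded frame to the memory manager and moves its ID to
+  // RegisteredEHFrameSections so that deregisterEHFrames() can undo the
+  // registration.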
+
public:
- RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
+ RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm)
+ {}
virtual void resolveRelocation(const RelocationEntry &RE, uint64_t Value);
virtual void processRelocationRef(unsigned SectionID,
@@ -96,7 +141,9 @@ public:
StubMap &Stubs);
virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
- virtual StringRef getEHFrameSection();
+ virtual void registerEHFrames();
+ virtual void deregisterEHFrames();
+ virtual void finalizeLoad(ObjSectionToIDMap &SectionMap);
virtual ~RuntimeDyldELF();
};
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 14d945b..3014b30 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -25,6 +25,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/Mutex.h"
#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
@@ -80,14 +81,18 @@ public:
unsigned SectionID;
/// Offset - offset into the section.
- uintptr_t Offset;
+ uint64_t Offset;
/// RelType - relocation type.
uint32_t RelType;
/// Addend - the relocation addend encoded in the instruction itself. Also
/// used to make a relocation section relative instead of symbol relative.
- intptr_t Addend;
+ int64_t Addend;
+
+ /// SymOffset - Section offset of the relocation entry's symbol (used for GOT
+ /// lookup).
+ uint64_t SymOffset;
/// True if this is a PCRel relocation (MachO specific).
bool IsPCRel;
@@ -97,26 +102,39 @@ public:
RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend)
: SectionID(id), Offset(offset), RelType(type), Addend(addend),
- IsPCRel(false), Size(0) {}
+ SymOffset(0), IsPCRel(false), Size(0) {}
+
+ RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
+ uint64_t symoffset)
+ : SectionID(id), Offset(offset), RelType(type), Addend(addend),
+ SymOffset(symoffset), IsPCRel(false), Size(0) {}
RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
bool IsPCRel, unsigned Size)
: SectionID(id), Offset(offset), RelType(type), Addend(addend),
- IsPCRel(IsPCRel), Size(Size) {}
+ SymOffset(0), IsPCRel(IsPCRel), Size(Size) {}
};
class RelocationValueRef {
public:
unsigned SectionID;
- intptr_t Addend;
+ uint64_t Offset;
+ int64_t Addend;
const char *SymbolName;
- RelocationValueRef(): SectionID(0), Addend(0), SymbolName(0) {}
+ RelocationValueRef(): SectionID(0), Offset(0), Addend(0), SymbolName(0) {}
inline bool operator==(const RelocationValueRef &Other) const {
- return std::memcmp(this, &Other, sizeof(RelocationValueRef)) == 0;
+ return SectionID == Other.SectionID && Offset == Other.Offset &&
+ Addend == Other.Addend && SymbolName == Other.SymbolName;
}
inline bool operator <(const RelocationValueRef &Other) const {
- return std::memcmp(this, &Other, sizeof(RelocationValueRef)) < 0;
+ if (SectionID != Other.SectionID)
+ return SectionID < Other.SectionID;
+ if (Offset != Other.Offset)
+ return Offset < Other.Offset;
+ if (Addend != Other.Addend)
+ return Addend < Other.Addend;
+ return SymbolName < Other.SymbolName;
}
};
@@ -130,6 +148,9 @@ protected:
typedef SmallVector<SectionEntry, 64> SectionList;
SectionList Sections;
+ typedef unsigned SID; // Type for SectionIDs
+ #define RTDYLD_INVALID_SECTION_ID ((SID)(-1))
+
// Keep a map of sections from object file to the SectionID which
// references it.
typedef std::map<SectionRef, unsigned> ObjSectionToIDMap;
@@ -164,30 +185,22 @@ protected:
typedef std::map<RelocationValueRef, uintptr_t> StubMap;
Triple::ArchType Arch;
-
- inline unsigned getMaxStubSize() {
- if (Arch == Triple::aarch64)
- return 20; // movz; movk; movk; movk; br
- if (Arch == Triple::arm || Arch == Triple::thumb)
- return 8; // 32-bit instruction and 32-bit address
- else if (Arch == Triple::mipsel || Arch == Triple::mips)
- return 16;
- else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le)
- return 44;
- else if (Arch == Triple::x86_64)
- return 8; // GOT
- else if (Arch == Triple::systemz)
- return 16;
- else
- return 0;
- }
-
- inline unsigned getStubAlignment() {
- if (Arch == Triple::systemz)
- return 8;
- else
- return 1;
- }
+ bool IsTargetLittleEndian;
+
+ // This mutex prevents simultaneously loading objects from two different
+ // threads. This keeps us from having to protect individual data structures
+ // and guarantees that section allocation requests to the memory manager
+ // won't be interleaved between modules. It is also used in mapSectionAddress
+ // and resolveRelocations to protect write access to internal data structures.
+ //
+  // loadObject may be called on the same thread during the handling of
+ // processRelocations, and that's OK. The handling of the relocation lists
+ // is written in such a way as to work correctly if new elements are added to
+ // the end of the list while the list is being processed.
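+  //
+  // A rough usage sketch (illustrative only, not part of this header): each
+  // entry point that mutates loader state would begin with
+  //   MutexGuard locked(lock);   // llvm/Support/MutexGuard.h
+  // so that section allocation and the relocation lists are only touched
+  // while the lock is held.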
+ sys::Mutex lock;
+
+ virtual unsigned getMaxStubSize() = 0;
+ virtual unsigned getStubAlignment() = 0;
bool HasError;
std::string ErrorStr;
@@ -208,14 +221,14 @@ protected:
}
void writeInt16BE(uint8_t *Addr, uint16_t Value) {
- if (sys::IsLittleEndianHost)
+ if (IsTargetLittleEndian)
Value = sys::SwapByteOrder(Value);
*Addr = (Value >> 8) & 0xFF;
*(Addr+1) = Value & 0xFF;
}
void writeInt32BE(uint8_t *Addr, uint32_t Value) {
- if (sys::IsLittleEndianHost)
+ if (IsTargetLittleEndian)
Value = sys::SwapByteOrder(Value);
*Addr = (Value >> 24) & 0xFF;
*(Addr+1) = (Value >> 16) & 0xFF;
@@ -224,7 +237,7 @@ protected:
}
void writeInt64BE(uint8_t *Addr, uint64_t Value) {
- if (sys::IsLittleEndianHost)
+ if (IsTargetLittleEndian)
Value = sys::SwapByteOrder(Value);
*Addr = (Value >> 56) & 0xFF;
*(Addr+1) = (Value >> 48) & 0xFF;
@@ -292,6 +305,11 @@ protected:
/// \brief Resolve relocations to external symbols.
void resolveExternalSymbols();
+
+ /// \brief Update GOT entries for external symbols.
+ // The base class does nothing. ELF overrides this.
+ virtual void updateGOTEntries(StringRef Name, uint64_t Addr) {}
+
virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
public:
RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {}
@@ -303,18 +321,20 @@ public:
void *getSymbolAddress(StringRef Name) {
// FIXME: Just look up as a function for now. Overly simple of course.
// Work in progress.
- if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+ SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+ if (pos == GlobalSymbolTable.end())
return 0;
- SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+ SymbolLoc Loc = pos->second;
return getSectionAddress(Loc.first) + Loc.second;
}
uint64_t getSymbolLoadAddress(StringRef Name) {
// FIXME: Just look up as a function for now. Overly simple of course.
// Work in progress.
- if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+ SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+ if (pos == GlobalSymbolTable.end())
return 0;
- SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+ SymbolLoc Loc = pos->second;
return getSectionLoadAddress(Loc.first) + Loc.second;
}
@@ -335,7 +355,11 @@ public:
virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const = 0;
- virtual StringRef getEHFrameSection();
+ virtual void registerEHFrames();
+
+ virtual void deregisterEHFrames();
+
+ virtual void finalizeLoad(ObjSectionToIDMap &SectionMap) {}
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 0384b32..5b92867 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -55,35 +55,80 @@ static intptr_t computeDelta(SectionEntry *A, SectionEntry *B) {
return ObjDistance - MemDistance;
}
-StringRef RuntimeDyldMachO::getEHFrameSection() {
- SectionEntry *Text = NULL;
- SectionEntry *EHFrame = NULL;
- SectionEntry *ExceptTab = NULL;
- for (int i = 0, e = Sections.size(); i != e; ++i) {
- if (Sections[i].Name == "__eh_frame")
- EHFrame = &Sections[i];
- else if (Sections[i].Name == "__text")
- Text = &Sections[i];
- else if (Sections[i].Name == "__gcc_except_tab")
- ExceptTab = &Sections[i];
+void RuntimeDyldMachO::registerEHFrames() {
+
+ if (!MemMgr)
+ return;
+ for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+ EHFrameRelatedSections &SectionInfo = UnregisteredEHFrameSections[i];
+ if (SectionInfo.EHFrameSID == RTDYLD_INVALID_SECTION_ID ||
+ SectionInfo.TextSID == RTDYLD_INVALID_SECTION_ID)
+ continue;
+ SectionEntry *Text = &Sections[SectionInfo.TextSID];
+ SectionEntry *EHFrame = &Sections[SectionInfo.EHFrameSID];
+ SectionEntry *ExceptTab = NULL;
+ if (SectionInfo.ExceptTabSID != RTDYLD_INVALID_SECTION_ID)
+ ExceptTab = &Sections[SectionInfo.ExceptTabSID];
+
+ intptr_t DeltaForText = computeDelta(Text, EHFrame);
+ intptr_t DeltaForEH = 0;
+ if (ExceptTab)
+ DeltaForEH = computeDelta(ExceptTab, EHFrame);
+
+ unsigned char *P = EHFrame->Address;
+ unsigned char *End = P + EHFrame->Size;
+ do {
+ P = processFDE(P, DeltaForText, DeltaForEH);
+ } while(P != End);
+
+ MemMgr->registerEHFrames(EHFrame->Address,
+ EHFrame->LoadAddress,
+ EHFrame->Size);
}
- if (Text == NULL || EHFrame == NULL)
- return StringRef();
-
- intptr_t DeltaForText = computeDelta(Text, EHFrame);
- intptr_t DeltaForEH = 0;
- if (ExceptTab)
- DeltaForEH = computeDelta(ExceptTab, EHFrame);
-
- unsigned char *P = EHFrame->Address;
- unsigned char *End = P + EHFrame->Size;
- do {
- P = processFDE(P, DeltaForText, DeltaForEH);
- } while(P != End);
+ UnregisteredEHFrameSections.clear();
+}
- return StringRef((char*)EHFrame->Address, EHFrame->Size);
+void RuntimeDyldMachO::finalizeLoad(ObjSectionToIDMap &SectionMap) {
+ unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
+ unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
+ unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
+ ObjSectionToIDMap::iterator i, e;
+ for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+ const SectionRef &Section = i->first;
+ StringRef Name;
+ Section.getName(Name);
+ if (Name == "__eh_frame")
+ EHFrameSID = i->second;
+ else if (Name == "__text")
+ TextSID = i->second;
+ else if (Name == "__gcc_except_tab")
+ ExceptTabSID = i->second;
+ }
+ UnregisteredEHFrameSections.push_back(EHFrameRelatedSections(EHFrameSID,
+ TextSID,
+ ExceptTabSID));
}
+// The target location for the relocation is described by RE.SectionID and
+// RE.Offset. RE.SectionID can be used to find the SectionEntry. Each
+// SectionEntry has three members describing its location.
+// SectionEntry::Address is the address at which the section has been loaded
+// into memory in the current (host) process. SectionEntry::LoadAddress is the
+// address that the section will have in the target process.
+// SectionEntry::ObjAddress is the address of the bits for this section in the
+// original emitted object image (also in the current address space).
+//
+// Relocations will be applied as if the section were loaded at
+// SectionEntry::LoadAddress, but they will be applied at an address based
+// on SectionEntry::Address. SectionEntry::ObjAddress will be used to refer to
+// Target memory contents if they are required for value calculations.
+//
+// The Value parameter here is the load address of the symbol for the
+// relocation to be applied. For relocations which refer to symbols in the
+// current object Value will be the LoadAddress of the section in which
+// the symbol resides (RE.Addend provides additional information about the
+// symbol location). For external symbols, Value will be the address of the
+// symbol in the target address space.
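+//
+// As a purely illustrative example (hypothetical numbers): suppose a section
+// was copied into the host-side JIT buffer at Address 0x200000 and will run
+// in the target at LoadAddress 0x400000. A relocation with RE.Offset 0x10 is
+// then written into host memory at 0x200010, but the value stored there is
+// computed as if the fixed-up instruction lived at 0x400010, using Value (and
+// RE.Addend) for the symbol it refers to.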
void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE,
uint64_t Value) {
const SectionEntry &Section = Sections[RE.SectionID];
@@ -160,7 +205,7 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
switch (Type) {
default:
llvm_unreachable("Invalid relocation type!");
- case macho::RIT_Vanilla: {
+ case MachO::GENERIC_RELOC_VANILLA: {
uint8_t *p = LocalAddress;
uint64_t ValueToWrite = Value + Addend;
for (unsigned i = 0; i < Size; ++i) {
@@ -169,9 +214,9 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
}
return false;
}
- case macho::RIT_Difference:
- case macho::RIT_Generic_LocalDifference:
- case macho::RIT_Generic_PreboundLazyPointer:
+ case MachO::GENERIC_RELOC_SECTDIFF:
+ case MachO::GENERIC_RELOC_LOCAL_SECTDIFF:
+ case MachO::GENERIC_RELOC_PB_LA_PTR:
return Error("Relocation type not implemented yet!");
}
}
@@ -193,12 +238,12 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
switch(Type) {
default:
llvm_unreachable("Invalid relocation type!");
- case macho::RIT_X86_64_Signed1:
- case macho::RIT_X86_64_Signed2:
- case macho::RIT_X86_64_Signed4:
- case macho::RIT_X86_64_Signed:
- case macho::RIT_X86_64_Unsigned:
- case macho::RIT_X86_64_Branch: {
+ case MachO::X86_64_RELOC_SIGNED_1:
+ case MachO::X86_64_RELOC_SIGNED_2:
+ case MachO::X86_64_RELOC_SIGNED_4:
+ case MachO::X86_64_RELOC_SIGNED:
+ case MachO::X86_64_RELOC_UNSIGNED:
+ case MachO::X86_64_RELOC_BRANCH: {
Value += Addend;
// Mask in the target value a byte at a time (we don't have an alignment
// guarantee for the target address, so this is safest).
@@ -209,10 +254,10 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
}
return false;
}
- case macho::RIT_X86_64_GOTLoad:
- case macho::RIT_X86_64_GOT:
- case macho::RIT_X86_64_Subtractor:
- case macho::RIT_X86_64_TLV:
+ case MachO::X86_64_RELOC_GOT_LOAD:
+ case MachO::X86_64_RELOC_GOT:
+ case MachO::X86_64_RELOC_SUBTRACTOR:
+ case MachO::X86_64_RELOC_TLV:
return Error("Relocation type not implemented yet!");
}
}
@@ -237,7 +282,7 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
switch(Type) {
default:
llvm_unreachable("Invalid relocation type!");
- case macho::RIT_Vanilla: {
+ case MachO::ARM_RELOC_VANILLA: {
// Mask in the target value a byte at a time (we don't have an alignment
// guarantee for the target address, so this is safest).
uint8_t *p = (uint8_t*)LocalAddress;
@@ -247,7 +292,7 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
}
break;
}
- case macho::RIT_ARM_Branch24Bit: {
+ case MachO::ARM_RELOC_BR24: {
// Mask the value into the target address. We know instructions are
// 32-bit aligned, so we can do it all at once.
uint32_t *p = (uint32_t*)LocalAddress;
@@ -263,14 +308,14 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
*p = (*p & ~0xffffff) | Value;
break;
}
- case macho::RIT_ARM_ThumbBranch22Bit:
- case macho::RIT_ARM_ThumbBranch32Bit:
- case macho::RIT_ARM_Half:
- case macho::RIT_ARM_HalfDifference:
- case macho::RIT_Pair:
- case macho::RIT_Difference:
- case macho::RIT_ARM_LocalDifference:
- case macho::RIT_ARM_PreboundLazyPointer:
+ case MachO::ARM_THUMB_RELOC_BR22:
+ case MachO::ARM_THUMB_32BIT_BRANCH:
+ case MachO::ARM_RELOC_HALF:
+ case MachO::ARM_RELOC_HALF_SECTDIFF:
+ case MachO::ARM_RELOC_PAIR:
+ case MachO::ARM_RELOC_SECTDIFF:
+ case MachO::ARM_RELOC_LOCAL_SECTDIFF:
+ case MachO::ARM_RELOC_PB_LA_PTR:
return Error("Relocation type not implemented yet!");
}
return false;
@@ -284,9 +329,19 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
StubMap &Stubs) {
const ObjectFile *OF = Obj.getObjectFile();
const MachOObjectFile *MachO = static_cast<const MachOObjectFile*>(OF);
- macho::RelocationEntry RE = MachO->getRelocation(RelI.getRawDataRefImpl());
+ MachO::any_relocation_info RE= MachO->getRelocation(RelI.getRawDataRefImpl());
uint32_t RelType = MachO->getAnyRelocationType(RE);
+
+ // FIXME: Properly handle scattered relocations.
+ // For now, optimistically skip these: they can often be ignored, as
+ // the static linker will already have applied the relocation, and it
+ // only needs to be reapplied if symbols move relative to one another.
+ // Note: This will fail horribly where the relocations *do* need to be
+ // applied, but that was already the case.
+ if (MachO->isRelocationScattered(RE))
+ return;
+
RelocationValueRef Value;
SectionEntry &Section = Sections[SectionID];
@@ -329,7 +384,8 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
Value.Addend = Addend - Addr;
}
- if (Arch == Triple::x86_64 && RelType == macho::RIT_X86_64_GOT) {
+ if (Arch == Triple::x86_64 && (RelType == MachO::X86_64_RELOC_GOT ||
+ RelType == MachO::X86_64_RELOC_GOT_LOAD)) {
assert(IsPCRel);
assert(Size == 2);
StubMap::const_iterator i = Stubs.find(Value);
@@ -340,8 +396,7 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
Stubs[Value] = Section.StubOffset;
uint8_t *GOTEntry = Section.Address + Section.StubOffset;
RelocationEntry RE(SectionID, Section.StubOffset,
- macho::RIT_X86_64_Unsigned, Value.Addend - 4, false,
- 3);
+ MachO::X86_64_RELOC_UNSIGNED, 0, false, 3);
if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
else
@@ -350,9 +405,9 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
Addr = GOTEntry;
}
resolveRelocation(Section, Offset, (uint64_t)Addr,
- macho::RIT_X86_64_Unsigned, 4, true, 2);
+ MachO::X86_64_RELOC_UNSIGNED, Value.Addend, true, 2);
} else if (Arch == Triple::arm &&
- (RelType & 0xf) == macho::RIT_ARM_Branch24Bit) {
+ (RelType & 0xf) == MachO::ARM_RELOC_BR24) {
// This is an ARM branch relocation, need to use a stub function.
// Look up for existing stub.
@@ -367,7 +422,7 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
uint8_t *StubTargetAddr = createStubFunction(Section.Address +
Section.StubOffset);
RelocationEntry RE(SectionID, StubTargetAddr - Section.Address,
- macho::RIT_Vanilla, Value.Addend);
+ MachO::GENERIC_RELOC_VANILLA, Value.Addend);
if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
else
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index df8d3bb..bbf6aa9 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -54,6 +54,35 @@ class RuntimeDyldMachO : public RuntimeDyldImpl {
int64_t Addend,
bool isPCRel,
unsigned Size);
+
+ unsigned getMaxStubSize() {
+ if (Arch == Triple::arm || Arch == Triple::thumb)
+ return 8; // 32-bit instruction and 32-bit address
+ else if (Arch == Triple::x86_64)
+ return 8; // GOT entry
+ else
+ return 0;
+ }
+
+ unsigned getStubAlignment() {
+ return 1;
+ }
+
+ struct EHFrameRelatedSections {
+ EHFrameRelatedSections() : EHFrameSID(RTDYLD_INVALID_SECTION_ID),
+ TextSID(RTDYLD_INVALID_SECTION_ID),
+ ExceptTabSID(RTDYLD_INVALID_SECTION_ID) {}
+ EHFrameRelatedSections(SID EH, SID T, SID Ex)
+ : EHFrameSID(EH), TextSID(T), ExceptTabSID(Ex) {}
+ SID EHFrameSID;
+ SID TextSID;
+ SID ExceptTabSID;
+ };
+
+ // When a module is loaded we save the SectionID of the EH frame section
+ // in a table until we receive a request to register all unregistered
+ // EH frame sections with the memory manager.
+ SmallVector<EHFrameRelatedSections, 2> UnregisteredEHFrameSections;
public:
RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
@@ -65,7 +94,8 @@ public:
const SymbolTableMap &Symbols,
StubMap &Stubs);
virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
- virtual StringRef getEHFrameSection();
+ virtual void registerEHFrames();
+ virtual void finalizeLoad(ObjSectionToIDMap &SectionMap);
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 558d8b3..9b7d348 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -91,7 +91,7 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple,
// FIXME: non-iOS ARM FastISel is broken with MCJIT.
if (UseMCJIT &&
TheTriple.getArch() == Triple::arm &&
- TheTriple.getOS() != Triple::IOS &&
+ !TheTriple.isiOS() &&
OptLevel == CodeGenOpt::None) {
OptLevel = CodeGenOpt::Less;
}
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index f275305..7decffd 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -74,6 +74,8 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
default: Out << "cc" << cc; break;
case CallingConv::Fast: Out << "fastcc"; break;
case CallingConv::Cold: Out << "coldcc"; break;
+ case CallingConv::WebKit_JS: Out << "webkit_jscc"; break;
+ case CallingConv::AnyReg: Out << "anyregcc"; break;
case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
@@ -1394,9 +1396,6 @@ static void PrintLinkage(GlobalValue::LinkageTypes LT,
case GlobalValue::InternalLinkage: Out << "internal "; break;
case GlobalValue::LinkOnceAnyLinkage: Out << "linkonce "; break;
case GlobalValue::LinkOnceODRLinkage: Out << "linkonce_odr "; break;
- case GlobalValue::LinkOnceODRAutoHideLinkage:
- Out << "linkonce_odr_auto_hide ";
- break;
case GlobalValue::WeakAnyLinkage: Out << "weak "; break;
case GlobalValue::WeakODRLinkage: Out << "weak_odr "; break;
case GlobalValue::CommonLinkage: Out << "common "; break;
@@ -1647,6 +1646,10 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << " align " << F->getAlignment();
if (F->hasGC())
Out << " gc \"" << F->getGC() << '"';
+ if (F->hasPrefixData()) {
+ Out << " prefix ";
+ writeOperand(F->getPrefixData(), true);
+ }
if (F->isDeclaration()) {
Out << '\n';
} else {
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 9da3f96..ea954ac 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -94,6 +94,7 @@ public:
/// attribute entries, which are for target-dependent attributes.
class EnumAttributeImpl : public AttributeImpl {
+ virtual void anchor();
Attribute::AttrKind Kind;
protected:
@@ -108,6 +109,7 @@ public:
};
class AlignAttributeImpl : public EnumAttributeImpl {
+ virtual void anchor();
unsigned Align;
public:
@@ -122,6 +124,7 @@ public:
};
class StringAttributeImpl : public AttributeImpl {
+ virtual void anchor();
std::string Kind;
std::string Val;
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index f466d16..0f2b7a0 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -196,6 +196,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "noreturn";
if (hasAttribute(Attribute::NoUnwind))
return "nounwind";
+ if (hasAttribute(Attribute::OptimizeNone))
+ return "optnone";
if (hasAttribute(Attribute::OptimizeForSize))
return "optsize";
if (hasAttribute(Attribute::ReadNone))
@@ -284,7 +286,11 @@ bool Attribute::operator<(Attribute A) const {
// AttributeImpl Definition
//===----------------------------------------------------------------------===//
+// Pin the vtables to this file.
AttributeImpl::~AttributeImpl() {}
+void EnumAttributeImpl::anchor() {}
+void AlignAttributeImpl::anchor() {}
+void StringAttributeImpl::anchor() {}
bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
if (isStringAttribute()) return false;
@@ -381,6 +387,7 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
case Attribute::Returned: return 1ULL << 39;
case Attribute::Cold: return 1ULL << 40;
case Attribute::Builtin: return 1ULL << 41;
+ case Attribute::OptimizeNone: return 1ULL << 42;
}
llvm_unreachable("Unsupported attribute type");
}
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index a4f5289..d12bf7b 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/AutoUpgrade.h"
+#include "llvm/DebugInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
@@ -88,6 +89,20 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
}
+ case 'o':
+ // We only need to change the name to match the mangling including the
+ // address space.
+ if (F->arg_size() == 2 && Name.startswith("objectsize.")) {
+ Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
+ if (F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
+ F->setName(Name + ".old");
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::objectsize, Tys);
+ return true;
+ }
+ }
+ break;
+
case 'x': {
if (Name.startswith("x86.sse2.pcmpeq.") ||
Name.startswith("x86.sse2.pcmpgt.") ||
@@ -97,6 +112,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "x86.avx.movnt.dq.256" ||
Name == "x86.avx.movnt.pd.256" ||
Name == "x86.avx.movnt.ps.256" ||
+ Name == "x86.sse42.crc32.64.8" ||
(Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
NewFn = 0;
return true;
@@ -257,6 +273,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID);
Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0),
CI->getArgOperand(1), Builder.getInt8(Imm));
+ } else if (Name == "llvm.x86.sse42.crc32.64.8") {
+ Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_sse42_crc32_32_8);
+ Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
+ Rep = Builder.CreateCall2(CRC32, Trunc0, CI->getArgOperand(1));
+ Rep = Builder.CreateZExt(Rep, CI->getType(), "");
} else {
bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
if (Name == "llvm.x86.avx.vpermil.pd.256")
@@ -317,6 +339,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->eraseFromParent();
return;
+ case Intrinsic::objectsize:
+ CI->replaceAllUsesWith(Builder.CreateCall2(NewFn,
+ CI->getArgOperand(0),
+ CI->getArgOperand(1),
+ Name));
+ CI->eraseFromParent();
+ return;
+
case Intrinsic::arm_neon_vclz: {
// Change name from llvm.arm.neon.vclz.* to llvm.ctlz.*
CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
@@ -391,3 +421,81 @@ void llvm::UpgradeCallsToIntrinsic(Function* F) {
}
}
+void llvm::UpgradeInstWithTBAATag(Instruction *I) {
+ MDNode *MD = I->getMetadata(LLVMContext::MD_tbaa);
+ assert(MD && "UpgradeInstWithTBAATag should have a TBAA tag");
+ // Check if the tag uses struct-path aware TBAA format.
+ if (isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3)
+ return;
+
+ if (MD->getNumOperands() == 3) {
+ Value *Elts[] = {
+ MD->getOperand(0),
+ MD->getOperand(1)
+ };
+ MDNode *ScalarType = MDNode::get(I->getContext(), Elts);
+ // Create a MDNode <ScalarType, ScalarType, offset 0, const>
+ Value *Elts2[] = {
+ ScalarType, ScalarType,
+ Constant::getNullValue(Type::getInt64Ty(I->getContext())),
+ MD->getOperand(2)
+ };
+ I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts2));
+ } else {
+ // Create a MDNode <MD, MD, offset 0>
+ Value *Elts[] = {MD, MD,
+ Constant::getNullValue(Type::getInt64Ty(I->getContext()))};
+ I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts));
+ }
+}
+
+Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy,
+ Instruction *&Temp) {
+ if (Opc != Instruction::BitCast)
+ return 0;
+
+ Temp = 0;
+ Type *SrcTy = V->getType();
+ if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
+ SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
+ LLVMContext &Context = V->getContext();
+
+ // We have no information about target data layout, so we assume that
+    // the maximum pointer size is 64 bits.
+ Type *MidTy = Type::getInt64Ty(Context);
+ Temp = CastInst::Create(Instruction::PtrToInt, V, MidTy);
+
+ return CastInst::Create(Instruction::IntToPtr, Temp, DestTy);
+ }
+
+ return 0;
+}
+
+Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) {
+ if (Opc != Instruction::BitCast)
+ return 0;
+
+ Type *SrcTy = C->getType();
+ if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
+ SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
+ LLVMContext &Context = C->getContext();
+
+ // We have no information about target data layout, so we assume that
+    // the maximum pointer size is 64 bits.
+ Type *MidTy = Type::getInt64Ty(Context);
+
+ return ConstantExpr::getIntToPtr(ConstantExpr::getPtrToInt(C, MidTy),
+ DestTy);
+ }
+
+ return 0;
+}
+
+/// Check the debug info version number; if it is outdated, drop the debug
+/// info. Return true if the module was modified.
+bool llvm::UpgradeDebugInfo(Module &M) {
+ if (getDebugMetadataVersionFromModule(M) == DEBUG_METADATA_VERSION)
+ return false;
+
+ return StripDebugInfo(M);
+}
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index c2a4ee3..581946c 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -6,10 +6,10 @@ add_llvm_library(LLVMCore
ConstantFold.cpp
Constants.cpp
Core.cpp
+ DIBuilder.cpp
DataLayout.cpp
DebugInfo.cpp
DebugLoc.cpp
- DIBuilder.cpp
Dominators.cpp
Function.cpp
GCOV.cpp
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCore
LLVMContext.cpp
LLVMContextImpl.cpp
LeakDetector.cpp
+ LegacyPassManager.cpp
Metadata.cpp
Module.cpp
Pass.cpp
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 8c5a983..f5e225c 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -689,6 +689,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
}
case Instruction::BitCast:
return FoldBitCast(V, DestTy);
+ case Instruction::AddrSpaceCast:
+ return 0;
}
}
@@ -1897,6 +1899,37 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
return true;
}
+/// \brief Test whether a given ConstantInt is in-range for a SequentialType.
+static bool isIndexInRangeOfSequentialType(const SequentialType *STy,
+ const ConstantInt *CI) {
+ if (const PointerType *PTy = dyn_cast<PointerType>(STy))
+ // Only handle pointers to sized types, not pointers to functions.
+ return PTy->getElementType()->isSized();
+
+ uint64_t NumElements = 0;
+ // Determine the number of elements in our sequential type.
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+ NumElements = ATy->getNumElements();
+ else if (const VectorType *VTy = dyn_cast<VectorType>(STy))
+ NumElements = VTy->getNumElements();
+
+ assert((isa<ArrayType>(STy) || NumElements > 0) &&
+ "didn't expect non-array type to have zero elements!");
+
+ // We cannot bounds check the index if it doesn't fit in an int64_t.
+ if (CI->getValue().getActiveBits() > 64)
+ return false;
+
+ // A negative index or an index past the end of our sequential type is
+ // considered out-of-range.
+ int64_t IndexVal = CI->getSExtValue();
+ if (IndexVal < 0 || (NumElements > 0 && (uint64_t)IndexVal >= NumElements))
+ return false;
+
+ // Otherwise, it is in-range.
+ return true;
+}
+
template<typename IndexTy>
static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
bool inBounds,
@@ -1940,7 +1973,32 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
I != E; ++I)
LastTy = *I;
- if ((LastTy && isa<SequentialType>(LastTy)) || Idx0->isNullValue()) {
+ // We cannot combine indices if doing so would take us outside of an
+ // array or vector. Doing otherwise could trick us if we evaluated such a
+ // GEP as part of a load.
+ //
+ // e.g. Consider if the original GEP was:
+ // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
+ // i32 0, i32 0, i64 0)
+ //
+ // If we then tried to offset it by '8' to get to the third element,
+ // an i8, we should *not* get:
+ // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
+ // i32 0, i32 0, i64 8)
+ //
+  // This GEP tries to index array element '8', which runs out of bounds.
+ // Subsequent evaluation would get confused and produce erroneous results.
+ //
+ // The following prohibits such a GEP from being formed by checking to see
+ // if the index is in-range with respect to an array or vector.
+ bool PerformFold = false;
+ if (Idx0->isNullValue())
+ PerformFold = true;
+ else if (SequentialType *STy = dyn_cast_or_null<SequentialType>(LastTy))
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx0))
+ PerformFold = isIndexInRangeOfSequentialType(STy, CI);
+
+ if (PerformFold) {
SmallVector<Value*, 16> NewIndices;
NewIndices.reserve(Idxs.size() + CE->getNumOperands());
for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
@@ -2000,8 +2058,8 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
}
// Check to see if any array indices are not within the corresponding
- // notional array bounds. If so, try to determine if they can be factored
- // out into preceding dimensions.
+ // notional array or vector bounds. If so, try to determine if they can be
+ // factored out into preceding dimensions.
bool Unknown = false;
SmallVector<Constant *, 8> NewIdxs;
Type *Ty = C->getType();
@@ -2009,16 +2067,20 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
for (unsigned i = 0, e = Idxs.size(); i != e;
Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
- if (ATy->getNumElements() <= INT64_MAX &&
- ATy->getNumElements() != 0 &&
- CI->getSExtValue() >= (int64_t)ATy->getNumElements()) {
+ if (isa<ArrayType>(Ty) || isa<VectorType>(Ty))
+ if (CI->getSExtValue() > 0 &&
+ !isIndexInRangeOfSequentialType(cast<SequentialType>(Ty), CI)) {
if (isa<SequentialType>(Prev)) {
// It's out of range, but we can factor it into the prior
// dimension.
NewIdxs.resize(Idxs.size());
- ConstantInt *Factor = ConstantInt::get(CI->getType(),
- ATy->getNumElements());
+ uint64_t NumElements = 0;
+ if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+ NumElements = ATy->getNumElements();
+ else
+ NumElements = cast<VectorType>(Ty)->getNumElements();
+
+ ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
Constant *PrevIdx = cast<Constant>(Idxs[i-1]);
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 9067b34..690ac59 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -1126,6 +1126,7 @@ getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
return ConstantExpr::getCast(getOpcode(), Ops[0], Ty);
case Instruction::Select:
return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
@@ -1461,6 +1462,7 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
case Instruction::PtrToInt: return getPtrToInt(C, Ty);
case Instruction::IntToPtr: return getIntToPtr(C, Ty);
case Instruction::BitCast: return getBitCast(C, Ty);
+ case Instruction::AddrSpaceCast: return getAddrSpaceCast(C, Ty);
}
}
@@ -1489,10 +1491,26 @@ Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) {
if (Ty->isIntOrIntVectorTy())
return getPtrToInt(S, Ty);
+
+ unsigned SrcAS = S->getType()->getPointerAddressSpace();
+ if (Ty->isPtrOrPtrVectorTy() && SrcAS != Ty->getPointerAddressSpace())
+ return getAddrSpaceCast(S, Ty);
+
return getBitCast(S, Ty);
}
-Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty,
+Constant *ConstantExpr::getPointerBitCastOrAddrSpaceCast(Constant *S,
+ Type *Ty) {
+ assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+ assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast");
+
+ if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+ return getAddrSpaceCast(S, Ty);
+
+ return getBitCast(S, Ty);
+}
+
+Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty,
bool isSigned) {
assert(C->getType()->isIntOrIntVectorTy() &&
Ty->isIntOrIntVectorTy() && "Invalid cast");
@@ -1662,6 +1680,13 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
return getFoldedCast(Instruction::BitCast, C, DstTy);
}
+Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy) {
+ assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
+ "Invalid constantexpr addrspacecast!");
+
+ return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy);
+}
+
Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags) {
// Check the operands for consistency first.
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 66610bd..c70f459 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -97,7 +97,7 @@ LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
return wrap(new Module(ModuleID, getGlobalContext()));
}
-LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
+LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
LLVMContextRef C) {
return wrap(new Module(ModuleID, *unwrap(C)));
}
@@ -147,6 +147,16 @@ LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
return false;
}
+char *LLVMPrintModuleToString(LLVMModuleRef M) {
+ std::string buf;
+ raw_string_ostream os(buf);
+
+ unwrap(M)->print(os, NULL);
+ os.flush();
+
+ return strdup(buf.c_str());
+}
+
/*--.. Operations on inline assembler ......................................--*/
void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
unwrap(M)->setModuleInlineAsm(StringRef(Asm));
@@ -210,6 +220,20 @@ LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
return wrap(&unwrap(Ty)->getContext());
}
+void LLVMDumpType(LLVMTypeRef Ty) {
+ return unwrap(Ty)->dump();
+}
+
+char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
+ std::string buf;
+ raw_string_ostream os(buf);
+
+ unwrap(Ty)->print(os);
+ os.flush();
+
+ return strdup(buf.c_str());
+}
+
/*--.. Operations on integer types .........................................--*/
LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C) {
@@ -450,6 +474,16 @@ void LLVMDumpValue(LLVMValueRef Val) {
unwrap(Val)->dump();
}
+char* LLVMPrintValueToString(LLVMValueRef Val) {
+ std::string buf;
+ raw_string_ostream os(buf);
+
+ unwrap(Val)->print(os);
+ os.flush();
+
+ return strdup(buf.c_str());
+}
+
void LLVMReplaceAllUsesWith(LLVMValueRef OldVal, LLVMValueRef NewVal) {
unwrap(OldVal)->replaceAllUsesWith(unwrap(NewVal));
}
@@ -681,7 +715,7 @@ LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length),
DontNullTerminate == 0));
}
-LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
+LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
LLVMValueRef *ConstantVals,
unsigned Count, LLVMBool Packed) {
Constant **Elements = unwrap<Constant>(ConstantVals, Count);
@@ -999,6 +1033,12 @@ LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
unwrap(ToType)));
}
+LLVMValueRef LLVMConstAddrSpaceCast(LLVMValueRef ConstantVal,
+ LLVMTypeRef ToType) {
+ return wrap(ConstantExpr::getAddrSpaceCast(unwrap<Constant>(ConstantVal),
+ unwrap(ToType)));
+}
+
LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal,
LLVMTypeRef ToType) {
return wrap(ConstantExpr::getZExtOrBitCast(unwrap<Constant>(ConstantVal),
@@ -1110,8 +1150,6 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
return LLVMLinkOnceAnyLinkage;
case GlobalValue::LinkOnceODRLinkage:
return LLVMLinkOnceODRLinkage;
- case GlobalValue::LinkOnceODRAutoHideLinkage:
- return LLVMLinkOnceODRAutoHideLinkage;
case GlobalValue::WeakAnyLinkage:
return LLVMWeakAnyLinkage;
case GlobalValue::WeakODRLinkage:
@@ -1156,7 +1194,8 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
break;
case LLVMLinkOnceODRAutoHideLinkage:
- GV->setLinkage(GlobalValue::LinkOnceODRAutoHideLinkage);
+ DEBUG(errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no "
+ "longer supported.");
break;
case LLVMWeakAnyLinkage:
GV->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -1216,12 +1255,30 @@ void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) {
->setVisibility(static_cast<GlobalValue::VisibilityTypes>(Viz));
}
-unsigned LLVMGetAlignment(LLVMValueRef Global) {
- return unwrap<GlobalValue>(Global)->getAlignment();
+/*--.. Operations on global variables, load and store instructions .........--*/
+
+unsigned LLVMGetAlignment(LLVMValueRef V) {
+ Value *P = unwrap<Value>(V);
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+ return GV->getAlignment();
+ if (LoadInst *LI = dyn_cast<LoadInst>(P))
+ return LI->getAlignment();
+ if (StoreInst *SI = dyn_cast<StoreInst>(P))
+ return SI->getAlignment();
+
+ llvm_unreachable("only GlobalValue, LoadInst and StoreInst have alignment");
}
-void LLVMSetAlignment(LLVMValueRef Global, unsigned Bytes) {
- unwrap<GlobalValue>(Global)->setAlignment(Bytes);
+void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
+ Value *P = unwrap<Value>(V);
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+ GV->setAlignment(Bytes);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(P))
+ LI->setAlignment(Bytes);
+ else if (StoreInst *SI = dyn_cast<StoreInst>(P))
+ SI->setAlignment(Bytes);
+ else
+ llvm_unreachable("only GlobalValue, LoadInst and StoreInst have alignment");
}
/*--.. Operations on global variables ......................................--*/
@@ -1553,7 +1610,7 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
return (LLVMAttribute)A->getParent()->getAttributes().
Raw(A->getArgNo()+1);
}
-
+
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
Argument *A = unwrap<Argument>(Arg);
@@ -1745,7 +1802,7 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
llvm_unreachable("LLVMSetInstructionCallConv applies only to call and invoke!");
}
-void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
+void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
LLVMAttribute PA) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
AttrBuilder B(PA);
@@ -1755,7 +1812,7 @@ void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
index, B)));
}
-void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
+void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
LLVMAttribute PA) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
AttrBuilder B(PA);
@@ -1765,7 +1822,7 @@ void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
index, B)));
}
-void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
unsigned align) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
AttrBuilder B;
@@ -2119,8 +2176,8 @@ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
- Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
- ITy, unwrap(Ty), AllocSize,
+ Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
+ ITy, unwrap(Ty), AllocSize,
0, 0, "");
return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
}
@@ -2130,8 +2187,8 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
- Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
- ITy, unwrap(Ty), AllocSize,
+ Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
+ ITy, unwrap(Ty), AllocSize,
unwrap(Val), 0, "");
return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
}
@@ -2157,7 +2214,7 @@ LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name));
}
-LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
+LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
LLVMValueRef PointerVal) {
return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal)));
}
@@ -2267,6 +2324,11 @@ LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val,
return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name));
}
+LLVMValueRef LLVMBuildAddrSpaceCast(LLVMBuilderRef B, LLVMValueRef Val,
+ LLVMTypeRef DestTy, const char *Name) {
+ return wrap(unwrap(B)->CreateAddrSpaceCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
return wrap(unwrap(B)->CreateZExtOrBitCast(unwrap(Val), unwrap(DestTy),
@@ -2396,9 +2458,9 @@ LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS,
return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name));
}
-LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
- LLVMValueRef PTR, LLVMValueRef Val,
- LLVMAtomicOrdering ordering,
+LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
+ LLVMValueRef PTR, LLVMValueRef Val,
+ LLVMAtomicOrdering ordering,
LLVMBool singleThread) {
AtomicRMWInst::BinOp intop;
switch (op) {
@@ -2421,14 +2483,14 @@ LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
case LLVMAtomicOrderingMonotonic: intordering = Monotonic; break;
case LLVMAtomicOrderingAcquire: intordering = Acquire; break;
case LLVMAtomicOrderingRelease: intordering = Release; break;
- case LLVMAtomicOrderingAcquireRelease:
- intordering = AcquireRelease;
+ case LLVMAtomicOrderingAcquireRelease:
+ intordering = AcquireRelease;
break;
- case LLVMAtomicOrderingSequentiallyConsistent:
- intordering = SequentiallyConsistent;
+ case LLVMAtomicOrderingSequentiallyConsistent:
+ intordering = SequentiallyConsistent;
break;
}
- return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val),
+ return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val),
intordering, singleThread ? SingleThread : CrossThread));
}
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 3005f77..c4a9f41 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -30,17 +30,24 @@ static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) {
}
DIBuilder::DIBuilder(Module &m)
- : M(m), VMContext(M.getContext()), TempEnumTypes(0),
- TempRetainTypes(0), TempSubprograms(0), TempGVs(0), DeclareFn(0),
- ValueFn(0)
-{}
+ : M(m), VMContext(M.getContext()), TempEnumTypes(0), TempRetainTypes(0),
+ TempSubprograms(0), TempGVs(0), DeclareFn(0), ValueFn(0) {}
/// finalize - Construct any deferred debug info descriptors.
void DIBuilder::finalize() {
DIArray Enums = getOrCreateArray(AllEnumTypes);
DIType(TempEnumTypes).replaceAllUsesWith(Enums);
- DIArray RetainTypes = getOrCreateArray(AllRetainTypes);
+ SmallVector<Value *, 16> RetainValues;
+ // Declarations and definitions of the same type may be retained. Some
+ // clients RAUW these pairs, leaving duplicates in the retained types
+ // list. Use a set to remove the duplicates while we transform the
+ // TrackingVHs back into Values.
+ SmallPtrSet<Value *, 16> RetainSet;
+ for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
+ if (RetainSet.insert(AllRetainTypes[I]))
+ RetainValues.push_back(AllRetainTypes[I]);
+ DIArray RetainTypes = getOrCreateArray(RetainValues);
DIType(TempRetainTypes).replaceAllUsesWith(RetainTypes);
DIArray SPs = getOrCreateArray(AllSubprograms);
@@ -79,7 +86,7 @@ static MDNode *createFilePathPair(LLVMContext &VMContext, StringRef Filename,
assert(!Filename.empty() && "Unable to create file without name");
Value *Pair[] = {
MDString::get(VMContext, Filename),
- MDString::get(VMContext, Directory),
+ MDString::get(VMContext, Directory)
};
return MDNode::get(VMContext, Pair);
}
@@ -274,7 +281,7 @@ DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
- FromTy
+ FromTy.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
@@ -294,7 +301,7 @@ DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
- PointeeTy
+ PointeeTy.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
@@ -308,12 +315,12 @@ DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy,
NULL, // Unused
NULL,
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
- ConstantInt::get(Type::getInt64Ty(VMContext), 0),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
- PointeeTy,
- Base
+ PointeeTy.getRef(),
+ Base.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
@@ -333,7 +340,7 @@ DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
- RTy
+ RTy.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
@@ -346,14 +353,14 @@ DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
File.getFileNode(),
- getNonCompileUnitScope(Context),
+ DIScope(getNonCompileUnitScope(Context)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
- Ty
+ Ty.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
@@ -366,56 +373,59 @@ DIDerivedType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_friend),
NULL,
- Ty,
+ Ty.getRef(),
NULL, // Name
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
- FriendTy
+ FriendTy.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
/// createInheritance - Create debugging information entry to establish
/// inheritance relationship between two types.
-DIDerivedType DIBuilder::createInheritance(
- DIType Ty, DIType BaseTy, uint64_t BaseOffset, unsigned Flags) {
+DIDerivedType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
+ uint64_t BaseOffset,
+ unsigned Flags) {
assert(Ty.isType() && "Unable to create inheritance");
// TAG_inheritance is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_inheritance),
NULL,
- Ty,
+ Ty.getRef(),
NULL, // Name
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
ConstantInt::get(Type::getInt64Ty(VMContext), BaseOffset),
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- BaseTy
+ BaseTy.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
/// createMemberType - Create debugging information entry for a member.
-DIDerivedType DIBuilder::createMemberType(
- DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
- uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
- unsigned Flags, DIType Ty) {
+DIDerivedType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
+ DIFile File, unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits,
+ uint64_t OffsetInBits, unsigned Flags,
+ DIType Ty) {
// TAG_member is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_member),
File.getFileNode(),
- getNonCompileUnitScope(Scope),
+ DIScope(getNonCompileUnitScope(Scope)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- Ty
+ Ty.getRef()
};
return DIDerivedType(MDNode::get(VMContext, Elts));
}
@@ -432,14 +442,14 @@ DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_member),
File.getFileNode(),
- getNonCompileUnitScope(Scope),
+ DIScope(getNonCompileUnitScope(Scope)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0/*SizeInBits*/),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0/*AlignInBits*/),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0/*OffsetInBits*/),
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- Ty,
+ Ty.getRef(),
Val
};
return DIDerivedType(MDNode::get(VMContext, Elts));
@@ -448,13 +458,11 @@ DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
/// createObjCIVar - Create debugging information entry for Objective-C
/// instance variable.
DIDerivedType
-DIBuilder::createObjCIVar(StringRef Name,
- DIFile File, unsigned LineNumber,
+DIBuilder::createObjCIVar(StringRef Name, DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
- uint64_t OffsetInBits, unsigned Flags,
- DIType Ty, StringRef PropertyName,
- StringRef GetterName, StringRef SetterName,
- unsigned PropertyAttributes) {
+ uint64_t OffsetInBits, unsigned Flags, DIType Ty,
+ StringRef PropertyName, StringRef GetterName,
+ StringRef SetterName, unsigned PropertyAttributes) {
// TAG_member is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_member),
@@ -477,12 +485,12 @@ DIBuilder::createObjCIVar(StringRef Name,
/// createObjCIVar - Create debugging information entry for Objective-C
/// instance variable.
-DIDerivedType
-DIBuilder::createObjCIVar(StringRef Name,
- DIFile File, unsigned LineNumber,
- uint64_t SizeInBits, uint64_t AlignInBits,
- uint64_t OffsetInBits, unsigned Flags,
- DIType Ty, MDNode *PropertyNode) {
+DIDerivedType DIBuilder::createObjCIVar(StringRef Name, DIFile File,
+ unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits,
+ uint64_t OffsetInBits, unsigned Flags,
+ DIType Ty, MDNode *PropertyNode) {
// TAG_member is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_member),
@@ -502,12 +510,10 @@ DIBuilder::createObjCIVar(StringRef Name,
/// createObjCProperty - Create debugging information entry for Objective-C
/// property.
-DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
- DIFile File, unsigned LineNumber,
- StringRef GetterName,
- StringRef SetterName,
- unsigned PropertyAttributes,
- DIType Ty) {
+DIObjCProperty
+DIBuilder::createObjCProperty(StringRef Name, DIFile File, unsigned LineNumber,
+ StringRef GetterName, StringRef SetterName,
+ unsigned PropertyAttributes, DIType Ty) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
MDString::get(VMContext, Name),
@@ -529,9 +535,9 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
unsigned ColumnNo) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_template_type_parameter),
- getNonCompileUnitScope(Context),
+ DIScope(getNonCompileUnitScope(Context)).getRef(),
MDString::get(VMContext, Name),
- Ty,
+ Ty.getRef(),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
@@ -547,9 +553,9 @@ DIBuilder::createTemplateValueParameter(unsigned Tag, DIDescriptor Context,
unsigned ColumnNo) {
Value *Elts[] = {
GetTagConstant(VMContext, Tag),
- getNonCompileUnitScope(Context),
+ DIScope(getNonCompileUnitScope(Context)).getRef(),
MDString::get(VMContext, Name),
- Ty,
+ Ty.getRef(),
Val,
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
@@ -598,30 +604,34 @@ DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
uint64_t OffsetInBits,
unsigned Flags, DIType DerivedFrom,
DIArray Elements,
- MDNode *VTableHolder,
- MDNode *TemplateParams) {
+ DIType VTableHolder,
+ MDNode *TemplateParams,
+ StringRef UniqueIdentifier) {
assert((!Context || Context.isScope() || Context.isType()) &&
"createClassType should be called with a valid Context");
// TAG_class_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
File.getFileNode(),
- getNonCompileUnitScope(Context),
+ DIScope(getNonCompileUnitScope(Context)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- DerivedFrom,
+ DerivedFrom.getRef(),
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- VTableHolder,
- TemplateParams
+ VTableHolder.getRef(),
+ TemplateParams,
+ UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
};
DICompositeType R(MDNode::get(VMContext, Elts));
assert(R.isCompositeType() &&
"createClassType should return a DICompositeType");
+ if (!UniqueIdentifier.empty())
+ retainType(R);
return R;
}
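A rough sketch of how a frontend might use the new UniqueIdentifier parameter added above; DIB, CU and File are assumed to have been set up elsewhere, and the mangled-name string is purely illustrative:

// Sketch only: DIB is a DIBuilder, CU its compile unit, File a DIFile.
// "_ZTS3Foo" is a made-up identifier standing in for a mangled type name.
DICompositeType FooTy = DIB.createClassType(
    CU, "Foo", File, /*LineNumber=*/1,
    /*SizeInBits=*/64, /*AlignInBits=*/64, /*OffsetInBits=*/0, /*Flags=*/0,
    /*DerivedFrom=*/DIType(), DIB.getOrCreateArray(ArrayRef<Value *>()),
    /*VTableHolder=*/DIType(), /*TemplateParams=*/NULL,
    /*UniqueIdentifier=*/"_ZTS3Foo");
// With a non-empty identifier the type is also passed to retainType(), so it
// stays reachable even when every user refers to it only by its MDString.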
@@ -634,27 +644,31 @@ DICompositeType DIBuilder::createStructType(DIDescriptor Context,
unsigned Flags, DIType DerivedFrom,
DIArray Elements,
unsigned RunTimeLang,
- MDNode *VTableHolder) {
+ DIType VTableHolder,
+ StringRef UniqueIdentifier) {
// TAG_structure_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_structure_type),
File.getFileNode(),
- getNonCompileUnitScope(Context),
+ DIScope(getNonCompileUnitScope(Context)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
- DerivedFrom,
+ DerivedFrom.getRef(),
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
- VTableHolder,
+ VTableHolder.getRef(),
NULL,
+ UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
};
DICompositeType R(MDNode::get(VMContext, Elts));
assert(R.isCompositeType() &&
"createStructType should return a DICompositeType");
+ if (!UniqueIdentifier.empty())
+ retainType(R);
return R;
}
@@ -664,45 +678,52 @@ DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
uint64_t SizeInBits,
uint64_t AlignInBits, unsigned Flags,
DIArray Elements,
- unsigned RunTimeLang) {
+ unsigned RunTimeLang,
+ StringRef UniqueIdentifier) {
// TAG_union_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_union_type),
File.getFileNode(),
- getNonCompileUnitScope(Scope),
+ DIScope(getNonCompileUnitScope(Scope)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
NULL,
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
- Constant::getNullValue(Type::getInt32Ty(VMContext)),
- NULL
+ NULL,
+ NULL,
+ UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
};
- return DICompositeType(MDNode::get(VMContext, Elts));
+ DICompositeType R(MDNode::get(VMContext, Elts));
+ if (!UniqueIdentifier.empty())
+ retainType(R);
+ return R;
}
/// createSubroutineType - Create subroutine type.
-DICompositeType
-DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
+DICompositeType DIBuilder::createSubroutineType(DIFile File,
+ DIArray ParameterTypes) {
// TAG_subroutine_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
Constant::getNullValue(Type::getInt32Ty(VMContext)),
- Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ NULL,
MDString::get(VMContext, ""),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0),
- ConstantInt::get(Type::getInt64Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
NULL,
ParameterTypes,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- Constant::getNullValue(Type::getInt32Ty(VMContext))
+ NULL,
+ NULL,
+ NULL // Type Identifier
};
return DICompositeType(MDNode::get(VMContext, Elts));
}
@@ -712,26 +733,30 @@ DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
DICompositeType DIBuilder::createEnumerationType(
DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements,
- DIType UnderlyingType) {
+ DIType UnderlyingType, StringRef UniqueIdentifier) {
// TAG_enumeration_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
File.getFileNode(),
- getNonCompileUnitScope(Scope),
+ DIScope(getNonCompileUnitScope(Scope)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- UnderlyingType,
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+ UnderlyingType.getRef(),
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- Constant::getNullValue(Type::getInt32Ty(VMContext))
+ NULL,
+ NULL,
+ UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
};
- MDNode *Node = MDNode::get(VMContext, Elts);
- AllEnumTypes.push_back(Node);
- return DICompositeType(Node);
+ DICompositeType CTy(MDNode::get(VMContext, Elts));
+ AllEnumTypes.push_back(CTy);
+ if (!UniqueIdentifier.empty())
+ retainType(CTy);
+ return CTy;
}
/// createArrayType - Create debugging information entry for an array.
@@ -743,15 +768,17 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
NULL, // Filename/Directory,
NULL, // Unused
MDString::get(VMContext, ""),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
ConstantInt::get(Type::getInt64Ty(VMContext), Size),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- Ty,
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+ Ty.getRef(),
Subscripts,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- Constant::getNullValue(Type::getInt32Ty(VMContext))
+ NULL,
+ NULL,
+ NULL // Type Identifier
};
return DICompositeType(MDNode::get(VMContext, Elts));
}
@@ -759,22 +786,23 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
/// createVectorType - Create debugging information entry for a vector.
DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
DIType Ty, DIArray Subscripts) {
-
// A vector is an array type with the FlagVector flag applied.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
NULL, // Filename/Directory,
NULL, // Unused
MDString::get(VMContext, ""),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
ConstantInt::get(Type::getInt64Ty(VMContext), Size),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
ConstantInt::get(Type::getInt32Ty(VMContext), DIType::FlagVector),
- Ty,
+ Ty.getRef(),
Subscripts,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- Constant::getNullValue(Type::getInt32Ty(VMContext))
+ NULL,
+ NULL,
+ NULL // Type Identifier
};
return DICompositeType(MDNode::get(VMContext, Elts));
}
@@ -787,17 +815,14 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
SmallVector<Value *, 9> Elts;
MDNode *N = Ty;
assert (N && "Unexpected input DIType!");
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- if (Value *V = N->getOperand(i))
- Elts.push_back(V);
- else
- Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
- }
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ Elts.push_back(N->getOperand(i));
unsigned CurFlags = Ty.getFlags();
CurFlags = CurFlags | DIType::FlagArtificial;
// Flags are stored at this slot.
+ // FIXME: Add an enum for this magic value.
Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
return DIType(MDNode::get(VMContext, Elts));
@@ -812,17 +837,14 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
SmallVector<Value *, 9> Elts;
MDNode *N = Ty;
assert (N && "Unexpected input DIType!");
- for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
- if (Value *V = N->getOperand(i))
- Elts.push_back(V);
- else
- Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
- }
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ Elts.push_back(N->getOperand(i));
unsigned CurFlags = Ty.getFlags();
CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
// Flags are stored at this slot.
+ // FIXME: Add an enum for this magic value.
Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
return DIType(MDNode::get(VMContext, Elts));
@@ -831,7 +853,7 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
/// retainType - Retain DIType in a module even if it is not referenced
/// through debug info anchors.
void DIBuilder::retainType(DIType T) {
- AllRetainTypes.push_back(T);
+ AllRetainTypes.push_back(TrackingVH<MDNode>(T));
}
/// createUnspecifiedParameter - Create unspecified type descriptor
@@ -845,31 +867,35 @@ DIDescriptor DIBuilder::createUnspecifiedParameter() {
/// createForwardDecl - Create a temporary forward-declared type that
/// can be RAUW'd if the full type is seen.
-DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
- DIDescriptor Scope, DIFile F,
- unsigned Line, unsigned RuntimeLang,
- uint64_t SizeInBits,
- uint64_t AlignInBits) {
+DICompositeType
+DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
+ DIFile F, unsigned Line, unsigned RuntimeLang,
+ uint64_t SizeInBits, uint64_t AlignInBits,
+ StringRef UniqueIdentifier) {
// Create a temporary MDNode.
Value *Elts[] = {
GetTagConstant(VMContext, Tag),
F.getFileNode(),
- getNonCompileUnitScope(Scope),
+ DIScope(getNonCompileUnitScope(Scope)).getRef(),
MDString::get(VMContext, Name),
ConstantInt::get(Type::getInt32Ty(VMContext), Line),
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext),
- DIDescriptor::FlagFwdDecl),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+ ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl),
NULL,
DIArray(),
- ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang)
+ ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang),
+ NULL,
+ NULL, // TemplateParams
+ UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
};
MDNode *Node = MDNode::getTemporary(VMContext, Elts);
- DIType RetTy(Node);
- assert(RetTy.isType() &&
+ DICompositeType RetTy(Node);
+ assert(RetTy.isCompositeType() &&
"createForwardDecl result should be a DIType");
+ if (!UniqueIdentifier.empty())
+ retainType(RetTy);
return RetTy;
}
@@ -895,10 +921,11 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
}
/// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
- unsigned LineNumber, DIType Ty, bool isLocalToUnit,
- Value *Val) {
+DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name,
+ StringRef LinkageName,
+ DIFile F, unsigned LineNumber,
+ DIType Ty, bool isLocalToUnit,
+ Value *Val) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_variable),
Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -920,19 +947,22 @@ createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
}
/// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
- DIType Ty, bool isLocalToUnit, Value *Val) {
+DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name, DIFile F,
+ unsigned LineNumber, DIType Ty,
+ bool isLocalToUnit,
+ Value *Val) {
return createGlobalVariable(Name, Name, F, LineNumber, Ty, isLocalToUnit,
Val);
}
/// createStaticVariable - Create a new descriptor for the specified static
/// variable.
-DIGlobalVariable DIBuilder::
-createStaticVariable(DIDescriptor Context, StringRef Name,
- StringRef LinkageName, DIFile F, unsigned LineNumber,
- DIType Ty, bool isLocalToUnit, Value *Val, MDNode *Decl) {
+DIGlobalVariable DIBuilder::createStaticVariable(DIDescriptor Context,
+ StringRef Name,
+ StringRef LinkageName,
+ DIFile F, unsigned LineNumber,
+ DIType Ty, bool isLocalToUnit,
+ Value *Val, MDNode *Decl) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_variable),
Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -1012,24 +1042,38 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
}
/// createFunction - Create a new descriptor for the specified function.
-DISubprogram DIBuilder::createFunction(DIDescriptor Context,
- StringRef Name,
- StringRef LinkageName,
- DIFile File, unsigned LineNo,
- DICompositeType Ty,
+/// FIXME: this is added for dragonegg. Once we update dragonegg
+/// to call the resolve function, this will be removed.
+DISubprogram DIBuilder::createFunction(DIScopeRef Context, StringRef Name,
+ StringRef LinkageName, DIFile File,
+ unsigned LineNo, DICompositeType Ty,
+ bool isLocalToUnit, bool isDefinition,
+ unsigned ScopeLine, unsigned Flags,
+ bool isOptimized, Function *Fn,
+ MDNode *TParams, MDNode *Decl) {
+ // dragonegg does not generate identifiers for types, so using an empty map
+ // to resolve the context should be fine.
+ DITypeIdentifierMap EmptyMap;
+ return createFunction(Context.resolve(EmptyMap), Name, LinkageName, File,
+ LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
+ Flags, isOptimized, Fn, TParams, Decl);
+}
+
+/// createFunction - Create a new descriptor for the specified function.
+DISubprogram DIBuilder::createFunction(DIDescriptor Context, StringRef Name,
+ StringRef LinkageName, DIFile File,
+ unsigned LineNo, DICompositeType Ty,
bool isLocalToUnit, bool isDefinition,
- unsigned ScopeLine,
- unsigned Flags, bool isOptimized,
- Function *Fn,
- MDNode *TParams,
- MDNode *Decl) {
+ unsigned ScopeLine, unsigned Flags,
+ bool isOptimized, Function *Fn,
+ MDNode *TParams, MDNode *Decl) {
assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
"function types should be subroutines");
Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
File.getFileNode(),
- getNonCompileUnitScope(Context),
+ DIScope(getNonCompileUnitScope(Context)).getRef(),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
MDString::get(VMContext, LinkageName),
@@ -1059,26 +1103,24 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
}
/// createMethod - Create a new descriptor for the specified C++ method.
-DISubprogram DIBuilder::createMethod(DIDescriptor Context,
- StringRef Name,
- StringRef LinkageName,
- DIFile F,
+DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name,
+ StringRef LinkageName, DIFile F,
unsigned LineNo, DICompositeType Ty,
- bool isLocalToUnit,
- bool isDefinition,
+ bool isLocalToUnit, bool isDefinition,
unsigned VK, unsigned VIndex,
- MDNode *VTableHolder,
- unsigned Flags,
- bool isOptimized,
- Function *Fn,
+ DIType VTableHolder, unsigned Flags,
+ bool isOptimized, Function *Fn,
MDNode *TParam) {
assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
"function types should be subroutines");
+ assert(getNonCompileUnitScope(Context) &&
+ "Methods should have a non-null context that is not the compile unit.");
Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
F.getFileNode(),
- getNonCompileUnitScope(Context),
+ DIScope(Context).getRef(),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
MDString::get(VMContext, LinkageName),
@@ -1086,9 +1128,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
Ty,
ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
- ConstantInt::get(Type::getInt32Ty(VMContext), (unsigned)VK),
+ ConstantInt::get(Type::getInt32Ty(VMContext), VK),
ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
- VTableHolder,
+ VTableHolder.getRef(),
ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
Fn,
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index d786d33..6bdc09e 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -629,6 +629,13 @@ Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const
return 0;
}
+unsigned DataLayout::getLargestLegalIntTypeSize() const {
+ unsigned MaxWidth = 0;
+ for (unsigned i = 0, e = (unsigned)LegalIntWidths.size(); i != e; ++i)
+ MaxWidth = std::max<unsigned>(MaxWidth, LegalIntWidths[i]);
+ return MaxWidth;
+}
+
uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
ArrayRef<Value *> Indices) const {
Type *Ty = ptrTy;
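A small usage sketch for the new getLargestLegalIntTypeSize() accessor; the layout string below is illustrative rather than tied to any real target:

// Sketch: the "n8:16:32:64" spec declares the native integer widths, and the
// new accessor simply reports the largest one declared (here 64).
DataLayout DL("e-p:64:64:64-n8:16:32:64");
unsigned MaxWidth = DL.getLargestLegalIntTypeSize();
// A layout string with no "n" spec would make this return 0.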
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index ff37542..70a756f 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -75,8 +75,8 @@ uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const {
return 0;
if (Elt < DbgNode->getNumOperands())
- if (ConstantInt *CI
- = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+ if (ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
return CI->getZExtValue();
return 0;
@@ -87,8 +87,8 @@ int64_t DIDescriptor::getInt64Field(unsigned Elt) const {
return 0;
if (Elt < DbgNode->getNumOperands())
- if (ConstantInt *CI
- = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+ if (ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
return CI->getSExtValue();
return 0;
@@ -104,7 +104,7 @@ GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
return 0;
if (Elt < DbgNode->getNumOperands())
- return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
+ return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
return 0;
}
@@ -113,7 +113,7 @@ Constant *DIDescriptor::getConstantField(unsigned Elt) const {
return 0;
if (Elt < DbgNode->getNumOperands())
- return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
+ return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
return 0;
}
@@ -122,7 +122,7 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const {
return 0;
if (Elt < DbgNode->getNumOperands())
- return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
+ return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
return 0;
}
@@ -131,19 +131,17 @@ void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
return;
if (Elt < DbgNode->getNumOperands()) {
- MDNode *Node = const_cast<MDNode*>(DbgNode);
+ MDNode *Node = const_cast<MDNode *>(DbgNode);
Node->replaceOperandWith(Elt, F);
}
}
unsigned DIVariable::getNumAddrElements() const {
- return DbgNode->getNumOperands()-8;
+ return DbgNode->getNumOperands() - 8;
}
/// getInlinedAt - If this variable is inlined then return inline location.
-MDNode *DIVariable::getInlinedAt() const {
- return getNodeField(DbgNode, 7);
-}
+MDNode *DIVariable::getInlinedAt() const { return getNodeField(DbgNode, 7); }
//===----------------------------------------------------------------------===//
// Predicates
@@ -152,7 +150,8 @@ MDNode *DIVariable::getInlinedAt() const {
/// isBasicType - Return true if the specified tag is legal for
/// DIBasicType.
bool DIDescriptor::isBasicType() const {
- if (!DbgNode) return false;
+ if (!DbgNode)
+ return false;
switch (getTag()) {
case dwarf::DW_TAG_base_type:
case dwarf::DW_TAG_unspecified_type:
@@ -164,7 +163,8 @@ bool DIDescriptor::isBasicType() const {
/// isDerivedType - Return true if the specified tag is legal for DIDerivedType.
bool DIDescriptor::isDerivedType() const {
- if (!DbgNode) return false;
+ if (!DbgNode)
+ return false;
switch (getTag()) {
case dwarf::DW_TAG_typedef:
case dwarf::DW_TAG_pointer_type:
@@ -187,7 +187,8 @@ bool DIDescriptor::isDerivedType() const {
/// isCompositeType - Return true if the specified tag is legal for
/// DICompositeType.
bool DIDescriptor::isCompositeType() const {
- if (!DbgNode) return false;
+ if (!DbgNode)
+ return false;
switch (getTag()) {
case dwarf::DW_TAG_array_type:
case dwarf::DW_TAG_structure_type:
@@ -203,7 +204,8 @@ bool DIDescriptor::isCompositeType() const {
/// isVariable - Return true if the specified tag is legal for DIVariable.
bool DIDescriptor::isVariable() const {
- if (!DbgNode) return false;
+ if (!DbgNode)
+ return false;
switch (getTag()) {
case dwarf::DW_TAG_auto_variable:
case dwarf::DW_TAG_arg_variable:
@@ -240,17 +242,19 @@ bool DIDescriptor::isUnspecifiedParameter() const {
/// isScope - Return true if the specified tag is one of the scope
/// related tag.
bool DIDescriptor::isScope() const {
- if (!DbgNode) return false;
+ if (!DbgNode)
+ return false;
switch (getTag()) {
case dwarf::DW_TAG_compile_unit:
case dwarf::DW_TAG_lexical_block:
case dwarf::DW_TAG_subprogram:
case dwarf::DW_TAG_namespace:
+ case dwarf::DW_TAG_file_type:
return true;
default:
break;
}
- return false;
+ return isType();
}
/// isTemplateTypeParameter - Return true if the specified tag is
@@ -286,13 +290,13 @@ bool DIDescriptor::isNameSpace() const {
/// lexical block with an extra file.
bool DIDescriptor::isLexicalBlockFile() const {
return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
- (DbgNode->getNumOperands() == 3);
+ (DbgNode->getNumOperands() == 3);
}
/// isLexicalBlock - Return true if the specified tag is DW_TAG_lexical_block.
bool DIDescriptor::isLexicalBlock() const {
return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
- (DbgNode->getNumOperands() > 3);
+ (DbgNode->getNumOperands() > 3);
}
/// isSubrange - Return true if the specified tag is DW_TAG_subrange_type.
@@ -339,10 +343,10 @@ void DIType::replaceAllUsesWith(DIDescriptor &D) {
// this detail by allowing a value to be replaced with replaceAllUsesWith()
// itself.
if (DbgNode != D) {
- MDNode *Node = const_cast<MDNode*>(DbgNode);
+ MDNode *Node = const_cast<MDNode *>(DbgNode);
const MDNode *DN = D;
const Value *V = cast_or_null<Value>(DN);
- Node->replaceAllUsesWith(const_cast<Value*>(V));
+ Node->replaceAllUsesWith(const_cast<Value *>(V));
MDNode::deleteTemporary(Node);
}
}
@@ -359,31 +363,14 @@ void DIType::replaceAllUsesWith(MDNode *D) {
// this detail by allowing a value to be replaced with replaceAllUsesWith()
// itself.
if (DbgNode != D) {
- MDNode *Node = const_cast<MDNode*>(DbgNode);
+ MDNode *Node = const_cast<MDNode *>(DbgNode);
const MDNode *DN = D;
const Value *V = cast_or_null<Value>(DN);
- Node->replaceAllUsesWith(const_cast<Value*>(V));
+ Node->replaceAllUsesWith(const_cast<Value *>(V));
MDNode::deleteTemporary(Node);
}
}
-/// isUnsignedDIType - Return true if type encoding is unsigned.
-bool DIType::isUnsignedDIType() {
- DIDerivedType DTy(DbgNode);
- if (DTy.Verify())
- return DTy.getTypeDerivedFrom().isUnsignedDIType();
-
- DIBasicType BTy(DbgNode);
- if (BTy.Verify()) {
- unsigned Encoding = BTy.getEncoding();
- if (Encoding == dwarf::DW_ATE_unsigned ||
- Encoding == dwarf::DW_ATE_unsigned_char ||
- Encoding == dwarf::DW_ATE_boolean)
- return true;
- }
- return false;
-}
-
/// Verify - Verify that a compile unit is well formed.
bool DICompileUnit::Verify() const {
if (!isCompileUnit())
@@ -413,22 +400,53 @@ static bool fieldIsMDNode(const MDNode *DbgNode, unsigned Elt) {
// FIXME: This function should return true if the field is null or the field
// is indeed an MDNode: return !Fld || isa<MDNode>(Fld).
Value *Fld = getField(DbgNode, Elt);
- if (Fld && isa<MDString>(Fld) &&
- !cast<MDString>(Fld)->getString().empty())
+ if (Fld && isa<MDString>(Fld) && !cast<MDString>(Fld)->getString().empty())
return false;
return true;
}
+/// Check if a field at position Elt of an MDNode is an MDString.
+static bool fieldIsMDString(const MDNode *DbgNode, unsigned Elt) {
+ Value *Fld = getField(DbgNode, Elt);
+ return !Fld || isa<MDString>(Fld);
+}
+
+/// Check if a value can be a reference to a type.
+static bool isTypeRef(const Value *Val) {
+ return !Val ||
+ (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
+ (isa<MDNode>(Val) && DIType(cast<MDNode>(Val)).isType());
+}
+
+/// Check if a field at position Elt of an MDNode can be a reference to a type.
+static bool fieldIsTypeRef(const MDNode *DbgNode, unsigned Elt) {
+ Value *Fld = getField(DbgNode, Elt);
+ return isTypeRef(Fld);
+}
+
+/// Check if a value can be a ScopeRef.
+static bool isScopeRef(const Value *Val) {
+ return !Val ||
+ (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
+ (isa<MDNode>(Val) && DIScope(cast<MDNode>(Val)).isScope());
+}
+
+/// Check if a field at position Elt of an MDNode can be a ScopeRef.
+static bool fieldIsScopeRef(const MDNode *DbgNode, unsigned Elt) {
+ Value *Fld = getField(DbgNode, Elt);
+ return isScopeRef(Fld);
+}
+
/// Verify - Verify that a type descriptor is well formed.
bool DIType::Verify() const {
if (!isType())
return false;
// Make sure Context @ field 2 is MDNode.
- if (!fieldIsMDNode(DbgNode, 2))
+ if (!fieldIsScopeRef(DbgNode, 2))
return false;
// FIXME: Sink this into the various subclass verifies.
- unsigned Tag = getTag();
+ uint16_t Tag = getTag();
if (!isBasicType() && Tag != dwarf::DW_TAG_const_type &&
Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type &&
Tag != dwarf::DW_TAG_ptr_to_member_type &&
@@ -460,12 +478,12 @@ bool DIBasicType::Verify() const {
/// Verify - Verify that a derived type descriptor is well formed.
bool DIDerivedType::Verify() const {
- // Make sure DerivedFrom @ field 9 is MDNode.
- if (!fieldIsMDNode(DbgNode, 9))
+ // Make sure DerivedFrom @ field 9 is TypeRef.
+ if (!fieldIsTypeRef(DbgNode, 9))
return false;
if (getTag() == dwarf::DW_TAG_ptr_to_member_type)
- // Make sure ClassType @ field 10 is MDNode.
- if (!fieldIsMDNode(DbgNode, 10))
+ // Make sure ClassType @ field 10 is a TypeRef.
+ if (!fieldIsTypeRef(DbgNode, 10))
return false;
return isDerivedType() && DbgNode->getNumOperands() >= 10 &&
@@ -477,13 +495,17 @@ bool DICompositeType::Verify() const {
if (!isCompositeType())
return false;
- // Make sure DerivedFrom @ field 9 and ContainingType @ field 12 are MDNodes.
- if (!fieldIsMDNode(DbgNode, 9))
+ // Make sure DerivedFrom @ field 9 and ContainingType @ field 12 are TypeRef.
+ if (!fieldIsTypeRef(DbgNode, 9))
return false;
- if (!fieldIsMDNode(DbgNode, 12))
+ if (!fieldIsTypeRef(DbgNode, 12))
return false;
- return DbgNode->getNumOperands() >= 10 && DbgNode->getNumOperands() <= 14;
+ // Make sure the type identifier at field 14 is an MDString; it can be null.
+ if (!fieldIsMDString(DbgNode, 14))
+ return false;
+
+ return DbgNode->getNumOperands() == 15;
}
/// Verify - Verify that a subprogram descriptor is well formed.
@@ -491,13 +513,13 @@ bool DISubprogram::Verify() const {
if (!isSubprogram())
return false;
- // Make sure context @ field 2 and type @ field 7 are MDNodes.
- if (!fieldIsMDNode(DbgNode, 2))
+ // Make sure context @ field 2 is a ScopeRef and type @ field 7 is an MDNode.
+ if (!fieldIsScopeRef(DbgNode, 2))
return false;
if (!fieldIsMDNode(DbgNode, 7))
return false;
// Containing type @ field 12.
- if (!fieldIsMDNode(DbgNode, 12))
+ if (!fieldIsTypeRef(DbgNode, 12))
return false;
return DbgNode->getNumOperands() == 20;
}
@@ -550,9 +572,7 @@ bool DINameSpace::Verify() const {
}
/// \brief Retrieve the MDNode for the directory/file pair.
-MDNode *DIFile::getFileNode() const {
- return getNodeField(DbgNode, 1);
-}
+MDNode *DIFile::getFileNode() const { return getNodeField(DbgNode, 1); }
/// \brief Verify that the file descriptor is well formed.
bool DIFile::Verify() const {
@@ -595,56 +615,77 @@ bool DIImportedEntity::Verify() const {
(DbgNode->getNumOperands() == 4 || DbgNode->getNumOperands() == 5);
}
-/// getOriginalTypeSize - If this type is derived from a base type then
-/// return base type size.
-uint64_t DIDerivedType::getOriginalTypeSize() const {
- unsigned Tag = getTag();
-
- if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
- Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
- Tag != dwarf::DW_TAG_restrict_type)
- return getSizeInBits();
-
- DIType BaseType = getTypeDerivedFrom();
-
- // If this type is not derived from any type then take conservative approach.
- if (!BaseType.isValid())
- return getSizeInBits();
-
- // If this is a derived type, go ahead and get the base type, unless it's a
- // reference then it's just the size of the field. Pointer types have no need
- // of this since they're a different type of qualification on the type.
- if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
- BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
- return getSizeInBits();
-
- if (BaseType.isDerivedType())
- return DIDerivedType(BaseType).getOriginalTypeSize();
-
- return BaseType.getSizeInBits();
-}
-
/// getObjCProperty - Return property node, if this ivar is associated with one.
MDNode *DIDerivedType::getObjCProperty() const {
return getNodeField(DbgNode, 10);
}
+MDString *DICompositeType::getIdentifier() const {
+ return cast_or_null<MDString>(getField(DbgNode, 14));
+}
+
+#ifndef NDEBUG
+static void VerifySubsetOf(const MDNode *LHS, const MDNode *RHS) {
+ for (unsigned i = 0; i != LHS->getNumOperands(); ++i) {
+ // Skip the 'empty' list (that's a single i32 0, rather than truly empty).
+ if (i == 0 && isa<ConstantInt>(LHS->getOperand(i)))
+ continue;
+ const MDNode *E = cast<MDNode>(LHS->getOperand(i));
+ bool found = false;
+ for (unsigned j = 0; !found && j != RHS->getNumOperands(); ++j)
+ found = E == RHS->getOperand(j);
+ assert(found && "Losing a member during member list replacement");
+ }
+}
+#endif
+
/// \brief Set the array of member DITypes.
void DICompositeType::setTypeArray(DIArray Elements, DIArray TParams) {
- assert((!TParams || DbgNode->getNumOperands() == 14) &&
+ assert((!TParams || DbgNode->getNumOperands() == 15) &&
"If you're setting the template parameters this should include a slot "
"for that!");
TrackingVH<MDNode> N(*this);
- N->replaceOperandWith(10, Elements);
+ if (Elements) {
+#ifndef NDEBUG
+ // Check that the new list of members contains all the old members as well.
+ if (const MDNode *El = cast_or_null<MDNode>(N->getOperand(10)))
+ VerifySubsetOf(El, Elements);
+#endif
+ N->replaceOperandWith(10, Elements);
+ }
if (TParams)
N->replaceOperandWith(13, TParams);
DbgNode = N;
}
+void DICompositeType::addMember(DIDescriptor D) {
+ SmallVector<llvm::Value *, 16> M;
+ DIArray OrigM = getTypeArray();
+ unsigned Elements = OrigM.getNumElements();
+ if (Elements == 1 && !OrigM.getElement(0))
+ Elements = 0;
+ M.reserve(Elements + 1);
+ for (unsigned i = 0; i != Elements; ++i)
+ M.push_back(OrigM.getElement(i));
+ M.push_back(D);
+ setTypeArray(DIArray(MDNode::get(DbgNode->getContext(), M)));
+}
+
+/// Generate a reference to this DIScope. Uses the type identifier instead
+/// of the actual MDNode if possible, to help type uniquing.
+DIScopeRef DIScope::getRef() const {
+ if (!isCompositeType())
+ return DIScopeRef(*this);
+ DICompositeType DTy(DbgNode);
+ if (!DTy.getIdentifier())
+ return DIScopeRef(*this);
+ return DIScopeRef(DTy.getIdentifier());
+}
+
/// \brief Set the containing type.
void DICompositeType::setContainingType(DICompositeType ContainingType) {
TrackingVH<MDNode> N(*this);
- N->replaceOperandWith(12, ContainingType);
+ N->replaceOperandWith(12, ContainingType.getRef());
DbgNode = N;
}
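The pattern behind these reference-based operands, sketched for clarity; Ty is assumed to be a composite type node created through DIBuilder:

// Sketch: getRef() prefers the MDString identifier over the node itself.
DIScopeRef Ref = DIScope(Ty).getRef();
// If Ty carries a type identifier (operand 14), Ref wraps that MDString and
// must later be resolved through a DITypeIdentifierMap; otherwise it simply
// wraps the defining MDNode. setContainingType and the DIBuilder create*
// methods above store exactly this kind of reference in the relevant slots.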
@@ -674,7 +715,7 @@ bool DISubprogram::describes(const Function *F) {
}
unsigned DISubprogram::isOptimized() const {
- assert (DbgNode && "Invalid subprogram descriptor!");
+ assert(DbgNode && "Invalid subprogram descriptor!");
if (DbgNode->getNumOperands() == 15)
return getUnsignedField(14);
return 0;
@@ -694,25 +735,39 @@ Value *DITemplateValueParameter::getValue() const {
// If the current node has a parent scope then return that,
// else return an empty scope.
-DIScope DIScope::getContext() const {
+DIScopeRef DIScope::getContext() const {
if (isType())
return DIType(DbgNode).getContext();
if (isSubprogram())
- return DISubprogram(DbgNode).getContext();
+ return DIScopeRef(DISubprogram(DbgNode).getContext());
if (isLexicalBlock())
- return DILexicalBlock(DbgNode).getContext();
+ return DIScopeRef(DILexicalBlock(DbgNode).getContext());
if (isLexicalBlockFile())
- return DILexicalBlockFile(DbgNode).getContext();
+ return DIScopeRef(DILexicalBlockFile(DbgNode).getContext());
if (isNameSpace())
- return DINameSpace(DbgNode).getContext();
+ return DIScopeRef(DINameSpace(DbgNode).getContext());
assert((isFile() || isCompileUnit()) && "Unhandled type of scope.");
- return DIScope();
+ return DIScopeRef(NULL);
+}
+
+// If the scope node has a name, return that, else return an empty string.
+StringRef DIScope::getName() const {
+ if (isType())
+ return DIType(DbgNode).getName();
+ if (isSubprogram())
+ return DISubprogram(DbgNode).getName();
+ if (isNameSpace())
+ return DINameSpace(DbgNode).getName();
+ assert((isLexicalBlock() || isLexicalBlockFile() || isFile() ||
+ isCompileUnit()) &&
+ "Unhandled type of scope.");
+ return StringRef();
}
StringRef DIScope::getFilename() const {
@@ -748,7 +803,6 @@ DIArray DICompileUnit::getSubprograms() const {
return DIArray(getNodeField(DbgNode, 9));
}
-
DIArray DICompileUnit::getGlobalVariables() const {
if (!DbgNode || DbgNode->getNumOperands() < 13)
return DIArray();
@@ -813,8 +867,7 @@ DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
SmallVector<Value *, 16> Elts;
// Insert inlined scope as 7th element.
for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
- i == 7 ? Elts.push_back(InlinedScope) :
- Elts.push_back(DV->getOperand(i));
+ i == 7 ? Elts.push_back(InlinedScope) : Elts.push_back(DV->getOperand(i));
return DIVariable(MDNode::get(VMContext, Elts));
}
@@ -823,9 +876,8 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
SmallVector<Value *, 16> Elts;
// Insert inlined scope as 7th element.
for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
- i == 7 ?
- Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))):
- Elts.push_back(DV->getOperand(i));
+ i == 7 ? Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)))
+ : Elts.push_back(DV->getOperand(i));
return DIVariable(MDNode::get(VMContext, Elts));
}
@@ -849,23 +901,42 @@ DICompositeType llvm::getDICompositeType(DIType T) {
if (T.isCompositeType())
return DICompositeType(T);
- if (T.isDerivedType())
- return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom());
+ if (T.isDerivedType()) {
+ // This function is currently used by dragonegg and dragonegg does
+ // not generate identifier for types, so using an empty map to resolve
+ // not generate identifiers for types, so using an empty map to resolve
+ DITypeIdentifierMap EmptyMap;
+ return getDICompositeType(
+ DIDerivedType(T).getTypeDerivedFrom().resolve(EmptyMap));
+ }
return DICompositeType();
}
-/// isSubprogramContext - Return true if Context is either a subprogram
-/// or another context nested inside a subprogram.
-bool llvm::isSubprogramContext(const MDNode *Context) {
- if (!Context)
- return false;
- DIDescriptor D(Context);
- if (D.isSubprogram())
- return true;
- if (D.isType())
- return isSubprogramContext(DIType(Context).getContext());
- return false;
+/// Update DITypeIdentifierMap by going through retained types of each CU.
+DITypeIdentifierMap
+llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) {
+ DITypeIdentifierMap Map;
+ for (unsigned CUi = 0, CUe = CU_Nodes->getNumOperands(); CUi != CUe; ++CUi) {
+ DICompileUnit CU(CU_Nodes->getOperand(CUi));
+ DIArray Retain = CU.getRetainedTypes();
+ for (unsigned Ti = 0, Te = Retain.getNumElements(); Ti != Te; ++Ti) {
+ if (!Retain.getElement(Ti).isCompositeType())
+ continue;
+ DICompositeType Ty(Retain.getElement(Ti));
+ if (MDString *TypeId = Ty.getIdentifier()) {
+ // Definition has priority over declaration.
+ // Try to insert (TypeId, Ty) to Map.
+ std::pair<DITypeIdentifierMap::iterator, bool> P =
+ Map.insert(std::make_pair(TypeId, Ty));
+ // If TypeId already exists in Map and this is a definition, replace
+ // whatever we had (declaration or definition) with the definition.
+ if (!P.second && !Ty.isForwardDecl())
+ P.first->second = Ty;
+ }
+ }
+ }
+ return Map;
}
//===----------------------------------------------------------------------===//
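A sketch of how a consumer builds and queries the identifier map; M is assumed to be a Module that already carries llvm.dbg.cu metadata, and DTy a DIDerivedType taken from it:

// Sketch: build the map once per module, then resolve string-based refs.
DITypeIdentifierMap TypeIdentifierMap;
if (NamedMDNode *CUNodes = M.getNamedMetadata("llvm.dbg.cu"))
  TypeIdentifierMap = generateDITypeIdentifierMap(CUNodes);
// An MDString reference is looked up in the map; an MDNode reference is
// returned unchanged, so resolve() is safe to call on either form.
DIType BaseTy = DTy.getTypeDerivedFrom().resolve(TypeIdentifierMap);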
@@ -879,10 +950,21 @@ void DebugInfoFinder::reset() {
TYs.clear();
Scopes.clear();
NodesSeen.clear();
+ TypeIdentifierMap.clear();
+ TypeMapInitialized = false;
+}
+
+void DebugInfoFinder::InitializeTypeMap(const Module &M) {
+ if (!TypeMapInitialized)
+ if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
+ TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
+ TypeMapInitialized = true;
+ }
}
/// processModule - Process entire module and collect debug info.
void DebugInfoFinder::processModule(const Module &M) {
+ InitializeTypeMap(M);
if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
DICompileUnit CU(CU_Nodes->getOperand(i));
@@ -904,27 +986,38 @@ void DebugInfoFinder::processModule(const Module &M) {
DIArray RetainedTypes = CU.getRetainedTypes();
for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i)
processType(DIType(RetainedTypes.getElement(i)));
- // FIXME: We really shouldn't be bailing out after visiting just one CU
- return;
+ DIArray Imports = CU.getImportedEntities();
+ for (unsigned i = 0, e = Imports.getNumElements(); i != e; ++i) {
+ DIImportedEntity Import = DIImportedEntity(Imports.getElement(i));
+ DIDescriptor Entity = Import.getEntity();
+ if (Entity.isType())
+ processType(DIType(Entity));
+ else if (Entity.isSubprogram())
+ processSubprogram(DISubprogram(Entity));
+ else if (Entity.isNameSpace())
+ processScope(DINameSpace(Entity).getContext());
+ }
}
}
}
/// processLocation - Process DILocation.
-void DebugInfoFinder::processLocation(DILocation Loc) {
- if (!Loc) return;
+void DebugInfoFinder::processLocation(const Module &M, DILocation Loc) {
+ if (!Loc)
+ return;
+ InitializeTypeMap(M);
processScope(Loc.getScope());
- processLocation(Loc.getOrigLocation());
+ processLocation(M, Loc.getOrigLocation());
}
/// processType - Process DIType.
void DebugInfoFinder::processType(DIType DT) {
if (!addType(DT))
return;
- processScope(DT.getContext());
+ processScope(DT.getContext().resolve(TypeIdentifierMap));
if (DT.isCompositeType()) {
DICompositeType DCT(DT);
- processType(DCT.getTypeDerivedFrom());
+ processType(DCT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
DIArray DA = DCT.getTypeArray();
for (unsigned i = 0, e = DA.getNumElements(); i != e; ++i) {
DIDescriptor D = DA.getElement(i);
@@ -935,7 +1028,7 @@ void DebugInfoFinder::processType(DIType DT) {
}
} else if (DT.isDerivedType()) {
DIDerivedType DDT(DT);
- processType(DDT.getTypeDerivedFrom());
+ processType(DDT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
}
}
@@ -975,8 +1068,7 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
else if (Context.isLexicalBlockFile()) {
DILexicalBlockFile DBF = DILexicalBlockFile(Context);
return processLexicalBlock(DILexicalBlock(DBF.getScope()));
- }
- else
+ } else
return processSubprogram(DISubprogram(Context));
}
@@ -984,14 +1076,30 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
void DebugInfoFinder::processSubprogram(DISubprogram SP) {
if (!addSubprogram(SP))
return;
- processScope(SP.getContext());
+ processScope(SP.getContext().resolve(TypeIdentifierMap));
processType(SP.getType());
+ DIArray TParams = SP.getTemplateParams();
+ for (unsigned I = 0, E = TParams.getNumElements(); I != E; ++I) {
+ DIDescriptor Element = TParams.getElement(I);
+ if (Element.isTemplateTypeParameter()) {
+ DITemplateTypeParameter TType(Element);
+ processScope(TType.getContext().resolve(TypeIdentifierMap));
+ processType(TType.getType().resolve(TypeIdentifierMap));
+ } else if (Element.isTemplateValueParameter()) {
+ DITemplateValueParameter TVal(Element);
+ processScope(TVal.getContext().resolve(TypeIdentifierMap));
+ processType(TVal.getType().resolve(TypeIdentifierMap));
+ }
+ }
}
/// processDeclare - Process DbgDeclareInst.
-void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
+void DebugInfoFinder::processDeclare(const Module &M,
+ const DbgDeclareInst *DDI) {
MDNode *N = dyn_cast<MDNode>(DDI->getVariable());
- if (!N) return;
+ if (!N)
+ return;
+ InitializeTypeMap(M);
DIDescriptor DV(N);
if (!DV.isVariable())
@@ -1003,9 +1111,11 @@ void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
processType(DIVariable(N).getType());
}
-void DebugInfoFinder::processValue(const DbgValueInst *DVI) {
+void DebugInfoFinder::processValue(const Module &M, const DbgValueInst *DVI) {
MDNode *N = dyn_cast<MDNode>(DVI->getVariable());
- if (!N) return;
+ if (!N)
+ return;
+ InitializeTypeMap(M);
DIDescriptor DV(N);
if (!DV.isVariable())
@@ -1083,12 +1193,14 @@ bool DebugInfoFinder::addScope(DIScope Scope) {
/// dump - Print descriptor to dbgs() with a newline.
void DIDescriptor::dump() const {
- print(dbgs()); dbgs() << '\n';
+ print(dbgs());
+ dbgs() << '\n';
}
/// print - Print descriptor.
void DIDescriptor::print(raw_ostream &OS) const {
- if (!DbgNode) return;
+ if (!DbgNode)
+ return;
if (const char *Tag = dwarf::TagString(getTag()))
OS << "[ " << Tag << " ]";
@@ -1150,7 +1262,8 @@ void DIEnumerator::printInternal(raw_ostream &OS) const {
}
void DIType::printInternal(raw_ostream &OS) const {
- if (!DbgNode) return;
+ if (!DbgNode)
+ return;
StringRef Res = getName();
if (!Res.empty())
@@ -1158,13 +1271,11 @@ void DIType::printInternal(raw_ostream &OS) const {
// TODO: Print context?
- OS << " [line " << getLineNumber()
- << ", size " << getSizeInBits()
- << ", align " << getAlignInBits()
- << ", offset " << getOffsetInBits();
+ OS << " [line " << getLineNumber() << ", size " << getSizeInBits()
+ << ", align " << getAlignInBits() << ", offset " << getOffsetInBits();
if (isBasicType())
if (const char *Enc =
- dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
+ dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
OS << ", enc " << Enc;
OS << "]";
@@ -1260,16 +1371,15 @@ void DIObjCProperty::printInternal(raw_ostream &OS) const {
if (!Name.empty())
OS << " [" << Name << ']';
- OS << " [line " << getLineNumber()
- << ", properties " << getUnsignedField(6) << ']';
+ OS << " [line " << getLineNumber() << ", properties " << getUnsignedField(6)
+ << ']';
}
static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
const LLVMContext &Ctx) {
- if (!DL.isUnknown()) { // Print source line info.
+ if (!DL.isUnknown()) { // Print source line info.
DIScope Scope(DL.getScope(Ctx));
- assert(Scope.isScope() &&
- "Scope of a DebugLoc should be a DIScope.");
+ assert(Scope.isScope() && "Scope of a DebugLoc should be a DIScope.");
// Omit the directory, because it's likely to be long and uninteresting.
CommentOS << Scope.getFilename();
CommentOS << ':' << DL.getLine();
@@ -1298,3 +1408,81 @@ void DIVariable::printExtendedName(raw_ostream &OS) const {
}
}
}
+
+/// Specialize constructor to make sure it has the correct type.
+template <> DIRef<DIScope>::DIRef(const Value *V) : Val(V) {
+ assert(isScopeRef(V) && "DIScopeRef should be a MDString or MDNode");
+}
+template <> DIRef<DIType>::DIRef(const Value *V) : Val(V) {
+ assert(isTypeRef(V) && "DITypeRef should be a MDString or MDNode");
+}
+
+/// Specialize getFieldAs to handle fields that are references to DIScopes.
+template <>
+DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const {
+ return DIScopeRef(getField(DbgNode, Elt));
+}
+/// Specialize getFieldAs to handle fields that are references to DITypes.
+template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const {
+ return DITypeRef(getField(DbgNode, Elt));
+}
+
+/// Strip debug info in the module if it exists.
+/// To do this, we remove all calls to the debugger intrinsics and any named
+/// metadata for debugging. We also remove debug locations for instructions.
+/// Return true if module is modified.
+bool llvm::StripDebugInfo(Module &M) {
+
+ bool Changed = false;
+
+ // Remove all of the calls to the debugger intrinsics, and remove them from
+ // the module.
+ if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
+ while (!Declare->use_empty()) {
+ CallInst *CI = cast<CallInst>(Declare->use_back());
+ CI->eraseFromParent();
+ }
+ Declare->eraseFromParent();
+ Changed = true;
+ }
+
+ if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
+ while (!DbgVal->use_empty()) {
+ CallInst *CI = cast<CallInst>(DbgVal->use_back());
+ CI->eraseFromParent();
+ }
+ DbgVal->eraseFromParent();
+ Changed = true;
+ }
+
+ for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
+ NME = M.named_metadata_end(); NMI != NME;) {
+ NamedMDNode *NMD = NMI;
+ ++NMI;
+ if (NMD->getName().startswith("llvm.dbg.")) {
+ NMD->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
+ for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
+ ++FI)
+ for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+ ++BI) {
+ if (!BI->getDebugLoc().isUnknown()) {
+ Changed = true;
+ BI->setDebugLoc(DebugLoc());
+ }
+ }
+
+ return Changed;
+}
+
+/// Return Debug Info Metadata Version by checking module flags.
+unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
+ Value *Val = M.getModuleFlag("Debug Info Version");
+ if (!Val)
+ return 0;
+ return cast<ConstantInt>(Val)->getZExtValue();
+}
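The two new module-level helpers are typically paired by passes that want to drop stale debug info; a rough sketch, where SupportedVersion is a placeholder for whatever metadata version the reader understands:

// Sketch: strip debug info when the recorded version is absent or too old.
static bool maybeStripDebugInfo(Module &M, unsigned SupportedVersion) {
  if (getDebugMetadataVersionFromModule(M) < SupportedVersion)
    return StripDebugInfo(M); // drops dbg intrinsics, llvm.dbg.* named MD,
                              // and per-instruction debug locations
  return false;
}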
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index bf9d949..e8a2402 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -276,6 +276,9 @@ void Function::dropAllReferences() {
// blockaddresses, but BasicBlock's destructor takes care of those.
while (!BasicBlocks.empty())
BasicBlocks.begin()->eraseFromParent();
+
+ // Prefix data is stored in a side table.
+ setPrefixData(0);
}
void Function::addAttribute(unsigned i, Attribute::AttrKind attr) {
@@ -351,6 +354,10 @@ void Function::copyAttributesFrom(const GlobalValue *Src) {
setGC(SrcF->getGC());
else
clearGC();
+ if (SrcF->hasPrefixData())
+ setPrefixData(SrcF->getPrefixData());
+ else
+ setPrefixData(0);
}
/// getIntrinsicID - This method returns the ID number of the specified
@@ -446,7 +453,9 @@ enum IIT_Info {
IIT_STRUCT5 = 22,
IIT_EXTEND_VEC_ARG = 23,
IIT_TRUNC_VEC_ARG = 24,
- IIT_ANYPTR = 25
+ IIT_ANYPTR = 25,
+ IIT_V1 = 26,
+ IIT_VARARG = 27
};
@@ -460,6 +469,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
case IIT_Done:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0));
return;
+ case IIT_VARARG:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::VarArg, 0));
+ return;
case IIT_MMX:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0));
return;
@@ -490,6 +502,10 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
case IIT_I64:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64));
return;
+ case IIT_V1:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1));
+ DecodeIITType(NextElt, Infos, OutputTable);
+ return;
case IIT_V2:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2));
DecodeIITType(NextElt, Infos, OutputTable);
@@ -601,6 +617,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
switch (D.Kind) {
case IITDescriptor::Void: return Type::getVoidTy(Context);
+ case IITDescriptor::VarArg: return Type::getVoidTy(Context);
case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
case IITDescriptor::Half: return Type::getHalfTy(Context);
@@ -720,3 +737,32 @@ bool Function::callsFunctionThatReturnsTwice() const {
return false;
}
+
+Constant *Function::getPrefixData() const {
+ assert(hasPrefixData());
+ const LLVMContextImpl::PrefixDataMapTy &PDMap =
+ getContext().pImpl->PrefixDataMap;
+ assert(PDMap.find(this) != PDMap.end());
+ return cast<Constant>(PDMap.find(this)->second->getReturnValue());
+}
+
+void Function::setPrefixData(Constant *PrefixData) {
+ if (!PrefixData && !hasPrefixData())
+ return;
+
+ unsigned SCData = getSubclassDataFromValue();
+ LLVMContextImpl::PrefixDataMapTy &PDMap = getContext().pImpl->PrefixDataMap;
+ ReturnInst *&PDHolder = PDMap[this];
+ if (PrefixData) {
+ if (PDHolder)
+ PDHolder->setOperand(0, PrefixData);
+ else
+ PDHolder = ReturnInst::Create(getContext(), PrefixData);
+ SCData |= 2;
+ } else {
+ delete PDHolder;
+ PDMap.erase(this);
+ SCData &= ~2;
+ }
+ setValueSubclassData(SCData);
+}
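Because prefix data lives in a side table keyed by the Function rather than in the Function object itself, dropAllReferences and copyAttributesFrom above have to manage it explicitly. A small usage sketch, with an arbitrary constant standing in for real prefix data and F assumed to be a Function*:

// Sketch: attach a 32-bit constant as prefix data, then read it back.
LLVMContext &Ctx = F->getContext();
F->setPrefixData(ConstantInt::get(Type::getInt32Ty(Ctx), 0xDEADBEEF));
if (F->hasPrefixData()) {
  Constant *PD = F->getPrefixData(); // the ConstantInt installed above
  (void)PD;
}
// Passing a null Constant clears the entry from the side table again:
F->setPrefixData(0);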
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index e9baa5c..f0f8c7d 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -7,14 +7,16 @@
//
//===----------------------------------------------------------------------===//
//
-// GCOV implements the interface to read and write coverage files that use
+// GCOV implements the interface to read and write coverage files that use
// 'gcov' format.
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Debug.h"
#include "llvm/Support/GCOV.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/system_error.h"
using namespace llvm;
@@ -43,25 +45,45 @@ bool GCOVFile::read(GCOVBuffer &Buffer) {
if (Format == GCOV::InvalidGCOV)
return false;
- unsigned i = 0;
- while (1) {
- GCOVFunction *GFun = NULL;
- if (isGCDAFile(Format)) {
- // Use existing function while reading .gcda file.
- assert(i < Functions.size() && ".gcda data does not match .gcno data");
- GFun = Functions[i];
- } else if (isGCNOFile(Format)){
- GFun = new GCOVFunction();
+ if (isGCNOFile(Format)) {
+ while (true) {
+ if (!Buffer.readFunctionTag()) break;
+ GCOVFunction *GFun = new GCOVFunction();
+ if (!GFun->read(Buffer, Format))
+ return false;
Functions.push_back(GFun);
}
- if (!GFun || !GFun->read(Buffer, Format))
- break;
- ++i;
}
+ else if (isGCDAFile(Format)) {
+ for (size_t i = 0, e = Functions.size(); i < e; ++i) {
+ if (!Buffer.readFunctionTag()) {
+ errs() << "Unexpected number of functions.\n";
+ return false;
+ }
+ if (!Functions[i]->read(Buffer, Format))
+ return false;
+ }
+ if (Buffer.readObjectTag()) {
+ uint32_t Length;
+ uint32_t Dummy;
+ if (!Buffer.readInt(Length)) return false;
+ if (!Buffer.readInt(Dummy)) return false; // checksum
+ if (!Buffer.readInt(Dummy)) return false; // num
+ if (!Buffer.readInt(RunCount)) return false;
+ Buffer.advanceCursor(Length-3);
+ }
+ while (Buffer.readProgramTag()) {
+ uint32_t Length;
+ if (!Buffer.readInt(Length)) return false;
+ Buffer.advanceCursor(Length);
+ ++ProgramCount;
+ }
+ }
+
return true;
}
-/// dump - Dump GCOVFile content on standard out for debugging purposes.
+/// dump - Dump GCOVFile content to dbgs() for debugging purposes.
void GCOVFile::dump() {
for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(),
E = Functions.end(); I != E; ++I)
@@ -72,9 +94,10 @@ void GCOVFile::dump() {
/// reading .gcno and .gcda files.
void GCOVFile::collectLineCounts(FileInfo &FI) {
for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(),
- E = Functions.end(); I != E; ++I)
+ E = Functions.end(); I != E; ++I)
(*I)->collectLineCounts(FI);
- FI.print();
+ FI.setRunCount(RunCount);
+ FI.setProgramCount(ProgramCount);
}
//===----------------------------------------------------------------------===//
@@ -85,76 +108,121 @@ GCOVFunction::~GCOVFunction() {
DeleteContainerPointers(Blocks);
}
-/// read - Read a aunction from the buffer. Return false if buffer cursor
+/// read - Read a function from the buffer. Return false if buffer cursor
/// does not point to a function tag.
bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
- if (!Buff.readFunctionTag())
- return false;
-
- Buff.readInt(); // Function header length
- Ident = Buff.readInt();
- Buff.readInt(); // Checksum #1
+ uint32_t Dummy;
+ if (!Buff.readInt(Dummy)) return false; // Function header length
+ if (!Buff.readInt(Ident)) return false;
+ if (!Buff.readInt(Dummy)) return false; // Checksum #1
if (Format != GCOV::GCNO_402 && Format != GCOV::GCDA_402)
- Buff.readInt(); // Checksum #2
+ if (!Buff.readInt(Dummy)) return false; // Checksum #2
+
+ if (!Buff.readString(Name)) return false;
- Name = Buff.readString();
if (Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404)
- Filename = Buff.readString();
+ if (!Buff.readString(Filename)) return false;
if (Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404) {
- Buff.readArcTag();
- uint32_t Count = Buff.readInt() / 2;
- for (unsigned i = 0, e = Count; i != e; ++i) {
- Blocks[i]->addCount(Buff.readInt64());
+ if (!Buff.readArcTag()) {
+ errs() << "Arc tag not found.\n";
+ return false;
+ }
+ uint32_t Count;
+ if (!Buff.readInt(Count)) return false;
+ Count /= 2;
+
+ // This for loop adds the counts for each block. A second nested loop is
+ // required to combine the edge counts that are contained in the GCDA file.
+ for (uint32_t Line = 0; Count > 0; ++Line) {
+ if (Line >= Blocks.size()) {
+ errs() << "Unexpected number of edges.\n";
+ return false;
+ }
+ GCOVBlock &Block = *Blocks[Line];
+ for (size_t Edge = 0, End = Block.getNumEdges(); Edge < End; ++Edge) {
+ if (Count == 0) {
+ errs() << "Unexpected number of edges.\n";
+ return false;
+ }
+ uint64_t ArcCount;
+ if (!Buff.readInt64(ArcCount)) return false;
+ Block.addCount(ArcCount);
+ --Count;
+ }
}
return true;
}
- LineNumber = Buff.readInt();
+ if (!Buff.readInt(LineNumber)) return false;
// read blocks.
- bool BlockTagFound = Buff.readBlockTag();
- (void)BlockTagFound;
- assert(BlockTagFound && "Block Tag not found!");
- uint32_t BlockCount = Buff.readInt();
- for (int i = 0, e = BlockCount; i != e; ++i) {
- Buff.readInt(); // Block flags;
- Blocks.push_back(new GCOVBlock(i));
+ if (!Buff.readBlockTag()) {
+ errs() << "Block tag not found.\n";
+ return false;
+ }
+ uint32_t BlockCount;
+ if (!Buff.readInt(BlockCount)) return false;
+ for (uint32_t i = 0, e = BlockCount; i != e; ++i) {
+ if (!Buff.readInt(Dummy)) return false; // Block flags;
+ Blocks.push_back(new GCOVBlock(*this, i));
}
// read edges.
while (Buff.readEdgeTag()) {
- uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
- uint32_t BlockNo = Buff.readInt();
- assert(BlockNo < BlockCount && "Unexpected Block number!");
- for (int i = 0, e = EdgeCount; i != e; ++i) {
- Blocks[BlockNo]->addEdge(Buff.readInt());
- Buff.readInt(); // Edge flag
+ uint32_t EdgeCount;
+ if (!Buff.readInt(EdgeCount)) return false;
+ EdgeCount = (EdgeCount - 1) / 2;
+ uint32_t BlockNo;
+ if (!Buff.readInt(BlockNo)) return false;
+ if (BlockNo >= BlockCount) {
+ errs() << "Unexpected block number.\n";
+ return false;
+ }
+ for (uint32_t i = 0, e = EdgeCount; i != e; ++i) {
+ uint32_t Dst;
+ if (!Buff.readInt(Dst)) return false;
+ Blocks[BlockNo]->addEdge(Dst);
+ if (!Buff.readInt(Dummy)) return false; // Edge flag
}
}
// read line table.
while (Buff.readLineTag()) {
- uint32_t LineTableLength = Buff.readInt();
- uint32_t Size = Buff.getCursor() + LineTableLength*4;
- uint32_t BlockNo = Buff.readInt();
- assert(BlockNo < BlockCount && "Unexpected Block number!");
+ uint32_t LineTableLength;
+ if (!Buff.readInt(LineTableLength)) return false;
+ uint32_t EndPos = Buff.getCursor() + LineTableLength*4;
+ uint32_t BlockNo;
+ if (!Buff.readInt(BlockNo)) return false;
+ if (BlockNo >= BlockCount) {
+ errs() << "Unexpected block number.\n";
+ return false;
+ }
GCOVBlock *Block = Blocks[BlockNo];
- Buff.readInt(); // flag
- while (Buff.getCursor() != (Size - 4)) {
- StringRef Filename = Buff.readString();
- if (Buff.getCursor() == (Size - 4)) break;
- while (uint32_t L = Buff.readInt())
- Block->addLine(Filename, L);
+ if (!Buff.readInt(Dummy)) return false; // flag
+ while (Buff.getCursor() != (EndPos - 4)) {
+ StringRef F;
+ if (!Buff.readString(F)) return false;
+ if (F != Filename) {
+ errs() << "Multiple sources for a single basic block.\n";
+ return false;
+ }
+ if (Buff.getCursor() == (EndPos - 4)) break;
+ while (true) {
+ uint32_t Line;
+ if (!Buff.readInt(Line)) return false;
+ if (!Line) break;
+ Block->addLine(Line);
+ }
}
- Buff.readInt(); // flag
+ if (!Buff.readInt(Dummy)) return false; // flag
}
return true;
}
-/// dump - Dump GCOVFunction content on standard out for debugging purposes.
+/// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
void GCOVFunction::dump() {
- outs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
+ dbgs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
for (SmallVectorImpl<GCOVBlock *>::iterator I = Blocks.begin(),
E = Blocks.end(); I != E; ++I)
(*I)->dump();
@@ -174,110 +242,73 @@ void GCOVFunction::collectLineCounts(FileInfo &FI) {
/// ~GCOVBlock - Delete GCOVBlock and its content.
GCOVBlock::~GCOVBlock() {
Edges.clear();
- DeleteContainerSeconds(Lines);
-}
-
-void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) {
- GCOVLines *&LinesForFile = Lines[Filename];
- if (!LinesForFile)
- LinesForFile = new GCOVLines();
- LinesForFile->add(LineNo);
+ Lines.clear();
}
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
void GCOVBlock::collectLineCounts(FileInfo &FI) {
- for (StringMap<GCOVLines *>::iterator I = Lines.begin(),
+ for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
E = Lines.end(); I != E; ++I)
- I->second->collectLineCounts(FI, I->first(), Counter);
+ FI.addLineCount(Parent.getFilename(), *I, Counter);
}
-/// dump - Dump GCOVBlock content on standard out for debugging purposes.
+/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
void GCOVBlock::dump() {
- outs() << "Block : " << Number << " Counter : " << Counter << "\n";
+ dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
if (!Edges.empty()) {
- outs() << "\tEdges : ";
+ dbgs() << "\tEdges : ";
for (SmallVectorImpl<uint32_t>::iterator I = Edges.begin(), E = Edges.end();
I != E; ++I)
- outs() << (*I) << ",";
- outs() << "\n";
+ dbgs() << (*I) << ",";
+ dbgs() << "\n";
}
if (!Lines.empty()) {
- outs() << "\tLines : ";
- for (StringMap<GCOVLines *>::iterator LI = Lines.begin(),
- LE = Lines.end(); LI != LE; ++LI) {
- outs() << LI->first() << " -> ";
- LI->second->dump();
- outs() << "\n";
- }
+ dbgs() << "\tLines : ";
+ for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
+ E = Lines.end(); I != E; ++I)
+ dbgs() << (*I) << ",";
+ dbgs() << "\n";
}
}
//===----------------------------------------------------------------------===//
-// GCOVLines implementation.
-
-/// collectLineCounts - Collect line counts. This must be used after
-/// reading .gcno and .gcda files.
-void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename,
- uint32_t Count) {
- for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
- E = Lines.end(); I != E; ++I)
- FI.addLineCount(Filename, *I, Count);
-}
-
-/// dump - Dump GCOVLines content on standard out for debugging purposes.
-void GCOVLines::dump() {
- for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
- E = Lines.end(); I != E; ++I)
- outs() << (*I) << ",";
-}
-
-//===----------------------------------------------------------------------===//
// FileInfo implementation.
-/// addLineCount - Add line count for the given line number in a file.
-void FileInfo::addLineCount(StringRef Filename, uint32_t Line, uint32_t Count) {
- if (LineInfo.find(Filename) == LineInfo.end()) {
- OwningPtr<MemoryBuffer> Buff;
- if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
- errs() << Filename << ": " << ec.message() << "\n";
- return;
- }
- StringRef AllLines = Buff.take()->getBuffer();
- LineCounts L(AllLines.count('\n')+2);
- L[Line-1] = Count;
- LineInfo[Filename] = L;
- return;
- }
- LineCounts &L = LineInfo[Filename];
- L[Line-1] = Count;
-}
-
/// print - Print source files with collected line count information.
-void FileInfo::print() {
+void FileInfo::print(raw_fd_ostream &OS, StringRef gcnoFile,
+ StringRef gcdaFile) {
for (StringMap<LineCounts>::iterator I = LineInfo.begin(), E = LineInfo.end();
I != E; ++I) {
StringRef Filename = I->first();
- outs() << Filename << "\n";
+ OS << " -: 0:Source:" << Filename << "\n";
+ OS << " -: 0:Graph:" << gcnoFile << "\n";
+ OS << " -: 0:Data:" << gcdaFile << "\n";
+ OS << " -: 0:Runs:" << RunCount << "\n";
+ OS << " -: 0:Programs:" << ProgramCount << "\n";
LineCounts &L = LineInfo[Filename];
OwningPtr<MemoryBuffer> Buff;
if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
errs() << Filename << ": " << ec.message() << "\n";
return;
}
- StringRef AllLines = Buff.take()->getBuffer();
- for (unsigned i = 0, e = L.size(); i != e; ++i) {
- if (L[i])
- outs() << L[i] << ":\t";
- else
- outs() << " :\t";
+ StringRef AllLines = Buff->getBuffer();
+ uint32_t i = 0;
+ while (!AllLines.empty()) {
+ if (L.find(i) != L.end()) {
+ if (L[i] == 0)
+ OS << " #####:";
+ else
+ OS << format("%9" PRIu64 ":", L[i]);
+ } else {
+ OS << " -:";
+ }
std::pair<StringRef, StringRef> P = AllLines.split('\n');
if (AllLines != P.first)
- outs() << P.first;
- outs() << "\n";
+ OS << format("%5u:", i+1) << P.first;
+ OS << "\n";
AllLines = P.second;
+ ++i;
}
}
}
-
-
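The reworked FileInfo::print above emits gcov-style annotated source (the " -: 0:Source:" preamble followed by per-line counts). A minimal driver sketch, assuming the GCOVFile/GCOVBuffer/FileInfo interfaces exactly as they appear in this patch; the helper name and its parameters are illustrative, modeled on how llvm-cov wires these classes together:

// Hypothetical driver: read a .gcno/.gcda pair and write annotated source to OS.
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Support/GCOV.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static bool annotateCoverage(StringRef GCNO, StringRef GCDA,
                             raw_fd_ostream &OS) {
  OwningPtr<MemoryBuffer> NotesBuf, DataBuf;
  if (MemoryBuffer::getFileOrSTDIN(GCNO, NotesBuf) ||
      MemoryBuffer::getFileOrSTDIN(GCDA, DataBuf))
    return false;                        // could not open one of the files

  GCOVFile GF;
  GCOVBuffer Notes(NotesBuf.get()), Data(DataBuf.get());
  if (!GF.read(Notes) || !GF.read(Data)) // .gcno first, then .gcda
    return false;

  FileInfo FI;
  GF.collectLineCounts(FI);              // also forwards RunCount/ProgramCount
  FI.print(OS, GCNO, GCDA);              // emits the " -: 0:..." preamble + counts
  return true;
}

Reading the .gcno buffer first matters: the .gcda path shown at the top of this hunk walks the already-populated Functions vector, so the graph must be built before the counts are applied.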
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6d547f3..da3b02a 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -229,14 +229,14 @@ void GlobalAlias::setAliasee(Constant *Aliasee) {
setOperand(0, Aliasee);
}
-const GlobalValue *GlobalAlias::getAliasedGlobal() const {
- const Constant *C = getAliasee();
+GlobalValue *GlobalAlias::getAliasedGlobal() {
+ Constant *C = getAliasee();
if (C == 0) return 0;
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
return GV;
- const ConstantExpr *CE = cast<ConstantExpr>(C);
+ ConstantExpr *CE = cast<ConstantExpr>(C);
assert((CE->getOpcode() == Instruction::BitCast ||
CE->getOpcode() == Instruction::GetElementPtr) &&
"Unsupported aliasee");
@@ -244,18 +244,18 @@ const GlobalValue *GlobalAlias::getAliasedGlobal() const {
return cast<GlobalValue>(CE->getOperand(0));
}
-const GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) const {
- SmallPtrSet<const GlobalValue*, 3> Visited;
+GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) {
+ SmallPtrSet<GlobalValue*, 3> Visited;
// Check if we need to stop early.
if (stopOnWeak && mayBeOverridden())
return this;
- const GlobalValue *GV = getAliasedGlobal();
+ GlobalValue *GV = getAliasedGlobal();
Visited.insert(GV);
// Iterate over aliasing chain, stopping on weak alias if necessary.
- while (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
+ while (GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
if (stopOnWeak && GA->mayBeOverridden())
break;
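Dropping the const qualifiers lets callers mutate the global reached through an alias chain without a const_cast. A small sketch under that assumption; the helper and the visibility tweak are illustrative, not part of this patch:

#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
using namespace llvm;

// Follow the alias chain to the underlying global, stopping at weak aliases,
// and mutate the target in place (the old const API required a const_cast).
static void hideAliasTarget(GlobalAlias *GA) {
  if (GlobalValue *Target = GA->resolveAliasedGlobal(/*stopOnWeak=*/true))
    Target->setVisibility(GlobalValue::HiddenVisibility);
}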
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 2b5a0b3..a7773c4 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -223,18 +223,19 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case GetElementPtr: return "getelementptr";
// Convert instructions...
- case Trunc: return "trunc";
- case ZExt: return "zext";
- case SExt: return "sext";
- case FPTrunc: return "fptrunc";
- case FPExt: return "fpext";
- case FPToUI: return "fptoui";
- case FPToSI: return "fptosi";
- case UIToFP: return "uitofp";
- case SIToFP: return "sitofp";
- case IntToPtr: return "inttoptr";
- case PtrToInt: return "ptrtoint";
- case BitCast: return "bitcast";
+ case Trunc: return "trunc";
+ case ZExt: return "zext";
+ case SExt: return "sext";
+ case FPTrunc: return "fptrunc";
+ case FPExt: return "fpext";
+ case FPToUI: return "fptoui";
+ case FPToSI: return "fptosi";
+ case UIToFP: return "uitofp";
+ case SIToFP: return "sitofp";
+ case IntToPtr: return "inttoptr";
+ case PtrToInt: return "ptrtoint";
+ case BitCast: return "bitcast";
+ case AddrSpaceCast: return "addrspacecast";
// Other instructions...
case ICmp: return "icmp";
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 205cb43..8a6b77b 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -2095,7 +2095,9 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
case Instruction::SIToFP:
case Instruction::FPToUI:
case Instruction::FPToSI:
- return false; // These always modify bits
+ case Instruction::AddrSpaceCast:
+    // TODO: Target information may give a more accurate answer here.
+ return false;
case Instruction::BitCast:
return true; // BitCast never modifies bits.
case Instruction::PtrToInt:
@@ -2137,44 +2139,46 @@ unsigned CastInst::isEliminableCastPair(
// ZEXT < Integral Unsigned Integer Any
// SEXT < Integral Signed Integer Any
// FPTOUI n/a FloatPt n/a Integral Unsigned
- // FPTOSI n/a FloatPt n/a Integral Signed
- // UITOFP n/a Integral Unsigned FloatPt n/a
- // SITOFP n/a Integral Signed FloatPt n/a
- // FPTRUNC > FloatPt n/a FloatPt n/a
- // FPEXT < FloatPt n/a FloatPt n/a
+ // FPTOSI n/a FloatPt n/a Integral Signed
+ // UITOFP n/a Integral Unsigned FloatPt n/a
+ // SITOFP n/a Integral Signed FloatPt n/a
+ // FPTRUNC > FloatPt n/a FloatPt n/a
+ // FPEXT < FloatPt n/a FloatPt n/a
// PTRTOINT n/a Pointer n/a Integral Unsigned
// INTTOPTR n/a Integral Unsigned Pointer n/a
- // BITCAST = FirstClass n/a FirstClass n/a
+ // BITCAST = FirstClass n/a FirstClass n/a
+ // ADDRSPCST n/a Pointer n/a Pointer n/a
//
// NOTE: some transforms are safe, but we consider them to be non-profitable.
// For example, we could merge "fptoui double to i32" + "zext i32 to i64",
// into "fptoui double to i64", but this loses information about the range
- // of the produced value (we no longer know the top-part is all zeros).
+ // of the produced value (we no longer know the top-part is all zeros).
// Further this conversion is often much more expensive for typical hardware,
- // and causes issues when building libgcc. We disallow fptosi+sext for the
+ // and causes issues when building libgcc. We disallow fptosi+sext for the
// same reason.
- const unsigned numCastOps =
+ const unsigned numCastOps =
Instruction::CastOpsEnd - Instruction::CastOpsBegin;
static const uint8_t CastResults[numCastOps][numCastOps] = {
- // T F F U S F F P I B -+
- // R Z S P P I I T P 2 N T |
- // U E E 2 2 2 2 R E I T C +- secondOp
- // N X X U S F F N X N 2 V |
- // C T T I I P P C T T P T -+
- { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // Trunc -+
- { 8, 1, 9,99,99, 2, 0,99,99,99, 2, 3 }, // ZExt |
- { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3 }, // SExt |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToUI |
- { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToSI |
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // UIToFP +- firstOp
- { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // SIToFP |
- { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4 }, // FPTrunc |
- { 99,99,99, 2, 2,99,99,10, 2,99,99, 4 }, // FPExt |
- { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3 }, // PtrToInt |
- { 99,99,99,99,99,99,99,99,99,13,99,12 }, // IntToPtr |
- { 5, 5, 5, 6, 6, 5, 5, 6, 6,11, 5, 1 }, // BitCast -+
+ // T F F U S F F P I B A -+
+ // R Z S P P I I T P 2 N T S |
+ // U E E 2 2 2 2 R E I T C C +- secondOp
+ // N X X U S F F N X N 2 V V |
+ // C T T I I P P C T T P T T -+
+ { 1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc -+
+ { 8, 1, 9,99,99, 2, 0,99,99,99, 2, 3, 0}, // ZExt |
+ { 8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt |
+ { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI |
+ { 0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI |
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP +- firstOp
+ { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP |
+ { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4, 0}, // FPTrunc |
+ { 99,99,99, 2, 2,99,99,10, 2,99,99, 4, 0}, // FPExt |
+ { 1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt |
+ { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr |
+ { 5, 5, 5, 6, 6, 5, 5, 6, 6,16, 5, 1,14}, // BitCast |
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
};
-
+
// If either of the casts are a bitcast from scalar to vector, disallow the
// merging. However, bitcast of A->B->A are allowed.
bool isFirstBitcast = (firstOp == Instruction::BitCast);
@@ -2191,47 +2195,50 @@ unsigned CastInst::isEliminableCastPair(
[secondOp-Instruction::CastOpsBegin];
switch (ElimCase) {
case 0:
- // categorically disallowed
+ // Categorically disallowed.
return 0;
case 1:
- // allowed, use first cast's opcode
+ // Allowed, use first cast's opcode.
return firstOp;
case 2:
- // allowed, use second cast's opcode
+ // Allowed, use second cast's opcode.
return secondOp;
case 3:
- // no-op cast in second op implies firstOp as long as the DestTy
+ // No-op cast in second op implies firstOp as long as the DestTy
// is integer and we are not converting between a vector and a
// non vector type.
if (!SrcTy->isVectorTy() && DstTy->isIntegerTy())
return firstOp;
return 0;
case 4:
- // no-op cast in second op implies firstOp as long as the DestTy
+ // No-op cast in second op implies firstOp as long as the DestTy
// is floating point.
if (DstTy->isFloatingPointTy())
return firstOp;
return 0;
case 5:
- // no-op cast in first op implies secondOp as long as the SrcTy
+ // No-op cast in first op implies secondOp as long as the SrcTy
// is an integer.
if (SrcTy->isIntegerTy())
return secondOp;
return 0;
case 6:
- // no-op cast in first op implies secondOp as long as the SrcTy
+ // No-op cast in first op implies secondOp as long as the SrcTy
// is a floating point.
if (SrcTy->isFloatingPointTy())
return secondOp;
return 0;
case 7: {
+ // Cannot simplify if address spaces are different!
+ if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
+ return 0;
+
unsigned MidSize = MidTy->getScalarSizeInBits();
- // Check the address spaces first. If we know they are in the same address
- // space, the pointer sizes must be the same so we can still fold this
- // without knowing the actual sizes as long we know that the intermediate
- // pointer is the largest possible pointer size.
- if (MidSize == 64 &&
- SrcTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace())
+ // We can still fold this without knowing the actual sizes as long we
+ // know that the intermediate pointer is the largest possible
+ // pointer size.
+ // FIXME: Is this always true?
+ if (MidSize == 64)
return Instruction::BitCast;
// ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size.
@@ -2254,7 +2261,8 @@ unsigned CastInst::isEliminableCastPair(
return firstOp;
return secondOp;
}
- case 9: // zext, sext -> zext, because sext can't sign extend after zext
+ case 9:
+ // zext, sext -> zext, because sext can't sign extend after zext
return Instruction::ZExt;
case 10:
// fpext followed by ftrunc is allowed if the bit size returned to is
@@ -2263,46 +2271,6 @@ unsigned CastInst::isEliminableCastPair(
return Instruction::BitCast;
return 0; // If the types are not the same we can't eliminate it.
case 11: {
- // bitcast followed by ptrtoint is allowed as long as the bitcast is a
- // pointer to pointer cast, and the pointers are the same size.
- PointerType *SrcPtrTy = dyn_cast<PointerType>(SrcTy);
- PointerType *MidPtrTy = dyn_cast<PointerType>(MidTy);
- if (!SrcPtrTy || !MidPtrTy)
- return 0;
-
- // If the address spaces are the same, we know they are the same size
- // without size information
- if (SrcPtrTy->getAddressSpace() == MidPtrTy->getAddressSpace())
- return secondOp;
-
- if (!SrcIntPtrTy || !MidIntPtrTy)
- return 0;
-
- if (SrcIntPtrTy->getScalarSizeInBits() ==
- MidIntPtrTy->getScalarSizeInBits())
- return secondOp;
-
- return 0;
- }
- case 12: {
- // inttoptr, bitcast -> inttoptr if bitcast is a ptr to ptr cast
- // and the ptrs are to address spaces of the same size
- PointerType *MidPtrTy = dyn_cast<PointerType>(MidTy);
- PointerType *DstPtrTy = dyn_cast<PointerType>(DstTy);
- if (!MidPtrTy || !DstPtrTy)
- return 0;
-
- if (MidPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace())
- return firstOp;
-
- if (MidIntPtrTy &&
- DstIntPtrTy &&
- MidIntPtrTy->getScalarSizeInBits() ==
- DstIntPtrTy->getScalarSizeInBits())
- return firstOp;
- return 0;
- }
- case 13: {
// inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
if (!MidIntPtrTy)
return 0;
@@ -2313,8 +2281,65 @@ unsigned CastInst::isEliminableCastPair(
return Instruction::BitCast;
return 0;
}
+ case 12: {
+ // addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
+ // addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
+ if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
+ return Instruction::AddrSpaceCast;
+ return Instruction::BitCast;
+ }
+ case 13:
+ // FIXME: this state can be merged with (1), but the following assert
+    // is useful to check the correctness of the sequence due to semantic
+ // change of bitcast.
+ assert(
+ SrcTy->isPtrOrPtrVectorTy() &&
+ MidTy->isPtrOrPtrVectorTy() &&
+ DstTy->isPtrOrPtrVectorTy() &&
+ SrcTy->getPointerAddressSpace() != MidTy->getPointerAddressSpace() &&
+ MidTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+ "Illegal addrspacecast, bitcast sequence!");
+ // Allowed, use first cast's opcode
+ return firstOp;
+ case 14:
+ // FIXME: this state can be merged with (2), but the following assert
+    // is useful to check the correctness of the sequence due to semantic
+ // change of bitcast.
+ assert(
+ SrcTy->isPtrOrPtrVectorTy() &&
+ MidTy->isPtrOrPtrVectorTy() &&
+ DstTy->isPtrOrPtrVectorTy() &&
+ SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
+ MidTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
+ "Illegal bitcast, addrspacecast sequence!");
+ // Allowed, use second cast's opcode
+ return secondOp;
+ case 15:
+ // FIXME: this state can be merged with (1), but the following assert
+    // is useful to check the correctness of the sequence due to semantic
+ // change of bitcast.
+ assert(
+ SrcTy->isIntOrIntVectorTy() &&
+ MidTy->isPtrOrPtrVectorTy() &&
+ DstTy->isPtrOrPtrVectorTy() &&
+ MidTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+ "Illegal inttoptr, bitcast sequence!");
+ // Allowed, use first cast's opcode
+ return firstOp;
+ case 16:
+ // FIXME: this state can be merged with (2), but the following assert
+    // is useful to check the correctness of the sequence due to semantic
+ // change of bitcast.
+ assert(
+ SrcTy->isPtrOrPtrVectorTy() &&
+ MidTy->isPtrOrPtrVectorTy() &&
+ DstTy->isIntOrIntVectorTy() &&
+ SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
+ "Illegal bitcast, ptrtoint sequence!");
+ // Allowed, use second cast's opcode
+ return secondOp;
case 99:
- // cast combination can't happen (error in input). This is for all cases
+ // Cast combination can't happen (error in input). This is for all cases
// where the MidTy is not the same for the two cast instructions.
llvm_unreachable("Invalid Cast Combination");
default:
@@ -2327,19 +2352,20 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
assert(castIsValid(op, S, Ty) && "Invalid cast!");
// Construct and return the appropriate CastInst subclass
switch (op) {
- case Trunc: return new TruncInst (S, Ty, Name, InsertBefore);
- case ZExt: return new ZExtInst (S, Ty, Name, InsertBefore);
- case SExt: return new SExtInst (S, Ty, Name, InsertBefore);
- case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertBefore);
- case FPExt: return new FPExtInst (S, Ty, Name, InsertBefore);
- case UIToFP: return new UIToFPInst (S, Ty, Name, InsertBefore);
- case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore);
- case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore);
- case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore);
- case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
- case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
- case BitCast: return new BitCastInst (S, Ty, Name, InsertBefore);
- default: llvm_unreachable("Invalid opcode provided");
+ case Trunc: return new TruncInst (S, Ty, Name, InsertBefore);
+ case ZExt: return new ZExtInst (S, Ty, Name, InsertBefore);
+ case SExt: return new SExtInst (S, Ty, Name, InsertBefore);
+ case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertBefore);
+ case FPExt: return new FPExtInst (S, Ty, Name, InsertBefore);
+ case UIToFP: return new UIToFPInst (S, Ty, Name, InsertBefore);
+ case SIToFP: return new SIToFPInst (S, Ty, Name, InsertBefore);
+ case FPToUI: return new FPToUIInst (S, Ty, Name, InsertBefore);
+ case FPToSI: return new FPToSIInst (S, Ty, Name, InsertBefore);
+ case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
+ case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
+ case BitCast: return new BitCastInst (S, Ty, Name, InsertBefore);
+ case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertBefore);
+ default: llvm_unreachable("Invalid opcode provided");
}
}
@@ -2348,19 +2374,20 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
assert(castIsValid(op, S, Ty) && "Invalid cast!");
// Construct and return the appropriate CastInst subclass
switch (op) {
- case Trunc: return new TruncInst (S, Ty, Name, InsertAtEnd);
- case ZExt: return new ZExtInst (S, Ty, Name, InsertAtEnd);
- case SExt: return new SExtInst (S, Ty, Name, InsertAtEnd);
- case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertAtEnd);
- case FPExt: return new FPExtInst (S, Ty, Name, InsertAtEnd);
- case UIToFP: return new UIToFPInst (S, Ty, Name, InsertAtEnd);
- case SIToFP: return new SIToFPInst (S, Ty, Name, InsertAtEnd);
- case FPToUI: return new FPToUIInst (S, Ty, Name, InsertAtEnd);
- case FPToSI: return new FPToSIInst (S, Ty, Name, InsertAtEnd);
- case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertAtEnd);
- case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
- case BitCast: return new BitCastInst (S, Ty, Name, InsertAtEnd);
- default: llvm_unreachable("Invalid opcode provided");
+ case Trunc: return new TruncInst (S, Ty, Name, InsertAtEnd);
+ case ZExt: return new ZExtInst (S, Ty, Name, InsertAtEnd);
+ case SExt: return new SExtInst (S, Ty, Name, InsertAtEnd);
+ case FPTrunc: return new FPTruncInst (S, Ty, Name, InsertAtEnd);
+ case FPExt: return new FPExtInst (S, Ty, Name, InsertAtEnd);
+ case UIToFP: return new UIToFPInst (S, Ty, Name, InsertAtEnd);
+ case SIToFP: return new SIToFPInst (S, Ty, Name, InsertAtEnd);
+ case FPToUI: return new FPToUIInst (S, Ty, Name, InsertAtEnd);
+ case FPToSI: return new FPToSIInst (S, Ty, Name, InsertAtEnd);
+ case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertAtEnd);
+ case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
+ case BitCast: return new BitCastInst (S, Ty, Name, InsertAtEnd);
+ case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertAtEnd);
+ default: llvm_unreachable("Invalid opcode provided");
}
}
@@ -2425,6 +2452,11 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
if (Ty->isIntOrIntVectorTy())
return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd);
+
+ Type *STy = S->getType();
+ if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+ return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd);
+
return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
}
@@ -2442,6 +2474,11 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
if (Ty->isIntOrIntVectorTy())
return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+
+ Type *STy = S->getType();
+ if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+ return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertBefore);
+
return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
}
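Together with the getCastOpcode change below, these hunks make CreatePointerCast pick the new addrspacecast opcode whenever the source and destination address spaces differ, and a plain bitcast otherwise. A self-contained sketch of the resulting usage; the function built here is purely illustrative:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Build a tiny function whose body converts an addrspace(1) i8* argument to
// the generic address space. With this patch, CreatePointerCast emits an
// AddrSpaceCastInst here rather than a BitCastInst.
static Function *makeAddrSpaceCast(Module &M) {
  LLVMContext &Ctx = M.getContext();
  Type *I8PtrAS1 = Type::getInt8PtrTy(Ctx, /*AddressSpace=*/1);
  Type *I8PtrAS0 = Type::getInt8PtrTy(Ctx, /*AddressSpace=*/0);
  FunctionType *FTy = FunctionType::get(I8PtrAS0, I8PtrAS1, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, GlobalValue::ExternalLinkage, "to_generic", &M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  Value *Arg = &*F->arg_begin();
  // Emits: %q = addrspacecast i8 addrspace(1)* %arg to i8*
  CastInst *Q = CastInst::CreatePointerCast(Arg, I8PtrAS0, "q", BB);
  ReturnInst::Create(Ctx, Q, BB);
  return F;
}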
@@ -2687,7 +2724,8 @@ CastInst::getCastOpcode(
return BitCast;
} else if (DestTy->isPointerTy()) {
if (SrcTy->isPointerTy()) {
- // TODO: Address space pointer sizes may not match
+ if (DestTy->getPointerAddressSpace() != SrcTy->getPointerAddressSpace())
+ return AddrSpaceCast;
return BitCast; // ptr -> ptr
} else if (SrcTy->isIntegerTy()) {
return IntToPtr; // int -> ptr
@@ -2782,13 +2820,27 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
case Instruction::BitCast:
// BitCast implies a no-op cast of type only. No bits change.
// However, you can't cast pointers to anything but pointers.
- if (SrcTy->isPointerTy() != DstTy->isPointerTy())
+ if (SrcTy->isPtrOrPtrVectorTy() != DstTy->isPtrOrPtrVectorTy())
return false;
- // Now we know we're not dealing with a pointer/non-pointer mismatch. In all
- // these cases, the cast is okay if the source and destination bit widths
- // are identical.
- return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+ // For non pointer cases, the cast is okay if the source and destination bit
+ // widths are identical.
+ if (!SrcTy->isPtrOrPtrVectorTy())
+ return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+
+ // If both are pointers then the address spaces must match and vector of
+ // pointers must have the same number of elements.
+ return SrcTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+ SrcTy->isVectorTy() == DstTy->isVectorTy() &&
+ (!SrcTy->isVectorTy() ||
+       SrcTy->getVectorNumElements() == DstTy->getVectorNumElements());
+
+ case Instruction::AddrSpaceCast:
+ return SrcTy->isPtrOrPtrVectorTy() && DstTy->isPtrOrPtrVectorTy() &&
+ SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
+ SrcTy->isVectorTy() == DstTy->isVectorTy() &&
+ (!SrcTy->isVectorTy() ||
+          SrcTy->getVectorNumElements() == DstTy->getVectorNumElements());
}
}
@@ -2935,6 +2987,18 @@ BitCastInst::BitCastInst(
assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
}
+AddrSpaceCastInst::AddrSpaceCastInst(
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, AddrSpaceCast, S, Name, InsertBefore) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast");
+}
+
+AddrSpaceCastInst::AddrSpaceCastInst(
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, AddrSpaceCast, S, Name, InsertAtEnd) {
+ assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast");
+}
+
//===----------------------------------------------------------------------===//
// CmpInst Classes
//===----------------------------------------------------------------------===//
@@ -3267,7 +3331,6 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
OL[i] = InOL[i];
OL[i+1] = InOL[i+1];
}
- TheSubsets = SI.TheSubsets;
SubclassOptionalData = SI.SubclassOptionalData;
}
@@ -3279,16 +3342,6 @@ SwitchInst::~SwitchInst() {
/// addCase - Add an entry to the switch instruction...
///
void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
- IntegersSubsetToBB Mapping;
-
- // FIXME: Currently we work with ConstantInt based cases.
- // So inititalize IntItem container directly from ConstantInt.
- Mapping.add(IntItem::fromConstantInt(OnVal));
- IntegersSubset CaseRanges = Mapping.getCase();
- addCase(CaseRanges, Dest);
-}
-
-void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
unsigned NewCaseIdx = getNumCases();
unsigned OpNo = NumOperands;
if (OpNo+2 > ReservedSpace)
@@ -3296,17 +3349,14 @@ void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
// Initialize some new operands.
assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
NumOperands = OpNo+2;
-
- SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal);
-
- CaseIt Case(this, NewCaseIdx, TheSubsetsIt);
- Case.updateCaseValueOperand(OnVal);
+ CaseIt Case(this, NewCaseIdx);
+ Case.setValue(OnVal);
Case.setSuccessor(Dest);
}
/// removeCase - This method removes the specified case and its successor
/// from the switch instruction.
-void SwitchInst::removeCase(CaseIt& i) {
+void SwitchInst::removeCase(CaseIt i) {
unsigned idx = i.getCaseIndex();
assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
@@ -3323,16 +3373,6 @@ void SwitchInst::removeCase(CaseIt& i) {
// Nuke the last value.
OL[NumOps-2].set(0);
OL[NumOps-2+1].set(0);
-
- // Do the same with TheCases collection:
- if (i.SubsetIt != --TheSubsets.end()) {
- *i.SubsetIt = TheSubsets.back();
- TheSubsets.pop_back();
- } else {
- TheSubsets.pop_back();
- i.SubsetIt = TheSubsets.end();
- }
-
NumOperands = NumOps-2;
}
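With the IntegersSubset machinery gone, addCase takes a plain ConstantInt again and removeCase takes its CaseIt by value. A short usage sketch under those assumptions; the helper name and the i32 case value are made up:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Cond is expected to be an i32 value; one case (42) jumps to CaseBB,
// everything else falls through to Default.
static SwitchInst *buildSwitch(Value *Cond, BasicBlock *Default,
                               BasicBlock *CaseBB, BasicBlock *InsertAtEnd) {
  LLVMContext &Ctx = InsertAtEnd->getContext();
  SwitchInst *SI = SwitchInst::Create(Cond, Default, /*NumCases=*/1, InsertAtEnd);
  SI->addCase(ConstantInt::get(Type::getInt32Ty(Ctx), 42), CaseBB); // case i32 42
  return SI;
}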
@@ -3577,6 +3617,10 @@ BitCastInst *BitCastInst::clone_impl() const {
return new BitCastInst(getOperand(0), getType());
}
+AddrSpaceCastInst *AddrSpaceCastInst::clone_impl() const {
+ return new AddrSpaceCastInst(getOperand(0), getType());
+}
+
CallInst *CallInst::clone_impl() const {
return new(getNumOperands()) CallInst(*this);
}
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 0c659b8..407b985 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -355,6 +355,11 @@ public:
typedef DenseMap<const Function*, unsigned> IntrinsicIDCacheTy;
IntrinsicIDCacheTy IntrinsicIDCache;
+ /// \brief Mapping from a function to its prefix data, which is stored as the
+ /// operand of an unparented ReturnInst so that the prefix data has a Use.
+ typedef DenseMap<const Function *, ReturnInst *> PrefixDataMapTy;
+ PrefixDataMapTy PrefixDataMap;
+
int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
new file mode 100644
index 0000000..a431d82
--- /dev/null
+++ b/lib/IR/LegacyPassManager.cpp
@@ -0,0 +1,1920 @@
+//===- LegacyPassManager.cpp - LLVM Pass Infrastructure Implementation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the legacy LLVM Pass Manager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/PassNameParser.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+using namespace llvm::legacy;
+
+// See PassManagers.h for Pass Manager infrastructure overview.
+
+//===----------------------------------------------------------------------===//
+// Pass debugging information. Often it is useful to find out what pass is
+// running when a crash occurs in a utility. When this library is compiled with
+// debugging on, a command line option (--debug-pass) is enabled that causes the
+// pass name to be printed before it executes.
+//
+
+namespace {
+// Different debug levels that can be enabled...
+enum PassDebugLevel {
+ Disabled, Arguments, Structure, Executions, Details
+};
+}
+
+static cl::opt<enum PassDebugLevel>
+PassDebugging("debug-pass", cl::Hidden,
+ cl::desc("Print PassManager debugging information"),
+ cl::values(
+ clEnumVal(Disabled , "disable debug output"),
+ clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
+ clEnumVal(Structure , "print pass structure before run()"),
+ clEnumVal(Executions, "print pass name before it is executed"),
+ clEnumVal(Details , "print pass details when it is executed"),
+ clEnumValEnd));
+
+namespace {
+typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
+PassOptionList;
+}
+
+// Print IR out before/after specified passes.
+static PassOptionList
+PrintBefore("print-before",
+ llvm::cl::desc("Print IR before specified passes"),
+ cl::Hidden);
+
+static PassOptionList
+PrintAfter("print-after",
+ llvm::cl::desc("Print IR after specified passes"),
+ cl::Hidden);
+
+static cl::opt<bool>
+PrintBeforeAll("print-before-all",
+ llvm::cl::desc("Print IR before each pass"),
+ cl::init(false));
+static cl::opt<bool>
+PrintAfterAll("print-after-all",
+ llvm::cl::desc("Print IR after each pass"),
+ cl::init(false));
+
+/// This is a helper to determine whether to print IR before or
+/// after a pass.
+
+static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
+ PassOptionList &PassesToPrint) {
+ for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
+ const llvm::PassInfo *PassInf = PassesToPrint[i];
+ if (PassInf)
+ if (PassInf->getPassArgument() == PI->getPassArgument()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// before it.
+static bool ShouldPrintBeforePass(const PassInfo *PI) {
+ return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// after it.
+static bool ShouldPrintAfterPass(const PassInfo *PI) {
+ return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
+}
+
+/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
+/// or higher is specified.
+bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
+ return PassDebugging >= Executions;
+}
+
+
+
+
+void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
+ if (V == 0 && M == 0)
+ OS << "Releasing pass '";
+ else
+ OS << "Running pass '";
+
+ OS << P->getPassName() << "'";
+
+ if (M) {
+ OS << " on module '" << M->getModuleIdentifier() << "'.\n";
+ return;
+ }
+ if (V == 0) {
+ OS << '\n';
+ return;
+ }
+
+ OS << " on ";
+ if (isa<Function>(V))
+ OS << "function";
+ else if (isa<BasicBlock>(V))
+ OS << "basic block";
+ else
+ OS << "value";
+
+ OS << " '";
+ WriteAsOperand(OS, V, /*PrintTy=*/false, M);
+ OS << "'\n";
+}
+
+
+namespace {
+//===----------------------------------------------------------------------===//
+// BBPassManager
+//
+/// BBPassManager manages BasicBlockPasses. It batches all the
+/// passes together and sequences them to process one basic block before
+/// processing the next basic block.
+class BBPassManager : public PMDataManager, public FunctionPass {
+
+public:
+ static char ID;
+ explicit BBPassManager()
+ : PMDataManager(), FunctionPass(ID) {}
+
+ /// Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the function, and if so, return true.
+ bool runOnFunction(Function &F);
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ bool doInitialization(Module &M);
+ bool doInitialization(Function &F);
+ bool doFinalization(Module &M);
+ bool doFinalization(Function &F);
+
+ virtual PMDataManager *getAsPMDataManager() { return this; }
+ virtual Pass *getAsPass() { return this; }
+
+ virtual const char *getPassName() const {
+ return "BasicBlock Pass Manager";
+ }
+
+ // Print passes managed by this manager
+ void dumpPassStructure(unsigned Offset) {
+ llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ BP->dumpPassStructure(Offset + 1);
+ dumpLastUses(BP, Offset+1);
+ }
+ }
+
+ BasicBlockPass *getContainedPass(unsigned N) {
+ assert(N < PassVector.size() && "Pass number out of range!");
+ BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
+ return BP;
+ }
+
+ virtual PassManagerType getPassManagerType() const {
+ return PMT_BasicBlockPassManager;
+ }
+};
+
+char BBPassManager::ID = 0;
+} // End anonymous namespace
+
+namespace llvm {
+namespace legacy {
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl
+//
+/// FunctionPassManagerImpl manages FPPassManagers
+class FunctionPassManagerImpl : public Pass,
+ public PMDataManager,
+ public PMTopLevelManager {
+ virtual void anchor();
+private:
+ bool wasRun;
+public:
+ static char ID;
+ explicit FunctionPassManagerImpl() :
+ Pass(PT_PassManager, ID), PMDataManager(),
+ PMTopLevelManager(new FPPassManager()), wasRun(false) {}
+
+ /// add - Add a pass to the queue of passes to run. This passes ownership of
+ /// the Pass to the PassManager. When the PassManager is destroyed, the pass
+ /// will be destroyed as well, so there is no need to delete the pass. This
+ /// implies that all passes MUST be allocated with 'new'.
+ void add(Pass *P) {
+ schedulePass(P);
+ }
+
+ /// createPrinterPass - Get a function printer pass.
+ Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+ return createPrintFunctionPass(Banner, &O);
+ }
+
+  // Prepare for running an on-the-fly pass, freeing memory if needed
+ // from a previous run.
+ void releaseMemoryOnTheFly();
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool run(Function &F);
+
+ /// doInitialization - Run all of the initializers for the function passes.
+ ///
+ bool doInitialization(Module &M);
+
+ /// doFinalization - Run all of the finalizers for the function passes.
+ ///
+ bool doFinalization(Module &M);
+
+
+ virtual PMDataManager *getAsPMDataManager() { return this; }
+ virtual Pass *getAsPass() { return this; }
+ virtual PassManagerType getTopLevelPassManagerType() {
+ return PMT_FunctionPassManager;
+ }
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ FPPassManager *getContainedManager(unsigned N) {
+ assert(N < PassManagers.size() && "Pass number out of range!");
+ FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
+ return FP;
+ }
+};
+
+void FunctionPassManagerImpl::anchor() {}
+
+char FunctionPassManagerImpl::ID = 0;
+} // End of legacy namespace
+} // End of llvm namespace
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MPPassManager
+//
+/// MPPassManager manages ModulePasses and function pass managers.
+/// It batches all Module passes and function pass managers together and
+/// sequences them to process one module.
+class MPPassManager : public Pass, public PMDataManager {
+public:
+ static char ID;
+ explicit MPPassManager() :
+ Pass(PT_PassManager, ID), PMDataManager() { }
+
+ // Delete on the fly managers.
+ virtual ~MPPassManager() {
+ for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+ I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+ I != E; ++I) {
+ FunctionPassManagerImpl *FPP = I->second;
+ delete FPP;
+ }
+ }
+
+ /// createPrinterPass - Get a module printer pass.
+ Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+ return createPrintModulePass(&O, false, Banner);
+ }
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool runOnModule(Module &M);
+
+ using llvm::Pass::doInitialization;
+ using llvm::Pass::doFinalization;
+
+ /// doInitialization - Run all of the initializers for the module passes.
+ ///
+ bool doInitialization();
+
+ /// doFinalization - Run all of the finalizers for the module passes.
+ ///
+ bool doFinalization();
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ /// Add RequiredPass into list of lower level passes required by pass P.
+ /// RequiredPass is run on the fly by Pass Manager when P requests it
+ /// through getAnalysis interface.
+ virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
+
+ /// Return function pass corresponding to PassInfo PI, that is
+ /// required by module pass MP. Instantiate analysis pass, by using
+ /// its runOnFunction() for function F.
+ virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
+
+ virtual const char *getPassName() const {
+ return "Module Pass Manager";
+ }
+
+ virtual PMDataManager *getAsPMDataManager() { return this; }
+ virtual Pass *getAsPass() { return this; }
+
+ // Print passes managed by this manager
+ void dumpPassStructure(unsigned Offset) {
+ llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ ModulePass *MP = getContainedPass(Index);
+ MP->dumpPassStructure(Offset + 1);
+ std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
+ OnTheFlyManagers.find(MP);
+ if (I != OnTheFlyManagers.end())
+ I->second->dumpPassStructure(Offset + 2);
+ dumpLastUses(MP, Offset+1);
+ }
+ }
+
+ ModulePass *getContainedPass(unsigned N) {
+ assert(N < PassVector.size() && "Pass number out of range!");
+ return static_cast<ModulePass *>(PassVector[N]);
+ }
+
+ virtual PassManagerType getPassManagerType() const {
+ return PMT_ModulePassManager;
+ }
+
+ private:
+ /// Collection of on the fly FPPassManagers. These managers manage
+ /// function passes that are required by module passes.
+ std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+};
+
+char MPPassManager::ID = 0;
+} // End anonymous namespace
+
+namespace llvm {
+namespace legacy {
+//===----------------------------------------------------------------------===//
+// PassManagerImpl
+//
+
+/// PassManagerImpl manages MPPassManagers
+class PassManagerImpl : public Pass,
+ public PMDataManager,
+ public PMTopLevelManager {
+ virtual void anchor();
+
+public:
+ static char ID;
+ explicit PassManagerImpl() :
+ Pass(PT_PassManager, ID), PMDataManager(),
+ PMTopLevelManager(new MPPassManager()) {}
+
+ /// add - Add a pass to the queue of passes to run. This passes ownership of
+ /// the Pass to the PassManager. When the PassManager is destroyed, the pass
+ /// will be destroyed as well, so there is no need to delete the pass. This
+ /// implies that all passes MUST be allocated with 'new'.
+ void add(Pass *P) {
+ schedulePass(P);
+ }
+
+ /// createPrinterPass - Get a module printer pass.
+ Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+ return createPrintModulePass(&O, false, Banner);
+ }
+
+ /// run - Execute all of the passes scheduled for execution. Keep track of
+ /// whether any of the passes modifies the module, and if so, return true.
+ bool run(Module &M);
+
+ using llvm::Pass::doInitialization;
+ using llvm::Pass::doFinalization;
+
+ /// doInitialization - Run all of the initializers for the module passes.
+ ///
+ bool doInitialization();
+
+ /// doFinalization - Run all of the finalizers for the module passes.
+ ///
+ bool doFinalization();
+
+ /// Pass Manager itself does not invalidate any analysis info.
+ void getAnalysisUsage(AnalysisUsage &Info) const {
+ Info.setPreservesAll();
+ }
+
+ virtual PMDataManager *getAsPMDataManager() { return this; }
+ virtual Pass *getAsPass() { return this; }
+ virtual PassManagerType getTopLevelPassManagerType() {
+ return PMT_ModulePassManager;
+ }
+
+ MPPassManager *getContainedManager(unsigned N) {
+ assert(N < PassManagers.size() && "Pass number out of range!");
+ MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
+ return MP;
+ }
+};
+
+void PassManagerImpl::anchor() {}
+
+char PassManagerImpl::ID = 0;
+} // End of legacy namespace
+} // End of llvm namespace
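PassManagerImpl is the engine behind the public legacy::PassManager declared in llvm/IR/LegacyPassManager.h. A minimal usage sketch; the particular passes are only examples, and, as the add() comment above notes, the manager takes ownership, so passes must be heap-allocated with new:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

static bool runCleanup(Module &M) {
  legacy::PassManager PM;
  PM.add(createGlobalDCEPass());          // ownership transfers to the manager
  PM.add(createCFGSimplificationPass());
  return PM.run(M);                       // true if any pass modified the module
}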
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+/// TimingInfo Class - This class is used to calculate information about the
+/// amount of time each pass takes to execute. This only happens when
+/// -time-passes is enabled on the command line.
+///
+
+static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
+
+class TimingInfo {
+ DenseMap<Pass*, Timer*> TimingData;
+ TimerGroup TG;
+public:
+ // Use 'create' member to get this.
+ TimingInfo() : TG("... Pass execution timing report ...") {}
+
+  // TimingDtor - Print out the collected timing information on destruction.
+ ~TimingInfo() {
+ // Delete all of the timers, which accumulate their info into the
+ // TimerGroup.
+ for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
+ E = TimingData.end(); I != E; ++I)
+ delete I->second;
+ // TimerGroup is deleted next, printing the report.
+ }
+
+ // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
+ // to a non null value (if the -time-passes option is enabled) or it leaves it
+ // null. It may be called multiple times.
+ static void createTheTimeInfo();
+
+ /// getPassTimer - Return the timer for the specified pass if it exists.
+ Timer *getPassTimer(Pass *P) {
+ if (P->getAsPMDataManager())
+ return 0;
+
+ sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
+ Timer *&T = TimingData[P];
+ if (T == 0)
+ T = new Timer(P->getPassName(), TG);
+ return T;
+ }
+};
+
+} // End of anon namespace
+
+static TimingInfo *TheTimeInfo;
+
+//===----------------------------------------------------------------------===//
+// PMTopLevelManager implementation
+
+/// Initialize top level manager. Create first pass manager.
+PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
+ PMDM->setTopLevelManager(this);
+ addPassManager(PMDM);
+ activeStack.push(PMDM);
+}
+
+/// Set pass P as the last user of the given analysis passes.
+void
+PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
+ unsigned PDepth = 0;
+ if (P->getResolver())
+ PDepth = P->getResolver()->getPMDataManager().getDepth();
+
+ for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
+ E = AnalysisPasses.end(); I != E; ++I) {
+ Pass *AP = *I;
+ LastUser[AP] = P;
+
+ if (P == AP)
+ continue;
+
+ // Update the last users of passes that are required transitive by AP.
+ AnalysisUsage *AnUsage = findAnalysisUsage(AP);
+ const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+ SmallVector<Pass *, 12> LastUses;
+ SmallVector<Pass *, 12> LastPMUses;
+ for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+ E = IDs.end(); I != E; ++I) {
+ Pass *AnalysisPass = findAnalysisPass(*I);
+ assert(AnalysisPass && "Expected analysis pass to exist.");
+ AnalysisResolver *AR = AnalysisPass->getResolver();
+ assert(AR && "Expected analysis resolver to exist.");
+ unsigned APDepth = AR->getPMDataManager().getDepth();
+
+ if (PDepth == APDepth)
+ LastUses.push_back(AnalysisPass);
+ else if (PDepth > APDepth)
+ LastPMUses.push_back(AnalysisPass);
+ }
+
+ setLastUser(LastUses, P);
+
+ // If this pass has a corresponding pass manager, push higher level
+ // analysis to this pass manager.
+ if (P->getResolver())
+ setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
+
+
+ // If AP is the last user of other passes then make P last user of
+ // such passes.
+ for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
+ LUE = LastUser.end(); LUI != LUE; ++LUI) {
+ if (LUI->second == AP)
+ // DenseMap iterator is not invalidated here because
+ // this is just updating existing entries.
+ LastUser[LUI->first] = P;
+ }
+ }
+}
+
+/// Collect passes whose last user is P
+void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
+ Pass *P) {
+ DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
+ InversedLastUser.find(P);
+ if (DMI == InversedLastUser.end())
+ return;
+
+ SmallPtrSet<Pass *, 8> &LU = DMI->second;
+ for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
+ E = LU.end(); I != E; ++I) {
+ LastUses.push_back(*I);
+ }
+
+}
+
+AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
+ AnalysisUsage *AnUsage = NULL;
+ DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
+ if (DMI != AnUsageMap.end())
+ AnUsage = DMI->second;
+ else {
+ AnUsage = new AnalysisUsage();
+ P->getAnalysisUsage(*AnUsage);
+ AnUsageMap[P] = AnUsage;
+ }
+ return AnUsage;
+}
+
+/// Schedule pass P for execution. Make sure that passes required by
+/// P are run before P is run. Update analysis info maintained by
+/// the manager. Remove dead passes. This is a recursive function.
+void PMTopLevelManager::schedulePass(Pass *P) {
+
+  // TODO: Allocate a function manager for this pass, otherwise the required
+  // set may be inserted into the previous function manager.
+
+ // Give pass a chance to prepare the stage.
+ P->preparePassManager(activeStack);
+
+ // If P is an analysis pass and it is available then do not
+ // generate the analysis again. Stale analysis info should not be
+ // available at this point.
+ const PassInfo *PI =
+ PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+ if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
+ delete P;
+ return;
+ }
+
+ AnalysisUsage *AnUsage = findAnalysisUsage(P);
+
+ bool checkAnalysis = true;
+ while (checkAnalysis) {
+ checkAnalysis = false;
+
+ const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+ for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
+ E = RequiredSet.end(); I != E; ++I) {
+
+ Pass *AnalysisPass = findAnalysisPass(*I);
+ if (!AnalysisPass) {
+ const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+
+ if (PI == NULL) {
+ // Pass P is not in the global PassRegistry
+ dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n";
+ dbgs() << "Verify if there is a pass dependency cycle." << "\n";
+ dbgs() << "Required Passes:" << "\n";
+ for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
+ E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
+ Pass *AnalysisPass2 = findAnalysisPass(*I2);
+ if (AnalysisPass2) {
+ dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
+ } else {
+ dbgs() << "\t" << "Error: Required pass not found! Possible causes:" << "\n";
+ dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)" << "\n";
+ dbgs() << "\t\t" << "- Corruption of the global PassRegistry" << "\n";
+ }
+ }
+ }
+
+ assert(PI && "Expected required passes to be initialized");
+ AnalysisPass = PI->createPass();
+ if (P->getPotentialPassManagerType () ==
+ AnalysisPass->getPotentialPassManagerType())
+ // Schedule analysis pass that is managed by the same pass manager.
+ schedulePass(AnalysisPass);
+ else if (P->getPotentialPassManagerType () >
+ AnalysisPass->getPotentialPassManagerType()) {
+ // Schedule analysis pass that is managed by a new manager.
+ schedulePass(AnalysisPass);
+ // Recheck analysis passes to ensure that required analyses that
+ // are already checked are still available.
+ checkAnalysis = true;
+ } else
+        // Do not schedule this analysis. Lower level analysis
+ // passes are run on the fly.
+ delete AnalysisPass;
+ }
+ }
+ }
+
+ // Now all required passes are available.
+ if (ImmutablePass *IP = P->getAsImmutablePass()) {
+ // P is a immutable pass and it will be managed by this
+ // top level manager. Set up analysis resolver to connect them.
+ PMDataManager *DM = getAsPMDataManager();
+ AnalysisResolver *AR = new AnalysisResolver(*DM);
+ P->setResolver(AR);
+ DM->initializeAnalysisImpl(P);
+ addImmutablePass(IP);
+ DM->recordAvailableAnalysis(IP);
+ return;
+ }
+
+ if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
+ Pass *PP = P->createPrinterPass(
+ dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
+ PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+ }
+
+ // Add the requested pass to the best available pass manager.
+ P->assignPassManager(activeStack, getTopLevelPassManagerType());
+
+ if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
+ Pass *PP = P->createPrinterPass(
+ dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
+ PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+ }
+}
+
+/// Find the pass that implements Analysis AID. Search immutable
+/// passes and all pass managers. If desired pass is not found
+/// then return NULL.
+Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
+
+ // Check pass managers
+ for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ if (Pass *P = (*I)->findAnalysisPass(AID, false))
+ return P;
+
+ // Check other pass managers
+ for (SmallVectorImpl<PMDataManager *>::iterator
+ I = IndirectPassManagers.begin(),
+ E = IndirectPassManagers.end(); I != E; ++I)
+ if (Pass *P = (*I)->findAnalysisPass(AID, false))
+ return P;
+
+ // Check the immutable passes. Iterate in reverse order so that we find
+ // the most recently registered passes first.
+ for (SmallVectorImpl<ImmutablePass *>::reverse_iterator I =
+ ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
+ AnalysisID PI = (*I)->getPassID();
+ if (PI == AID)
+ return *I;
+
+    // If the pass is not found, check the interfaces the immutable pass implements.
+ const PassInfo *PassInf =
+ PassRegistry::getPassRegistry()->getPassInfo(PI);
+ assert(PassInf && "Expected all immutable passes to be initialized");
+ const std::vector<const PassInfo*> &ImmPI =
+ PassInf->getInterfacesImplemented();
+ for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
+ EE = ImmPI.end(); II != EE; ++II) {
+ if ((*II)->getTypeInfo() == AID)
+ return *I;
+ }
+ }
+
+ return 0;
+}
+
+// Print passes managed by this top level manager.
+void PMTopLevelManager::dumpPasses() const {
+
+ if (PassDebugging < Structure)
+ return;
+
+ // Print out the immutable passes
+ for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
+ ImmutablePasses[i]->dumpPassStructure(0);
+ }
+
+ // Every class that derives from PMDataManager also derives from Pass
+ // (sometimes indirectly), but there's no inheritance relationship
+ // between PMDataManager and Pass, so we have to getAsPass to get
+ // from a PMDataManager* to a Pass*.
+ for (SmallVectorImpl<PMDataManager *>::const_iterator I =
+ PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
+ (*I)->getAsPass()->dumpPassStructure(1);
+}
+
+void PMTopLevelManager::dumpArguments() const {
+
+ if (PassDebugging < Arguments)
+ return;
+
+ dbgs() << "Pass Arguments: ";
+ for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
+ ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+ if (const PassInfo *PI =
+ PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+ assert(PI && "Expected all immutable passes to be initialized");
+ if (!PI->isAnalysisGroup())
+ dbgs() << " -" << PI->getPassArgument();
+ }
+ for (SmallVectorImpl<PMDataManager *>::const_iterator I =
+ PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
+ (*I)->dumpPassArguments();
+ dbgs() << "\n";
+}
+
+void PMTopLevelManager::initializeAllAnalysisInfo() {
+ for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ (*I)->initializeAnalysisInfo();
+
+ // Initialize the indirect pass managers.
+ for (SmallVectorImpl<PMDataManager *>::iterator
+ I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+ I != E; ++I)
+ (*I)->initializeAnalysisInfo();
+
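+ // Rebuild the InversedLastUser map from LastUser so that collectLastUses()
+ // can find, for a given pass, all passes for which it is the last user.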
+ for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
+ DME = LastUser.end(); DMI != DME; ++DMI) {
+ DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
+ InversedLastUser.find(DMI->second);
+ if (InvDMI != InversedLastUser.end()) {
+ SmallPtrSet<Pass *, 8> &L = InvDMI->second;
+ L.insert(DMI->first);
+ } else {
+ SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
+ InversedLastUser[DMI->second] = L;
+ }
+ }
+}
+
+/// Destructor
+PMTopLevelManager::~PMTopLevelManager() {
+ for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+ E = PassManagers.end(); I != E; ++I)
+ delete *I;
+
+ for (SmallVectorImpl<ImmutablePass *>::iterator
+ I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+ delete *I;
+
+ for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
+ DME = AnUsageMap.end(); DMI != DME; ++DMI)
+ delete DMI->second;
+}
+
+//===----------------------------------------------------------------------===//
+// PMDataManager implementation
+
+/// Augment AvailableAnalysis by adding analysis made available by pass P.
+void PMDataManager::recordAvailableAnalysis(Pass *P) {
+ AnalysisID PI = P->getPassID();
+
+ AvailableAnalysis[PI] = P;
+
+ assert(!AvailableAnalysis.empty());
+
+ // This pass is the current implementation of all of the interfaces it
+ // implements as well.
+ const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
+ if (PInf == 0) return;
+ const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+ for (unsigned i = 0, e = II.size(); i != e; ++i)
+ AvailableAnalysis[II[i]->getTypeInfo()] = P;
+}
+
+// Return true if P preserves high level analysis used by other
+// passes managed by this manager
+bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ if (AnUsage->getPreservesAll())
+ return true;
+
+ const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+ for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
+ E = HigherLevelAnalysis.end(); I != E; ++I) {
+ Pass *P1 = *I;
+ if (P1->getAsImmutablePass() == 0 &&
+ std::find(PreservedSet.begin(), PreservedSet.end(),
+ P1->getPassID()) ==
+ PreservedSet.end())
+ return false;
+ }
+
+ return true;
+}
+
+/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
+void PMDataManager::verifyPreservedAnalysis(Pass *P) {
+ // Don't do this unless assertions are enabled.
+#ifdef NDEBUG
+ return;
+#endif
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+
+ // Verify preserved analysis
+ for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
+ E = PreservedSet.end(); I != E; ++I) {
+ AnalysisID AID = *I;
+ if (Pass *AP = findAnalysisPass(AID, true)) {
+ TimeRegion PassTimer(getPassTimer(AP));
+ AP->verifyAnalysis();
+ }
+ }
+}
+
+/// Remove analyses that are not preserved by pass P.
+void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ if (AnUsage->getPreservesAll())
+ return;
+
+ const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+ for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
+ E = AvailableAnalysis.end(); I != E; ) {
+ DenseMap<AnalysisID, Pass*>::iterator Info = I++;
+ if (Info->second->getAsImmutablePass() == 0 &&
+ std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+ PreservedSet.end()) {
+ // Remove this analysis
+ if (PassDebugging >= Details) {
+ Pass *S = Info->second;
+ dbgs() << " -- '" << P->getPassName() << "' is not preserving '";
+ dbgs() << S->getPassName() << "'\n";
+ }
+ AvailableAnalysis.erase(Info);
+ }
+ }
+
+ // Also check inherited analyses. If P does not preserve an analysis
+ // provided by a parent manager, remove it here as well.
+ for (unsigned Index = 0; Index < PMT_Last; ++Index) {
+
+ if (!InheritedAnalysis[Index])
+ continue;
+
+ for (DenseMap<AnalysisID, Pass*>::iterator
+ I = InheritedAnalysis[Index]->begin(),
+ E = InheritedAnalysis[Index]->end(); I != E; ) {
+ DenseMap<AnalysisID, Pass *>::iterator Info = I++;
+ if (Info->second->getAsImmutablePass() == 0 &&
+ std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+ PreservedSet.end()) {
+ // Remove this analysis
+ if (PassDebugging >= Details) {
+ Pass *S = Info->second;
+ dbgs() << " -- '" << P->getPassName() << "' is not preserving '";
+ dbgs() << S->getPassName() << "'\n";
+ }
+ InheritedAnalysis[Index]->erase(Info);
+ }
+ }
+ }
+}
+
+/// Remove analysis passes that are not used any longer
+void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
+ enum PassDebuggingString DBG_STR) {
+
+ SmallVector<Pass *, 12> DeadPasses;
+
+ // If this is an on-the-fly manager, then it does not have a TPM.
+ if (!TPM)
+ return;
+
+ TPM->collectLastUses(DeadPasses, P);
+
+ if (PassDebugging >= Details && !DeadPasses.empty()) {
+ dbgs() << " -*- '" << P->getPassName();
+ dbgs() << "' is the last user of following pass instances.";
+ dbgs() << " Free these instances\n";
+ }
+
+ for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
+ E = DeadPasses.end(); I != E; ++I)
+ freePass(*I, Msg, DBG_STR);
+}
+
+void PMDataManager::freePass(Pass *P, StringRef Msg,
+ enum PassDebuggingString DBG_STR) {
+ dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
+
+ {
+ // If the pass crashes releasing memory, remember this.
+ PassManagerPrettyStackEntry X(P);
+ TimeRegion PassTimer(getPassTimer(P));
+
+ P->releaseMemory();
+ }
+
+ AnalysisID PI = P->getPassID();
+ if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
+ // Remove the pass itself (if it is not already removed).
+ AvailableAnalysis.erase(PI);
+
+ // Remove all interfaces this pass implements, for which it is also
+ // listed as the available implementation.
+ const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+ for (unsigned i = 0, e = II.size(); i != e; ++i) {
+ DenseMap<AnalysisID, Pass*>::iterator Pos =
+ AvailableAnalysis.find(II[i]->getTypeInfo());
+ if (Pos != AvailableAnalysis.end() && Pos->second == P)
+ AvailableAnalysis.erase(Pos);
+ }
+ }
+}
+
+/// Add pass P into the PassVector. Update
+/// AvailableAnalysis appropriately if ProcessAnalysis is true.
+void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
+ // This manager is going to manage pass P. Set up analysis resolver
+ // to connect them.
+ AnalysisResolver *AR = new AnalysisResolver(*this);
+ P->setResolver(AR);
+
+ // If a FunctionPass F is the last user of ModulePass info M,
+ // then F's manager, not F, records itself as the last user of M.
+ SmallVector<Pass *, 12> TransferLastUses;
+
+ if (!ProcessAnalysis) {
+ // Add pass
+ PassVector.push_back(P);
+ return;
+ }
+
+ // At the moment, this pass is the last user of all required passes.
+ SmallVector<Pass *, 12> LastUses;
+ SmallVector<Pass *, 8> RequiredPasses;
+ SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
+
+ unsigned PDepth = this->getDepth();
+
+ collectRequiredAnalysis(RequiredPasses,
+ ReqAnalysisNotAvailable, P);
+ for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
+ E = RequiredPasses.end(); I != E; ++I) {
+ Pass *PRequired = *I;
+ unsigned RDepth = 0;
+
+ assert(PRequired->getResolver() && "Analysis Resolver is not set");
+ PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
+ RDepth = DM.getDepth();
+
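+ // Compare this manager's depth with the depth of the manager that owns the
+ // required analysis to decide where the last use should be recorded.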
+ if (PDepth == RDepth)
+ LastUses.push_back(PRequired);
+ else if (PDepth > RDepth) {
+ // Let the parent claim responsibility for the last use.
+ TransferLastUses.push_back(PRequired);
+ // Keep track of higher level analysis used by this manager.
+ HigherLevelAnalysis.push_back(PRequired);
+ } else
+ llvm_unreachable("Unable to accommodate Required Pass");
+ }
+
+ // Set P as P's last user until someone starts using P.
+ // However, if P is a Pass Manager then it does not need
+ // to record its last user.
+ if (P->getAsPMDataManager() == 0)
+ LastUses.push_back(P);
+ TPM->setLastUser(LastUses, P);
+
+ if (!TransferLastUses.empty()) {
+ Pass *My_PM = getAsPass();
+ TPM->setLastUser(TransferLastUses, My_PM);
+ TransferLastUses.clear();
+ }
+
+ // Now, take care of required analyses that are not available.
+ for (SmallVectorImpl<AnalysisID>::iterator
+ I = ReqAnalysisNotAvailable.begin(),
+ E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
+ const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+ Pass *AnalysisPass = PI->createPass();
+ this->addLowerLevelRequiredPass(P, AnalysisPass);
+ }
+
+ // Take a note of analysis required and made available by this pass.
+ // Remove the analysis not preserved by this pass
+ removeNotPreservedAnalysis(P);
+ recordAvailableAnalysis(P);
+
+ // Add pass
+ PassVector.push_back(P);
+}
+
+
+/// Populate RP with analysis passes that are required by
+/// pass P and are available. Populate RP_NotAvail with analysis
+/// passes that are required by pass P but are not available.
+void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
+ SmallVectorImpl<AnalysisID> &RP_NotAvail,
+ Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+ const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+ for (AnalysisUsage::VectorType::const_iterator
+ I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
+ if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+ RP.push_back(AnalysisPass);
+ else
+ RP_NotAvail.push_back(*I);
+ }
+
+ const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+ for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+ E = IDs.end(); I != E; ++I) {
+ if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+ RP.push_back(AnalysisPass);
+ else
+ RP_NotAvail.push_back(*I);
+ }
+}
+
+// All Required analyses should be available to the pass as it runs! Here
+// we fill in the AnalysisImpls member of the pass so that it can
+// successfully use the getAnalysis() method to retrieve the
+// implementations it needs.
+//
+void PMDataManager::initializeAnalysisImpl(Pass *P) {
+ AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+
+ for (AnalysisUsage::VectorType::const_iterator
+ I = AnUsage->getRequiredSet().begin(),
+ E = AnUsage->getRequiredSet().end(); I != E; ++I) {
+ Pass *Impl = findAnalysisPass(*I, true);
+ if (Impl == 0)
+ // This may be an analysis pass that is initialized on the fly.
+ // If that is not the case, it will raise an assert when it is used.
+ continue;
+ AnalysisResolver *AR = P->getResolver();
+ assert(AR && "Analysis Resolver is not set");
+ AR->addAnalysisImplsPair(*I, Impl);
+ }
+}
+
+/// Find the pass that implements Analysis AID. If the desired pass is not
+/// found, return NULL.
+Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
+
+ // Check whether the AvailableAnalysis map has an entry for AID.
+ DenseMap<AnalysisID, Pass*>::const_iterator I = AvailableAnalysis.find(AID);
+
+ if (I != AvailableAnalysis.end())
+ return I->second;
+
+ // Search parent managers through the top level manager.
+ if (SearchParent)
+ return TPM->findAnalysisPass(AID);
+
+ return NULL;
+}
+
+// Print list of passes that are last used by P.
+void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
+
+ SmallVector<Pass *, 12> LUses;
+
+ // If this is an on-the-fly manager, then it does not have a TPM.
+ if (!TPM)
+ return;
+
+ TPM->collectLastUses(LUses, P);
+
+ for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
+ E = LUses.end(); I != E; ++I) {
+ llvm::dbgs() << "--" << std::string(Offset*2, ' ');
+ (*I)->dumpPassStructure(0);
+ }
+}
+
+void PMDataManager::dumpPassArguments() const {
+ for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
+ E = PassVector.end(); I != E; ++I) {
+ if (PMDataManager *PMD = (*I)->getAsPMDataManager())
+ PMD->dumpPassArguments();
+ else
+ if (const PassInfo *PI =
+ PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+ if (!PI->isAnalysisGroup())
+ dbgs() << " -" << PI->getPassArgument();
+ }
+}
+
+void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
+ enum PassDebuggingString S2,
+ StringRef Msg) {
+ if (PassDebugging < Executions)
+ return;
+ dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
+ switch (S1) {
+ case EXECUTION_MSG:
+ dbgs() << "Executing Pass '" << P->getPassName();
+ break;
+ case MODIFICATION_MSG:
+ dbgs() << "Made Modification '" << P->getPassName();
+ break;
+ case FREEING_MSG:
+ dbgs() << " Freeing Pass '" << P->getPassName();
+ break;
+ default:
+ break;
+ }
+ switch (S2) {
+ case ON_BASICBLOCK_MSG:
+ dbgs() << "' on BasicBlock '" << Msg << "'...\n";
+ break;
+ case ON_FUNCTION_MSG:
+ dbgs() << "' on Function '" << Msg << "'...\n";
+ break;
+ case ON_MODULE_MSG:
+ dbgs() << "' on Module '" << Msg << "'...\n";
+ break;
+ case ON_REGION_MSG:
+ dbgs() << "' on Region '" << Msg << "'...\n";
+ break;
+ case ON_LOOP_MSG:
+ dbgs() << "' on Loop '" << Msg << "'...\n";
+ break;
+ case ON_CG_MSG:
+ dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
+ break;
+ default:
+ break;
+ }
+}
+
+void PMDataManager::dumpRequiredSet(const Pass *P) const {
+ if (PassDebugging < Details)
+ return;
+
+ AnalysisUsage analysisUsage;
+ P->getAnalysisUsage(analysisUsage);
+ dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
+}
+
+void PMDataManager::dumpPreservedSet(const Pass *P) const {
+ if (PassDebugging < Details)
+ return;
+
+ AnalysisUsage analysisUsage;
+ P->getAnalysisUsage(analysisUsage);
+ dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
+}
+
+void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
+ const AnalysisUsage::VectorType &Set) const {
+ assert(PassDebugging >= Details);
+ if (Set.empty())
+ return;
+ dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+ for (unsigned i = 0; i != Set.size(); ++i) {
+ if (i) dbgs() << ',';
+ const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+ if (!PInf) {
+ // Some preserved passes, such as AliasAnalysis, may not be initialized by
+ // all drivers.
+ dbgs() << " Uninitialized Pass";
+ continue;
+ }
+ dbgs() << ' ' << PInf->getPassName();
+ }
+ dbgs() << '\n';
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+/// This should be handled by specific pass manager.
+void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+ if (TPM) {
+ TPM->dumpArguments();
+ TPM->dumpPasses();
+ }
+
+ // A module-level pass may require function-level analysis info
+ // (e.g. dominator info). The pass manager uses an on-the-fly function pass
+ // manager to provide this on demand. In pass manager terminology, the
+ // module-level pass is then requiring lower-level analysis info managed by
+ // a lower-level pass manager.
+
+ // When the pass manager is not able to order the required analysis info,
+ // it checks whether any lower-level manager will be able to provide this
+ // analysis info on demand.
+#ifndef NDEBUG
+ dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
+ dbgs() << "' required by '" << P->getPassName() << "'\n";
+#endif
+ llvm_unreachable("Unable to schedule pass");
+}
+
+Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
+ llvm_unreachable("Unable to find on the fly pass");
+}
+
+// Destructor
+PMDataManager::~PMDataManager() {
+ for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
+ E = PassVector.end(); I != E; ++I)
+ delete *I;
+}
+
+//===----------------------------------------------------------------------===//
+// NOTE: Is this the right place to define this method?
+// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
+Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
+ return PM.findAnalysisPass(ID, dir);
+}
+
+Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
+ Function &F) {
+ return PM.getOnTheFlyPass(P, AnalysisPI, F);
+}
+
+//===----------------------------------------------------------------------===//
+// BBPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking the
+/// runOnBasicBlock method. Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool BBPassManager::runOnFunction(Function &F) {
+ if (F.isDeclaration())
+ return false;
+
+ bool Changed = doInitialization(F);
+
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ bool LocalChanged = false;
+
+ dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
+ dumpRequiredSet(BP);
+
+ initializeAnalysisImpl(BP);
+
+ {
+ // If the pass crashes, remember this.
+ PassManagerPrettyStackEntry X(BP, *I);
+ TimeRegion PassTimer(getPassTimer(BP));
+
+ LocalChanged |= BP->runOnBasicBlock(*I);
+ }
+
+ Changed |= LocalChanged;
+ if (LocalChanged)
+ dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
+ I->getName());
+ dumpPreservedSet(BP);
+
+ verifyPreservedAnalysis(BP);
+ removeNotPreservedAnalysis(BP);
+ recordAvailableAnalysis(BP);
+ removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
+ }
+
+ return doFinalization(F) || Changed;
+}
+
+// Implement doInitialization and doFinalization
+bool BBPassManager::doInitialization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool BBPassManager::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+ Changed |= getContainedPass(Index)->doFinalization(M);
+
+ return Changed;
+}
+
+bool BBPassManager::doInitialization(Function &F) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ Changed |= BP->doInitialization(F);
+ }
+
+ return Changed;
+}
+
+bool BBPassManager::doFinalization(Function &F) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ BasicBlockPass *BP = getContainedPass(Index);
+ Changed |= BP->doFinalization(F);
+ }
+
+ return Changed;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManager implementation
+
+/// Create new Function pass manager
+FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
+ FPM = new FunctionPassManagerImpl();
+ // FPM is the top level manager.
+ FPM->setTopLevelManager(FPM);
+
+ AnalysisResolver *AR = new AnalysisResolver(*FPM);
+ FPM->setResolver(AR);
+}
+
+FunctionPassManager::~FunctionPassManager() {
+ delete FPM;
+}
+
+/// add - Add a pass to the queue of passes to run. This passes
+/// ownership of the Pass to the PassManager. When the
+/// PassManager_X is destroyed, the pass will be destroyed as well, so
+/// there is no need to delete the pass. (TODO delete passes.)
+/// This implies that all passes MUST be allocated with 'new'.
+void FunctionPassManager::add(Pass *P) {
+ FPM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution. Keep
+/// track of whether any of the passes modifies the function, and if
+/// so, return true.
+///
+bool FunctionPassManager::run(Function &F) {
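+ // Make sure the function body has been read in (materialized) before
+ // running any passes over it.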
+ if (F.isMaterializable()) {
+ std::string errstr;
+ if (F.Materialize(&errstr))
+ report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+ }
+ return FPM->run(F);
+}
+
+
+/// doInitialization - Run all of the initializers for the function passes.
+///
+bool FunctionPassManager::doInitialization() {
+ return FPM->doInitialization(*M);
+}
+
+/// doFinalization - Run all of the finalizers for the function passes.
+///
+bool FunctionPassManager::doFinalization() {
+ return FPM->doFinalization(*M);
+}
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl implementation
+//
+bool FunctionPassManagerImpl::doInitialization(Module &M) {
+ bool Changed = false;
+
+ dumpArguments();
+ dumpPasses();
+
+ SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+ for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+ E = IPV.end(); I != E; ++I) {
+ Changed |= (*I)->doInitialization(M);
+ }
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool FunctionPassManagerImpl::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
+ Changed |= getContainedManager(Index)->doFinalization(M);
+
+ SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+ for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+ E = IPV.end(); I != E; ++I) {
+ Changed |= (*I)->doFinalization(M);
+ }
+
+ return Changed;
+}
+
+/// cleanup - After running all passes, clean up pass manager cache.
+void FPPassManager::cleanup() {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ FunctionPass *FP = getContainedPass(Index);
+ AnalysisResolver *AR = FP->getResolver();
+ assert(AR && "Analysis Resolver is not set");
+ AR->clearAnalysisImpls();
+ }
+}
+
+void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
+ if (!wasRun)
+ return;
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+ FPPassManager *FPPM = getContainedManager(Index);
+ for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
+ FPPM->getContainedPass(Index)->releaseMemory();
+ }
+ }
+ wasRun = false;
+}
+
+// Execute all the passes managed by this top level manager.
+// Return true if the function is modified by any pass.
+bool FunctionPassManagerImpl::run(Function &F) {
+ bool Changed = false;
+ TimingInfo::createTheTimeInfo();
+
+ initializeAllAnalysisInfo();
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->runOnFunction(F);
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ getContainedManager(Index)->cleanup();
+
+ wasRun = true;
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// FPPassManager implementation
+
+char FPPassManager::ID = 0;
+/// Print passes managed by this manager
+void FPPassManager::dumpPassStructure(unsigned Offset) {
+ dbgs().indent(Offset*2) << "FunctionPass Manager\n";
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ FunctionPass *FP = getContainedPass(Index);
+ FP->dumpPassStructure(Offset + 1);
+ dumpLastUses(FP, Offset+1);
+ }
+}
+
+
+/// Execute all of the passes scheduled for execution by invoking the
+/// runOnFunction method. Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool FPPassManager::runOnFunction(Function &F) {
+ if (F.isDeclaration())
+ return false;
+
+ bool Changed = false;
+
+ // Collect inherited analysis from Module level pass manager.
+ populateInheritedAnalysis(TPM->activeStack);
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ FunctionPass *FP = getContainedPass(Index);
+ bool LocalChanged = false;
+
+ dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
+ dumpRequiredSet(FP);
+
+ initializeAnalysisImpl(FP);
+
+ {
+ PassManagerPrettyStackEntry X(FP, F);
+ TimeRegion PassTimer(getPassTimer(FP));
+
+ LocalChanged |= FP->runOnFunction(F);
+ }
+
+ Changed |= LocalChanged;
+ if (LocalChanged)
+ dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
+ dumpPreservedSet(FP);
+
+ verifyPreservedAnalysis(FP);
+ removeNotPreservedAnalysis(FP);
+ recordAvailableAnalysis(FP);
+ removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
+ }
+ return Changed;
+}
+
+bool FPPassManager::runOnModule(Module &M) {
+ bool Changed = false;
+
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ Changed |= runOnFunction(*I);
+
+ return Changed;
+}
+
+bool FPPassManager::doInitialization(Module &M) {
+ bool Changed = false;
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool FPPassManager::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+ Changed |= getContainedPass(Index)->doFinalization(M);
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MPPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking the
+/// runOnModule method. Keep track of whether any of the passes modifies
+/// the module, and if so, return true.
+bool
+MPPassManager::runOnModule(Module &M) {
+ bool Changed = false;
+
+ // Initialize on-the-fly passes
+ for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+ I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+ I != E; ++I) {
+ FunctionPassManagerImpl *FPP = I->second;
+ Changed |= FPP->doInitialization(M);
+ }
+
+ // Initialize module passes
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+ Changed |= getContainedPass(Index)->doInitialization(M);
+
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ ModulePass *MP = getContainedPass(Index);
+ bool LocalChanged = false;
+
+ dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
+ dumpRequiredSet(MP);
+
+ initializeAnalysisImpl(MP);
+
+ {
+ PassManagerPrettyStackEntry X(MP, M);
+ TimeRegion PassTimer(getPassTimer(MP));
+
+ LocalChanged |= MP->runOnModule(M);
+ }
+
+ Changed |= LocalChanged;
+ if (LocalChanged)
+ dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
+ M.getModuleIdentifier());
+ dumpPreservedSet(MP);
+
+ verifyPreservedAnalysis(MP);
+ removeNotPreservedAnalysis(MP);
+ recordAvailableAnalysis(MP);
+ removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
+ }
+
+ // Finalize module passes
+ for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+ Changed |= getContainedPass(Index)->doFinalization(M);
+
+ // Finalize on-the-fly passes
+ for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+ I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+ I != E; ++I) {
+ FunctionPassManagerImpl *FPP = I->second;
+ // We don't know when an on-the-fly pass will be run for the last time,
+ // so we need to releaseMemory / finalize here.
+ FPP->releaseMemoryOnTheFly();
+ Changed |= FPP->doFinalization(M);
+ }
+
+ return Changed;
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+ assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
+ "Unable to handle Pass that requires lower level Analysis pass");
+ assert((P->getPotentialPassManagerType() <
+ RequiredPass->getPotentialPassManagerType()) &&
+ "Unable to handle Pass that requires lower level Analysis pass");
+
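+ // Look up, or lazily create, the on-the-fly function pass manager that will
+ // run RequiredPass on behalf of module pass P.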
+ FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
+ if (!FPP) {
+ FPP = new FunctionPassManagerImpl();
+ // FPP is the top level manager.
+ FPP->setTopLevelManager(FPP);
+
+ OnTheFlyManagers[P] = FPP;
+ }
+ FPP->add(RequiredPass);
+
+ // Register P as the last user of RequiredPass.
+ if (RequiredPass) {
+ SmallVector<Pass *, 1> LU;
+ LU.push_back(RequiredPass);
+ FPP->setLastUser(LU, P);
+ }
+}
+
+/// Return the function pass corresponding to PassInfo PI that is
+/// required by module pass MP. Instantiate the analysis pass by running
+/// it, via runOnFunction(), on function F.
+Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
+ FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
+ assert(FPP && "Unable to find on the fly pass");
+
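+ // Re-run the on-the-fly manager on F so the requested analysis is up to
+ // date before handing it back to the module pass.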
+ FPP->releaseMemoryOnTheFly();
+ FPP->run(F);
+ return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
+}
+
+
+//===----------------------------------------------------------------------===//
+// PassManagerImpl implementation
+
+//
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManagerImpl::run(Module &M) {
+ bool Changed = false;
+ TimingInfo::createTheTimeInfo();
+
+ dumpArguments();
+ dumpPasses();
+
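+ // Run the immutable passes' doInitialization before any contained manager.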
+ SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+ for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+ E = IPV.end(); I != E; ++I) {
+ Changed |= (*I)->doInitialization(M);
+ }
+
+ initializeAllAnalysisInfo();
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->runOnModule(M);
+
+ for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+ E = IPV.end(); I != E; ++I) {
+ Changed |= (*I)->doFinalization(M);
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// PassManager implementation
+
+/// Create new pass manager
+PassManager::PassManager() {
+ PM = new PassManagerImpl();
+ // PM is the top level manager
+ PM->setTopLevelManager(PM);
+}
+
+PassManager::~PassManager() {
+ delete PM;
+}
+
+/// add - Add a pass to the queue of passes to run. This passes ownership of
+/// the Pass to the PassManager. When the PassManager is destroyed, the pass
+/// will be destroyed as well, so there is no need to delete the pass. This
+/// implies that all passes MUST be allocated with 'new'.
+void PassManager::add(Pass *P) {
+ PM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManager::run(Module &M) {
+ return PM->run(M);
+}
+
+//===----------------------------------------------------------------------===//
+// TimingInfo implementation
+
+bool llvm::TimePassesIsEnabled = false;
+static cl::opt<bool,true>
+EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
+ cl::desc("Time each pass, printing elapsed time for each on exit"));
+
+// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
+// a non null value (if the -time-passes option is enabled) or it leaves it
+// null. It may be called multiple times.
+void TimingInfo::createTheTimeInfo() {
+ if (!TimePassesIsEnabled || TheTimeInfo) return;
+
+ // Constructed the first time this is called, iff -time-passes is enabled.
+ // This guarantees that the object will be constructed before static globals,
+ // thus it will be destroyed before them.
+ static ManagedStatic<TimingInfo> TTI;
+ TheTimeInfo = &*TTI;
+}
+
+/// If timing of passes is enabled, return the timer for pass P, else null.
+Timer *llvm::getPassTimer(Pass *P) {
+ if (TheTimeInfo)
+ return TheTimeInfo->getPassTimer(P);
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// PMStack implementation
+//
+
+// Pop Pass Manager from the stack and clear its analysis info.
+void PMStack::pop() {
+
+ PMDataManager *Top = this->top();
+ Top->initializeAnalysisInfo();
+
+ S.pop_back();
+}
+
+// Push PM on the stack and set its top level manager.
+void PMStack::push(PMDataManager *PM) {
+ assert(PM && "Unable to push. Pass Manager expected");
+ assert(PM->getDepth()==0 && "Pass Manager depth set too early");
+
+ if (!this->empty()) {
+ assert(PM->getPassManagerType() > this->top()->getPassManagerType()
+ && "pushing bad pass manager to PMStack");
+ PMTopLevelManager *TPM = this->top()->getTopLevelManager();
+
+ assert(TPM && "Unable to find top level manager");
+ TPM->addIndirectPassManager(PM);
+ PM->setTopLevelManager(TPM);
+ PM->setDepth(this->top()->getDepth()+1);
+ } else {
+ assert((PM->getPassManagerType() == PMT_ModulePassManager
+ || PM->getPassManagerType() == PMT_FunctionPassManager)
+ && "pushing bad pass manager to PMStack");
+ PM->setDepth(1);
+ }
+
+ S.push_back(PM);
+}
+
+// Dump content of the pass manager stack.
+void PMStack::dump() const {
+ for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
+ E = S.end(); I != E; ++I)
+ dbgs() << (*I)->getAsPass()->getPassName() << ' ';
+
+ if (!S.empty())
+ dbgs() << '\n';
+}
+
+/// Find appropriate Module Pass Manager in the PM Stack and
+/// add self into that manager.
+void ModulePass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+ // Find Module Pass Manager
+ while (!PMS.empty()) {
+ PassManagerType TopPMType = PMS.top()->getPassManagerType();
+ if (TopPMType == PreferredType)
+ break; // We found desired pass manager
+ else if (TopPMType > PMT_ModulePassManager)
+ PMS.pop(); // Pop children pass managers
+ else
+ break;
+ }
+ assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
+ PMS.top()->add(this);
+}
+
+/// Find appropriate Function Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void FunctionPass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+
+ // Find Function Pass Manager
+ while (!PMS.empty()) {
+ if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
+ PMS.pop();
+ else
+ break;
+ }
+
+ // Create new Function Pass Manager if needed.
+ FPPassManager *FPP;
+ if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
+ FPP = (FPPassManager *)PMS.top();
+ } else {
+ assert(!PMS.empty() && "Unable to create Function Pass Manager");
+ PMDataManager *PMD = PMS.top();
+
+ // [1] Create new Function Pass Manager
+ FPP = new FPPassManager();
+ FPP->populateInheritedAnalysis(PMS);
+
+ // [2] Set up new manager's top level manager
+ PMTopLevelManager *TPM = PMD->getTopLevelManager();
+ TPM->addIndirectPassManager(FPP);
+
+ // [3] Assign manager to manage this new manager. This may create
+ // and push new managers into PMS
+ FPP->assignPassManager(PMS, PMD->getPassManagerType());
+
+ // [4] Push new manager into PMS
+ PMS.push(FPP);
+ }
+
+ // Assign FPP as the manager of this pass.
+ FPP->add(this);
+}
+
+/// Find the appropriate BasicBlock Pass Manager in the PM Stack and
+/// add self into that manager.
+void BasicBlockPass::assignPassManager(PMStack &PMS,
+ PassManagerType PreferredType) {
+ BBPassManager *BBP;
+
+ // The BasicBlock pass manager is a leaf pass manager. It does not handle
+ // any other pass managers.
+ if (!PMS.empty() &&
+ PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
+ BBP = (BBPassManager *)PMS.top();
+ } else {
+ // If the leaf manager is not a BasicBlock pass manager, then create a
+ // new BasicBlock pass manager.
+ assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
+ PMDataManager *PMD = PMS.top();
+
+ // [1] Create new Basic Block Manager
+ BBP = new BBPassManager();
+
+ // [2] Set up new manager's top level manager
+ // Basic Block Pass Manager does not live by itself
+ PMTopLevelManager *TPM = PMD->getTopLevelManager();
+ TPM->addIndirectPassManager(BBP);
+
+ // [3] Assign manager to manage this new manager. This may create
+ // and push new managers into PMS
+ BBP->assignPassManager(PMS, PreferredType);
+
+ // [4] Push new manager into PMS
+ PMS.push(BBP);
+ }
+
+ // Assign BBP as the manager of this pass.
+ BBP->add(this);
+}
+
+PassManagerBase::~PassManagerBase() {}
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index bd4d9c0..a32d25c 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -65,7 +65,7 @@ class MDNodeOperand : public CallbackVH {
public:
MDNodeOperand(Value *V) : CallbackVH(V) {}
- ~MDNodeOperand() {}
+ virtual ~MDNodeOperand();
void set(Value *V) {
unsigned IsFirst = this->getValPtrInt();
@@ -82,6 +82,8 @@ public:
};
} // end namespace llvm.
+// Provide out-of-line definition to prevent weak vtable.
+MDNodeOperand::~MDNodeOperand() {}
void MDNodeOperand::deleted() {
getParent()->replaceOperand(this, 0);
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 968b8f4..4f240c7 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -245,7 +245,7 @@ GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
/// 1. If it does not exist, add a declaration of the global and return it.
/// 2. Else, the global exists but has the wrong type: return the function
/// with a constantexpr cast to the right type.
-/// 3. Finally, if the existing global is the correct delclaration, return the
+/// 3. Finally, if the existing global is the correct declaration, return the
/// existing global.
Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
// See if we have a definition for the specified global already.
@@ -260,8 +260,10 @@ Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
// If the variable exists but has the wrong type, return a bitcast to the
// right type.
- if (GV->getType() != PointerType::getUnqual(Ty))
- return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty));
+ Type *GVTy = GV->getType();
+ PointerType *PTy = PointerType::get(Ty, GVTy->getPointerAddressSpace());
+ if (GVTy != PTy)
+ return ConstantExpr::getBitCast(GV, PTy);
// Otherwise, we just found the existing function or a prototype.
return GV;
@@ -316,11 +318,16 @@ getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
for (unsigned i = 0, e = ModFlags->getNumOperands(); i != e; ++i) {
MDNode *Flag = ModFlags->getOperand(i);
- ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
- MDString *Key = cast<MDString>(Flag->getOperand(1));
- Value *Val = Flag->getOperand(2);
- Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
- Key, Val));
+ if (Flag->getNumOperands() >= 3 && isa<ConstantInt>(Flag->getOperand(0)) &&
+ isa<MDString>(Flag->getOperand(1))) {
+ // Check the operands of the MDNode before accessing the operands.
+ // The verifier will actually catch these failures.
+ ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
+ MDString *Key = cast<MDString>(Flag->getOperand(1));
+ Value *Val = Flag->getOperand(2);
+ Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
+ Key, Val));
+ }
}
}
@@ -399,9 +406,15 @@ bool Module::isDematerializable(const GlobalValue *GV) const {
}
bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) {
- if (Materializer)
- return Materializer->Materialize(GV, ErrInfo);
- return false;
+ if (!Materializer)
+ return false;
+
+ error_code EC = Materializer->Materialize(GV);
+ if (!EC)
+ return false;
+ if (ErrInfo)
+ *ErrInfo = EC.message();
+ return true;
}
void Module::Dematerialize(GlobalValue *GV) {
@@ -412,7 +425,12 @@ void Module::Dematerialize(GlobalValue *GV) {
bool Module::MaterializeAll(std::string *ErrInfo) {
if (!Materializer)
return false;
- return Materializer->MaterializeModule(this, ErrInfo);
+ error_code EC = Materializer->MaterializeModule(this);
+ if (!EC)
+ return false;
+ if (ErrInfo)
+ *ErrInfo = EC.message();
+ return true;
}
bool Module::MaterializeAllPermanently(std::string *ErrInfo) {
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index ee53c85..966af7d 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -1,4 +1,4 @@
-//===- PassManager.cpp - LLVM Pass Infrastructure Implementation ----------===//
+//===- PassManager.cpp - Infrastructure for managing & running IR passes --===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,1907 +6,152 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file implements the LLVM Pass Manager infrastructure.
-//
-//===----------------------------------------------------------------------===//
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/STLExtras.h"
-#include "llvm/PassManagers.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/PassNameParser.h"
-#include "llvm/Support/Timer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <map>
using namespace llvm;
-// See PassManagers.h for Pass Manager infrastructure overview.
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// Pass debugging information. Often it is useful to find out what pass is
-// running when a crash occurs in a utility. When this library is compiled with
-// debugging on, a command line option (--debug-pass) is enabled that causes the
-// pass name to be printed before it executes.
-//
-
-// Different debug levels that can be enabled...
-enum PassDebugLevel {
- Disabled, Arguments, Structure, Executions, Details
-};
-
-static cl::opt<enum PassDebugLevel>
-PassDebugging("debug-pass", cl::Hidden,
- cl::desc("Print PassManager debugging information"),
- cl::values(
- clEnumVal(Disabled , "disable debug output"),
- clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
- clEnumVal(Structure , "print pass structure before run()"),
- clEnumVal(Executions, "print pass name before it is executed"),
- clEnumVal(Details , "print pass details when it is executed"),
- clEnumValEnd));
-
-typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
-PassOptionList;
-
-// Print IR out before/after specified passes.
-static PassOptionList
-PrintBefore("print-before",
- llvm::cl::desc("Print IR before specified passes"),
- cl::Hidden);
-
-static PassOptionList
-PrintAfter("print-after",
- llvm::cl::desc("Print IR after specified passes"),
- cl::Hidden);
-
-static cl::opt<bool>
-PrintBeforeAll("print-before-all",
- llvm::cl::desc("Print IR before each pass"),
- cl::init(false));
-static cl::opt<bool>
-PrintAfterAll("print-after-all",
- llvm::cl::desc("Print IR after each pass"),
- cl::init(false));
-
-/// This is a helper to determine whether to print IR before or
-/// after a pass.
-
-static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
- PassOptionList &PassesToPrint) {
- for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
- const llvm::PassInfo *PassInf = PassesToPrint[i];
- if (PassInf)
- if (PassInf->getPassArgument() == PI->getPassArgument()) {
- return true;
- }
- }
- return false;
-}
-
-/// This is a utility to check whether a pass should have IR dumped
-/// before it.
-static bool ShouldPrintBeforePass(const PassInfo *PI) {
- return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
-}
-
-/// This is a utility to check whether a pass should have IR dumped
-/// after it.
-static bool ShouldPrintAfterPass(const PassInfo *PI) {
- return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
-}
-
-} // End of llvm namespace
-
-/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
-/// or higher is specified.
-bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
- return PassDebugging >= Executions;
-}
-
-
-
-
-void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
- if (V == 0 && M == 0)
- OS << "Releasing pass '";
- else
- OS << "Running pass '";
-
- OS << P->getPassName() << "'";
-
- if (M) {
- OS << " on module '" << M->getModuleIdentifier() << "'.\n";
- return;
- }
- if (V == 0) {
- OS << '\n';
- return;
- }
-
- OS << " on ";
- if (isa<Function>(V))
- OS << "function";
- else if (isa<BasicBlock>(V))
- OS << "basic block";
- else
- OS << "value";
-
- OS << " '";
- WriteAsOperand(OS, V, /*PrintTy=*/false, M);
- OS << "'\n";
+void ModulePassManager::run() {
+ for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx)
+ if (Passes[Idx]->run(M))
+ if (AM) AM->invalidateAll(M);
}
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// BBPassManager
-//
-/// BBPassManager manages BasicBlockPass. It batches all the
-/// pass together and sequence them to process one basic block before
-/// processing next basic block.
-class BBPassManager : public PMDataManager, public FunctionPass {
-
-public:
- static char ID;
- explicit BBPassManager()
- : PMDataManager(), FunctionPass(ID) {}
-
- /// Execute all of the passes scheduled for execution. Keep track of
- /// whether any of the passes modifies the function, and if so, return true.
- bool runOnFunction(Function &F);
-
- /// Pass Manager itself does not invalidate any analysis info.
- void getAnalysisUsage(AnalysisUsage &Info) const {
- Info.setPreservesAll();
- }
-
- bool doInitialization(Module &M);
- bool doInitialization(Function &F);
- bool doFinalization(Module &M);
- bool doFinalization(Function &F);
-
- virtual PMDataManager *getAsPMDataManager() { return this; }
- virtual Pass *getAsPass() { return this; }
-
- virtual const char *getPassName() const {
- return "BasicBlock Pass Manager";
- }
-
- // Print passes managed by this manager
- void dumpPassStructure(unsigned Offset) {
- llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- BasicBlockPass *BP = getContainedPass(Index);
- BP->dumpPassStructure(Offset + 1);
- dumpLastUses(BP, Offset+1);
- }
- }
-
- BasicBlockPass *getContainedPass(unsigned N) {
- assert(N < PassVector.size() && "Pass number out of range!");
- BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
- return BP;
- }
-
- virtual PassManagerType getPassManagerType() const {
- return PMT_BasicBlockPassManager;
- }
-};
-
-char BBPassManager::ID = 0;
-}
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl
-//
-/// FunctionPassManagerImpl manages FPPassManagers
-class FunctionPassManagerImpl : public Pass,
- public PMDataManager,
- public PMTopLevelManager {
- virtual void anchor();
-private:
- bool wasRun;
-public:
- static char ID;
- explicit FunctionPassManagerImpl() :
- Pass(PT_PassManager, ID), PMDataManager(),
- PMTopLevelManager(new FPPassManager()), wasRun(false) {}
-
- /// add - Add a pass to the queue of passes to run. This passes ownership of
- /// the Pass to the PassManager. When the PassManager is destroyed, the pass
- /// will be destroyed as well, so there is no need to delete the pass. This
- /// implies that all passes MUST be allocated with 'new'.
- void add(Pass *P) {
- schedulePass(P);
- }
-
- /// createPrinterPass - Get a function printer pass.
- Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
- return createPrintFunctionPass(Banner, &O);
- }
-
- // Prepare for running an on the fly pass, freeing memory if needed
- // from a previous run.
- void releaseMemoryOnTheFly();
-
- /// run - Execute all of the passes scheduled for execution. Keep track of
- /// whether any of the passes modifies the module, and if so, return true.
- bool run(Function &F);
-
- /// doInitialization - Run all of the initializers for the function passes.
- ///
- bool doInitialization(Module &M);
-
- /// doFinalization - Run all of the finalizers for the function passes.
- ///
- bool doFinalization(Module &M);
-
-
- virtual PMDataManager *getAsPMDataManager() { return this; }
- virtual Pass *getAsPass() { return this; }
- virtual PassManagerType getTopLevelPassManagerType() {
- return PMT_FunctionPassManager;
- }
-
- /// Pass Manager itself does not invalidate any analysis info.
- void getAnalysisUsage(AnalysisUsage &Info) const {
- Info.setPreservesAll();
- }
-
- FPPassManager *getContainedManager(unsigned N) {
- assert(N < PassManagers.size() && "Pass number out of range!");
- FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
- return FP;
- }
-};
-
-void FunctionPassManagerImpl::anchor() {}
-
-char FunctionPassManagerImpl::ID = 0;
-
-//===----------------------------------------------------------------------===//
-// MPPassManager
-//
-/// MPPassManager manages ModulePasses and function pass managers.
-/// It batches all Module passes and function pass managers together and
-/// sequences them to process one module.
-class MPPassManager : public Pass, public PMDataManager {
-public:
- static char ID;
- explicit MPPassManager() :
- Pass(PT_PassManager, ID), PMDataManager() { }
-
- // Delete on the fly managers.
- virtual ~MPPassManager() {
- for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
- I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
- I != E; ++I) {
- FunctionPassManagerImpl *FPP = I->second;
- delete FPP;
- }
- }
-
- /// createPrinterPass - Get a module printer pass.
- Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
- return createPrintModulePass(&O, false, Banner);
- }
-
- /// run - Execute all of the passes scheduled for execution. Keep track of
- /// whether any of the passes modifies the module, and if so, return true.
- bool runOnModule(Module &M);
-
- using llvm::Pass::doInitialization;
- using llvm::Pass::doFinalization;
-
- /// doInitialization - Run all of the initializers for the module passes.
- ///
- bool doInitialization();
-
- /// doFinalization - Run all of the finalizers for the module passes.
- ///
- bool doFinalization();
-
- /// Pass Manager itself does not invalidate any analysis info.
- void getAnalysisUsage(AnalysisUsage &Info) const {
- Info.setPreservesAll();
- }
-
- /// Add RequiredPass into list of lower level passes required by pass P.
- /// RequiredPass is run on the fly by Pass Manager when P requests it
- /// through getAnalysis interface.
- virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
-
- /// Return function pass corresponding to PassInfo PI, that is
- /// required by module pass MP. Instantiate analysis pass, by using
- /// its runOnFunction() for function F.
- virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
-
- virtual const char *getPassName() const {
- return "Module Pass Manager";
- }
-
- virtual PMDataManager *getAsPMDataManager() { return this; }
- virtual Pass *getAsPass() { return this; }
-
- // Print passes managed by this manager
- void dumpPassStructure(unsigned Offset) {
- llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- ModulePass *MP = getContainedPass(Index);
- MP->dumpPassStructure(Offset + 1);
- std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
- OnTheFlyManagers.find(MP);
- if (I != OnTheFlyManagers.end())
- I->second->dumpPassStructure(Offset + 2);
- dumpLastUses(MP, Offset+1);
- }
- }
-
- ModulePass *getContainedPass(unsigned N) {
- assert(N < PassVector.size() && "Pass number out of range!");
- return static_cast<ModulePass *>(PassVector[N]);
- }
-
- virtual PassManagerType getPassManagerType() const {
- return PMT_ModulePassManager;
- }
-
- private:
- /// Collection of on the fly FPPassManagers. These managers manage
- /// function passes that are required by module passes.
- std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
-};
-
-char MPPassManager::ID = 0;
-//===----------------------------------------------------------------------===//
-// PassManagerImpl
-//
-
-/// PassManagerImpl manages MPPassManagers
-class PassManagerImpl : public Pass,
- public PMDataManager,
- public PMTopLevelManager {
- virtual void anchor();
-
-public:
- static char ID;
- explicit PassManagerImpl() :
- Pass(PT_PassManager, ID), PMDataManager(),
- PMTopLevelManager(new MPPassManager()) {}
-
- /// add - Add a pass to the queue of passes to run. This passes ownership of
- /// the Pass to the PassManager. When the PassManager is destroyed, the pass
- /// will be destroyed as well, so there is no need to delete the pass. This
- /// implies that all passes MUST be allocated with 'new'.
- void add(Pass *P) {
- schedulePass(P);
- }
-
- /// createPrinterPass - Get a module printer pass.
- Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
- return createPrintModulePass(&O, false, Banner);
- }
-
- /// run - Execute all of the passes scheduled for execution. Keep track of
- /// whether any of the passes modifies the module, and if so, return true.
- bool run(Module &M);
-
- using llvm::Pass::doInitialization;
- using llvm::Pass::doFinalization;
-
- /// doInitialization - Run all of the initializers for the module passes.
- ///
- bool doInitialization();
-
- /// doFinalization - Run all of the finalizers for the module passes.
- ///
- bool doFinalization();
-
- /// Pass Manager itself does not invalidate any analysis info.
- void getAnalysisUsage(AnalysisUsage &Info) const {
- Info.setPreservesAll();
- }
-
- virtual PMDataManager *getAsPMDataManager() { return this; }
- virtual Pass *getAsPass() { return this; }
- virtual PassManagerType getTopLevelPassManagerType() {
- return PMT_ModulePassManager;
- }
-
- MPPassManager *getContainedManager(unsigned N) {
- assert(N < PassManagers.size() && "Pass number out of range!");
- MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
- return MP;
- }
-};
-
-void PassManagerImpl::anchor() {}
-
-char PassManagerImpl::ID = 0;
-} // End of llvm namespace
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-/// TimingInfo Class - This class is used to calculate information about the
-/// amount of time each pass takes to execute. This only happens when
-/// -time-passes is enabled on the command line.
-///
-
-static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
-
-class TimingInfo {
- DenseMap<Pass*, Timer*> TimingData;
- TimerGroup TG;
-public:
- // Use 'create' member to get this.
- TimingInfo() : TG("... Pass execution timing report ...") {}
-
- // TimingDtor - Print out information about timing information
- ~TimingInfo() {
- // Delete all of the timers, which accumulate their info into the
- // TimerGroup.
- for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
- E = TimingData.end(); I != E; ++I)
- delete I->second;
- // TimerGroup is deleted next, printing the report.
- }
-
- // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
- // to a non null value (if the -time-passes option is enabled) or it leaves it
- // null. It may be called multiple times.
- static void createTheTimeInfo();
-
- /// getPassTimer - Return the timer for the specified pass if it exists.
- Timer *getPassTimer(Pass *P) {
- if (P->getAsPMDataManager())
- return 0;
-
- sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
- Timer *&T = TimingData[P];
- if (T == 0)
- T = new Timer(P->getPassName(), TG);
- return T;
- }
-};
-
-} // End of anon namespace
-
-static TimingInfo *TheTimeInfo;
-
-//===----------------------------------------------------------------------===//
-// PMTopLevelManager implementation
-
-/// Initialize top level manager. Create first pass manager.
-PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
- PMDM->setTopLevelManager(this);
- addPassManager(PMDM);
- activeStack.push(PMDM);
-}
-
-/// Set pass P as the last user of the given analysis passes.
-void
-PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
- unsigned PDepth = 0;
- if (P->getResolver())
- PDepth = P->getResolver()->getPMDataManager().getDepth();
-
- for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
- E = AnalysisPasses.end(); I != E; ++I) {
- Pass *AP = *I;
- LastUser[AP] = P;
-
- if (P == AP)
- continue;
-
- // Update the last users of passes that are required transitive by AP.
- AnalysisUsage *AnUsage = findAnalysisUsage(AP);
- const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
- SmallVector<Pass *, 12> LastUses;
- SmallVector<Pass *, 12> LastPMUses;
- for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
- E = IDs.end(); I != E; ++I) {
- Pass *AnalysisPass = findAnalysisPass(*I);
- assert(AnalysisPass && "Expected analysis pass to exist.");
- AnalysisResolver *AR = AnalysisPass->getResolver();
- assert(AR && "Expected analysis resolver to exist.");
- unsigned APDepth = AR->getPMDataManager().getDepth();
-
- if (PDepth == APDepth)
- LastUses.push_back(AnalysisPass);
- else if (PDepth > APDepth)
- LastPMUses.push_back(AnalysisPass);
- }
-
- setLastUser(LastUses, P);
-
- // If this pass has a corresponding pass manager, push higher level
- // analysis to this pass manager.
- if (P->getResolver())
- setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
-
-
- // If AP is the last user of other passes then make P last user of
- // such passes.
- for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
- LUE = LastUser.end(); LUI != LUE; ++LUI) {
- if (LUI->second == AP)
- // DenseMap iterator is not invalidated here because
- // this is just updating existing entries.
- LastUser[LUI->first] = P;
- }
- }
-}
-
-/// Collect passes whose last user is P
-void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
- Pass *P) {
- DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
- InversedLastUser.find(P);
- if (DMI == InversedLastUser.end())
- return;
-
- SmallPtrSet<Pass *, 8> &LU = DMI->second;
- for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
- E = LU.end(); I != E; ++I) {
- LastUses.push_back(*I);
- }
-
-}
-
-AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
- AnalysisUsage *AnUsage = NULL;
- DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
- if (DMI != AnUsageMap.end())
- AnUsage = DMI->second;
- else {
- AnUsage = new AnalysisUsage();
- P->getAnalysisUsage(*AnUsage);
- AnUsageMap[P] = AnUsage;
- }
- return AnUsage;
-}
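
The AnalysisUsage objects cached here are produced by each pass's getAnalysisUsage override. A minimal sketch of that client side, using a hypothetical MyPass (the pass name and body are illustrative only):

#include "llvm/Pass.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

namespace {
struct MyPass : public FunctionPass {        // hypothetical example pass
  static char ID;
  MyPass() : FunctionPass(ID) {}

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<LoopInfo>();              // scheduled before this pass
    AU.setPreservesCFG();                    // declared preserved analyses
  }

  virtual bool runOnFunction(Function &F) {
    LoopInfo &LI = getAnalysis<LoopInfo>();  // resolved via the resolver
    (void)LI; (void)F;
    return false;                            // analysis-only, nothing changed
  }
};
}
char MyPass::ID = 0;
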
-
-/// Schedule pass P for execution. Make sure that passes required by
-/// P are run before P is run. Update analysis info maintained by
-/// the manager. Remove dead passes. This is a recursive function.
-void PMTopLevelManager::schedulePass(Pass *P) {
-
-  // TODO: Allocate a function manager for this pass, otherwise the required set
-  // may be inserted into the previous function manager.
-
- // Give pass a chance to prepare the stage.
- P->preparePassManager(activeStack);
-
- // If P is an analysis pass and it is available then do not
- // generate the analysis again. Stale analysis info should not be
- // available at this point.
- const PassInfo *PI =
- PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
- if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
- delete P;
- return;
- }
-
- AnalysisUsage *AnUsage = findAnalysisUsage(P);
-
- bool checkAnalysis = true;
- while (checkAnalysis) {
- checkAnalysis = false;
-
- const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
- for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
- E = RequiredSet.end(); I != E; ++I) {
-
- Pass *AnalysisPass = findAnalysisPass(*I);
- if (!AnalysisPass) {
- const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
-
- if (PI == NULL) {
- // Pass P is not in the global PassRegistry
- dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n";
- dbgs() << "Verify if there is a pass dependency cycle." << "\n";
- dbgs() << "Required Passes:" << "\n";
- for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
- E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
- Pass *AnalysisPass2 = findAnalysisPass(*I2);
- if (AnalysisPass2) {
- dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
- } else {
- dbgs() << "\t" << "Error: Required pass not found! Possible causes:" << "\n";
- dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)" << "\n";
- dbgs() << "\t\t" << "- Corruption of the global PassRegistry" << "\n";
- }
- }
- }
-
- assert(PI && "Expected required passes to be initialized");
- AnalysisPass = PI->createPass();
- if (P->getPotentialPassManagerType () ==
- AnalysisPass->getPotentialPassManagerType())
- // Schedule analysis pass that is managed by the same pass manager.
- schedulePass(AnalysisPass);
- else if (P->getPotentialPassManagerType () >
- AnalysisPass->getPotentialPassManagerType()) {
- // Schedule analysis pass that is managed by a new manager.
- schedulePass(AnalysisPass);
- // Recheck analysis passes to ensure that required analyses that
- // are already checked are still available.
- checkAnalysis = true;
- } else
-        // Do not schedule this analysis. Lower level analysis
-        // passes are run on the fly.
- delete AnalysisPass;
+bool FunctionPassManager::run(Module *M) {
+ bool Changed = false;
+ for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
+ for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx)
+ if (Passes[Idx]->run(I)) {
+ Changed = true;
+ if (AM) AM->invalidateAll(I);
}
- }
- }
-
- // Now all required passes are available.
- if (ImmutablePass *IP = P->getAsImmutablePass()) {
-    // P is an immutable pass and it will be managed by this
- // top level manager. Set up analysis resolver to connect them.
- PMDataManager *DM = getAsPMDataManager();
- AnalysisResolver *AR = new AnalysisResolver(*DM);
- P->setResolver(AR);
- DM->initializeAnalysisImpl(P);
- addImmutablePass(IP);
- DM->recordAvailableAnalysis(IP);
- return;
- }
-
- if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
- Pass *PP = P->createPrinterPass(
- dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
- PP->assignPassManager(activeStack, getTopLevelPassManagerType());
- }
-
- // Add the requested pass to the best available pass manager.
- P->assignPassManager(activeStack, getTopLevelPassManagerType());
-
- if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
- Pass *PP = P->createPrinterPass(
- dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
- PP->assignPassManager(activeStack, getTopLevelPassManagerType());
- }
-}
-
-/// Find the pass that implements Analysis AID. Search immutable
-/// passes and all pass managers. If desired pass is not found
-/// then return NULL.
-Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
-
- // Check pass managers
- for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
- E = PassManagers.end(); I != E; ++I)
- if (Pass *P = (*I)->findAnalysisPass(AID, false))
- return P;
-
- // Check other pass managers
- for (SmallVectorImpl<PMDataManager *>::iterator
- I = IndirectPassManagers.begin(),
- E = IndirectPassManagers.end(); I != E; ++I)
- if (Pass *P = (*I)->findAnalysisPass(AID, false))
- return P;
-
- // Check the immutable passes. Iterate in reverse order so that we find
- // the most recently registered passes first.
- for (SmallVectorImpl<ImmutablePass *>::reverse_iterator I =
- ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
- AnalysisID PI = (*I)->getPassID();
- if (PI == AID)
- return *I;
-
-    // If the pass was not found, check the interfaces implemented by the immutable pass.
- const PassInfo *PassInf =
- PassRegistry::getPassRegistry()->getPassInfo(PI);
- assert(PassInf && "Expected all immutable passes to be initialized");
- const std::vector<const PassInfo*> &ImmPI =
- PassInf->getInterfacesImplemented();
- for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
- EE = ImmPI.end(); II != EE; ++II) {
- if ((*II)->getTypeInfo() == AID)
- return *I;
- }
- }
-
- return 0;
-}
-
-// Print passes managed by this top level manager.
-void PMTopLevelManager::dumpPasses() const {
-
- if (PassDebugging < Structure)
- return;
-
- // Print out the immutable passes
- for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
- ImmutablePasses[i]->dumpPassStructure(0);
- }
-
- // Every class that derives from PMDataManager also derives from Pass
- // (sometimes indirectly), but there's no inheritance relationship
- // between PMDataManager and Pass, so we have to getAsPass to get
- // from a PMDataManager* to a Pass*.
- for (SmallVectorImpl<PMDataManager *>::const_iterator I =
- PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
- (*I)->getAsPass()->dumpPassStructure(1);
-}
-
-void PMTopLevelManager::dumpArguments() const {
-
- if (PassDebugging < Arguments)
- return;
-
- dbgs() << "Pass Arguments: ";
- for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
- ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
- if (const PassInfo *PI =
- PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
- assert(PI && "Expected all immutable passes to be initialized");
- if (!PI->isAnalysisGroup())
- dbgs() << " -" << PI->getPassArgument();
- }
- for (SmallVectorImpl<PMDataManager *>::const_iterator I =
- PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
- (*I)->dumpPassArguments();
- dbgs() << "\n";
+ return Changed;
}
-void PMTopLevelManager::initializeAllAnalysisInfo() {
- for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
- E = PassManagers.end(); I != E; ++I)
- (*I)->initializeAnalysisInfo();
+void AnalysisManager::invalidateAll(Function *F) {
+ assert(F->getParent() == M && "Invalidating a function from another module!");
-  // Initialize other pass managers
- for (SmallVectorImpl<PMDataManager *>::iterator
- I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+  // First invalidate any module results we still have lying around.
+ // FIXME: This is a total hack based on the fact that erasure doesn't
+ // invalidate iteration for DenseMap.
+ for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
+ E = ModuleAnalysisResults.end();
I != E; ++I)
- (*I)->initializeAnalysisInfo();
-
- for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
- DME = LastUser.end(); DMI != DME; ++DMI) {
- DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
- InversedLastUser.find(DMI->second);
- if (InvDMI != InversedLastUser.end()) {
- SmallPtrSet<Pass *, 8> &L = InvDMI->second;
- L.insert(DMI->first);
+ if (I->second->invalidate(M))
+ ModuleAnalysisResults.erase(I);
+
+ // Now clear all the invalidated results associated specifically with this
+ // function.
+ SmallVector<void *, 8> InvalidatedPassIDs;
+ FunctionAnalysisResultListT &ResultsList = FunctionAnalysisResultLists[F];
+ for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
+ E = ResultsList.end();
+ I != E;)
+ if (I->second->invalidate(F)) {
+ InvalidatedPassIDs.push_back(I->first);
+ I = ResultsList.erase(I);
} else {
- SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
- InversedLastUser[DMI->second] = L;
+ ++I;
}
- }
-}
-
-/// Destructor
-PMTopLevelManager::~PMTopLevelManager() {
- for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
- E = PassManagers.end(); I != E; ++I)
- delete *I;
-
- for (SmallVectorImpl<ImmutablePass *>::iterator
- I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
- delete *I;
-
- for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
- DME = AnUsageMap.end(); DMI != DME; ++DMI)
- delete DMI->second;
-}
-
-//===----------------------------------------------------------------------===//
-// PMDataManager implementation
-
-/// Augment AvailableAnalysis by adding analysis made available by pass P.
-void PMDataManager::recordAvailableAnalysis(Pass *P) {
- AnalysisID PI = P->getPassID();
-
- AvailableAnalysis[PI] = P;
-
- assert(!AvailableAnalysis.empty());
-
- // This pass is the current implementation of all of the interfaces it
- // implements as well.
- const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
- if (PInf == 0) return;
- const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
- for (unsigned i = 0, e = II.size(); i != e; ++i)
- AvailableAnalysis[II[i]->getTypeInfo()] = P;
+ while (!InvalidatedPassIDs.empty())
+ FunctionAnalysisResults.erase(
+ std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
}
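
The loop above has a deliberate two-phase shape: invalidated results are unlinked from the per-function list immediately, but the (PassID, Function) index entries are erased only afterwards, so neither container is mutated while the other is being walked. A standalone sketch of the same pattern, with stand-in types in place of the real analysis machinery:

#include <list>
#include <map>
#include <utility>
#include <vector>

typedef std::list<std::pair<void *, bool> > ResultList;   // bool: still valid?
typedef std::map<std::pair<void *, int *>, ResultList::iterator> ResultIndex;

static void invalidateFunction(int *F, ResultList &Results, ResultIndex &Index) {
  std::vector<void *> DeadIDs;
  for (ResultList::iterator I = Results.begin(), E = Results.end(); I != E;)
    if (!I->second) {                 // stand-in for Result->invalidate(F)
      DeadIDs.push_back(I->first);
      I = Results.erase(I);
    } else {
      ++I;
    }
  while (!DeadIDs.empty()) {
    Index.erase(std::make_pair(DeadIDs.back(), F));
    DeadIDs.pop_back();
  }
}
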
-// Return true if P preserves high level analysis used by other
-// passes managed by this manager
-bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
- AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
- if (AnUsage->getPreservesAll())
- return true;
-
- const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
- for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
- E = HigherLevelAnalysis.end(); I != E; ++I) {
- Pass *P1 = *I;
- if (P1->getAsImmutablePass() == 0 &&
- std::find(PreservedSet.begin(), PreservedSet.end(),
- P1->getPassID()) ==
- PreservedSet.end())
- return false;
- }
-
- return true;
-}
-
-/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
-void PMDataManager::verifyPreservedAnalysis(Pass *P) {
- // Don't do this unless assertions are enabled.
-#ifdef NDEBUG
- return;
-#endif
- AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
- const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-
- // Verify preserved analysis
- for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
- E = PreservedSet.end(); I != E; ++I) {
- AnalysisID AID = *I;
- if (Pass *AP = findAnalysisPass(AID, true)) {
- TimeRegion PassTimer(getPassTimer(AP));
- AP->verifyAnalysis();
- }
- }
-}
-
-/// Remove Analysis not preserved by Pass P
-void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
- AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
- if (AnUsage->getPreservesAll())
- return;
-
- const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
- for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
- E = AvailableAnalysis.end(); I != E; ) {
- DenseMap<AnalysisID, Pass*>::iterator Info = I++;
- if (Info->second->getAsImmutablePass() == 0 &&
- std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
- PreservedSet.end()) {
- // Remove this analysis
- if (PassDebugging >= Details) {
- Pass *S = Info->second;
- dbgs() << " -- '" << P->getPassName() << "' is not preserving '";
- dbgs() << S->getPassName() << "'\n";
- }
- AvailableAnalysis.erase(Info);
- }
- }
-
-  // Also check inherited analyses. If P does not preserve an analysis
-  // provided by a parent manager, remove it here.
- for (unsigned Index = 0; Index < PMT_Last; ++Index) {
-
- if (!InheritedAnalysis[Index])
- continue;
-
- for (DenseMap<AnalysisID, Pass*>::iterator
- I = InheritedAnalysis[Index]->begin(),
- E = InheritedAnalysis[Index]->end(); I != E; ) {
- DenseMap<AnalysisID, Pass *>::iterator Info = I++;
- if (Info->second->getAsImmutablePass() == 0 &&
- std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
- PreservedSet.end()) {
- // Remove this analysis
- if (PassDebugging >= Details) {
- Pass *S = Info->second;
- dbgs() << " -- '" << P->getPassName() << "' is not preserving '";
- dbgs() << S->getPassName() << "'\n";
- }
- InheritedAnalysis[Index]->erase(Info);
- }
- }
- }
-}
-
-/// Remove analysis passes that are not used any longer
-void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
- enum PassDebuggingString DBG_STR) {
-
- SmallVector<Pass *, 12> DeadPasses;
-
-  // If this is an on-the-fly manager then it does not have a TPM.
- if (!TPM)
- return;
-
- TPM->collectLastUses(DeadPasses, P);
-
- if (PassDebugging >= Details && !DeadPasses.empty()) {
- dbgs() << " -*- '" << P->getPassName();
- dbgs() << "' is the last user of following pass instances.";
- dbgs() << " Free these instances\n";
- }
-
- for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
- E = DeadPasses.end(); I != E; ++I)
- freePass(*I, Msg, DBG_STR);
-}
-
-void PMDataManager::freePass(Pass *P, StringRef Msg,
- enum PassDebuggingString DBG_STR) {
- dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
-
- {
- // If the pass crashes releasing memory, remember this.
- PassManagerPrettyStackEntry X(P);
- TimeRegion PassTimer(getPassTimer(P));
-
- P->releaseMemory();
- }
-
- AnalysisID PI = P->getPassID();
- if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
- // Remove the pass itself (if it is not already removed).
- AvailableAnalysis.erase(PI);
-
- // Remove all interfaces this pass implements, for which it is also
- // listed as the available implementation.
- const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
- for (unsigned i = 0, e = II.size(); i != e; ++i) {
- DenseMap<AnalysisID, Pass*>::iterator Pos =
- AvailableAnalysis.find(II[i]->getTypeInfo());
- if (Pos != AvailableAnalysis.end() && Pos->second == P)
- AvailableAnalysis.erase(Pos);
- }
- }
-}
-
-/// Add pass P into the PassVector. Update
-/// AvailableAnalysis appropriately if ProcessAnalysis is true.
-void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
- // This manager is going to manage pass P. Set up analysis resolver
- // to connect them.
- AnalysisResolver *AR = new AnalysisResolver(*this);
- P->setResolver(AR);
-
- // If a FunctionPass F is the last user of ModulePass info M
-  // then F's manager, not F, records itself as the last user of M.
- SmallVector<Pass *, 12> TransferLastUses;
-
- if (!ProcessAnalysis) {
- // Add pass
- PassVector.push_back(P);
- return;
- }
-
- // At the moment, this pass is the last user of all required passes.
- SmallVector<Pass *, 12> LastUses;
- SmallVector<Pass *, 8> RequiredPasses;
- SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
-
- unsigned PDepth = this->getDepth();
-
- collectRequiredAnalysis(RequiredPasses,
- ReqAnalysisNotAvailable, P);
- for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
- E = RequiredPasses.end(); I != E; ++I) {
- Pass *PRequired = *I;
- unsigned RDepth = 0;
-
- assert(PRequired->getResolver() && "Analysis Resolver is not set");
- PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
- RDepth = DM.getDepth();
-
- if (PDepth == RDepth)
- LastUses.push_back(PRequired);
- else if (PDepth > RDepth) {
- // Let the parent claim responsibility of last use
- TransferLastUses.push_back(PRequired);
- // Keep track of higher level analysis used by this manager.
- HigherLevelAnalysis.push_back(PRequired);
- } else
- llvm_unreachable("Unable to accommodate Required Pass");
- }
-
- // Set P as P's last user until someone starts using P.
- // However, if P is a Pass Manager then it does not need
- // to record its last user.
- if (P->getAsPMDataManager() == 0)
- LastUses.push_back(P);
- TPM->setLastUser(LastUses, P);
-
- if (!TransferLastUses.empty()) {
- Pass *My_PM = getAsPass();
- TPM->setLastUser(TransferLastUses, My_PM);
- TransferLastUses.clear();
- }
-
- // Now, take care of required analyses that are not available.
- for (SmallVectorImpl<AnalysisID>::iterator
- I = ReqAnalysisNotAvailable.begin(),
- E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
- const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
- Pass *AnalysisPass = PI->createPass();
- this->addLowerLevelRequiredPass(P, AnalysisPass);
- }
-
- // Take a note of analysis required and made available by this pass.
- // Remove the analysis not preserved by this pass
- removeNotPreservedAnalysis(P);
- recordAvailableAnalysis(P);
-
- // Add pass
- PassVector.push_back(P);
-}
-
-
-/// Populate RP with analysis passes that are required by
-/// pass P and are available. Populate RP_NotAvail with analysis
-/// passes that are required by pass P but are not available.
-void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
- SmallVectorImpl<AnalysisID> &RP_NotAvail,
- Pass *P) {
- AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
- const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
- for (AnalysisUsage::VectorType::const_iterator
- I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
- if (Pass *AnalysisPass = findAnalysisPass(*I, true))
- RP.push_back(AnalysisPass);
- else
- RP_NotAvail.push_back(*I);
- }
-
- const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
- for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
- E = IDs.end(); I != E; ++I) {
- if (Pass *AnalysisPass = findAnalysisPass(*I, true))
- RP.push_back(AnalysisPass);
- else
- RP_NotAvail.push_back(*I);
- }
-}
-
-// All Required analyses should be available to the pass as it runs! Here
-// we fill in the AnalysisImpls member of the pass so that it can
-// successfully use the getAnalysis() method to retrieve the
-// implementations it needs.
-//
-void PMDataManager::initializeAnalysisImpl(Pass *P) {
- AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-
- for (AnalysisUsage::VectorType::const_iterator
- I = AnUsage->getRequiredSet().begin(),
- E = AnUsage->getRequiredSet().end(); I != E; ++I) {
- Pass *Impl = findAnalysisPass(*I, true);
- if (Impl == 0)
-      // This may be an analysis pass that is initialized on the fly.
- // If that is not the case then it will raise an assert when it is used.
- continue;
- AnalysisResolver *AR = P->getResolver();
- assert(AR && "Analysis Resolver is not set");
- AR->addAnalysisImplsPair(*I, Impl);
- }
-}
-
-/// Find the pass that implements Analysis AID. If desired pass is not found
-/// then return NULL.
-Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
-
- // Check if AvailableAnalysis map has one entry.
- DenseMap<AnalysisID, Pass*>::const_iterator I = AvailableAnalysis.find(AID);
-
- if (I != AvailableAnalysis.end())
- return I->second;
-
- // Search Parents through TopLevelManager
- if (SearchParent)
- return TPM->findAnalysisPass(AID);
-
- return NULL;
-}
-
-// Print list of passes that are last used by P.
-void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
-
- SmallVector<Pass *, 12> LUses;
-
-  // If this is an on-the-fly manager then it does not have a TPM.
- if (!TPM)
- return;
-
- TPM->collectLastUses(LUses, P);
-
- for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
- E = LUses.end(); I != E; ++I) {
- llvm::dbgs() << "--" << std::string(Offset*2, ' ');
- (*I)->dumpPassStructure(0);
- }
-}
-
-void PMDataManager::dumpPassArguments() const {
- for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
- E = PassVector.end(); I != E; ++I) {
- if (PMDataManager *PMD = (*I)->getAsPMDataManager())
- PMD->dumpPassArguments();
- else
- if (const PassInfo *PI =
- PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
- if (!PI->isAnalysisGroup())
- dbgs() << " -" << PI->getPassArgument();
- }
-}
-
-void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
- enum PassDebuggingString S2,
- StringRef Msg) {
- if (PassDebugging < Executions)
- return;
- dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
- switch (S1) {
- case EXECUTION_MSG:
- dbgs() << "Executing Pass '" << P->getPassName();
- break;
- case MODIFICATION_MSG:
- dbgs() << "Made Modification '" << P->getPassName();
- break;
- case FREEING_MSG:
- dbgs() << " Freeing Pass '" << P->getPassName();
- break;
- default:
- break;
- }
- switch (S2) {
- case ON_BASICBLOCK_MSG:
- dbgs() << "' on BasicBlock '" << Msg << "'...\n";
- break;
- case ON_FUNCTION_MSG:
- dbgs() << "' on Function '" << Msg << "'...\n";
- break;
- case ON_MODULE_MSG:
- dbgs() << "' on Module '" << Msg << "'...\n";
- break;
- case ON_REGION_MSG:
- dbgs() << "' on Region '" << Msg << "'...\n";
- break;
- case ON_LOOP_MSG:
- dbgs() << "' on Loop '" << Msg << "'...\n";
- break;
- case ON_CG_MSG:
- dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
- break;
- default:
- break;
- }
-}
-
-void PMDataManager::dumpRequiredSet(const Pass *P) const {
- if (PassDebugging < Details)
- return;
-
- AnalysisUsage analysisUsage;
- P->getAnalysisUsage(analysisUsage);
- dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
-}
-
-void PMDataManager::dumpPreservedSet(const Pass *P) const {
- if (PassDebugging < Details)
- return;
-
- AnalysisUsage analysisUsage;
- P->getAnalysisUsage(analysisUsage);
- dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
-}
-
-void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
- const AnalysisUsage::VectorType &Set) const {
- assert(PassDebugging >= Details);
- if (Set.empty())
- return;
- dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
- for (unsigned i = 0; i != Set.size(); ++i) {
- if (i) dbgs() << ',';
- const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
- if (!PInf) {
- // Some preserved passes, such as AliasAnalysis, may not be initialized by
- // all drivers.
- dbgs() << " Uninitialized Pass";
- continue;
- }
- dbgs() << ' ' << PInf->getPassName();
- }
- dbgs() << '\n';
-}
-
-/// Add RequiredPass into list of lower level passes required by pass P.
-/// RequiredPass is run on the fly by Pass Manager when P requests it
-/// through getAnalysis interface.
-/// This should be handled by specific pass manager.
-void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
- if (TPM) {
- TPM->dumpArguments();
- TPM->dumpPasses();
- }
-
-  // A module-level pass may require function-level analysis info
-  // (e.g. dominator info). The pass manager uses an on-the-fly function pass
-  // manager to provide this on demand. In pass manager terminology, the
-  // module-level pass is requiring lower-level analysis info managed by a
-  // lower-level pass manager.
-
-  // When the pass manager is not able to order the required analysis info, it
-  // checks whether any lower-level manager will be able to provide this
-  // analysis info on demand.
-#ifndef NDEBUG
- dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
- dbgs() << "' required by '" << P->getPassName() << "'\n";
-#endif
- llvm_unreachable("Unable to schedule pass");
-}
-
-Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
- llvm_unreachable("Unable to find on the fly pass");
-}
-
-// Destructor
-PMDataManager::~PMDataManager() {
- for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
- E = PassVector.end(); I != E; ++I)
- delete *I;
-}
-
-//===----------------------------------------------------------------------===//
-// NOTE: Is this the right place to define this method?
-// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
-Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
- return PM.findAnalysisPass(ID, dir);
-}
-
-Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
- Function &F) {
- return PM.getOnTheFlyPass(P, AnalysisPI, F);
-}
-
-//===----------------------------------------------------------------------===//
-// BBPassManager implementation
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnBasicBlock method. Keep track of whether any of the passes modifies
-/// the function, and if so, return true.
-bool BBPassManager::runOnFunction(Function &F) {
- if (F.isDeclaration())
- return false;
-
- bool Changed = doInitialization(F);
-
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- BasicBlockPass *BP = getContainedPass(Index);
- bool LocalChanged = false;
-
- dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
- dumpRequiredSet(BP);
-
- initializeAnalysisImpl(BP);
-
- {
- // If the pass crashes, remember this.
- PassManagerPrettyStackEntry X(BP, *I);
- TimeRegion PassTimer(getPassTimer(BP));
-
- LocalChanged |= BP->runOnBasicBlock(*I);
+void AnalysisManager::invalidateAll(Module *M) {
+  // First invalidate any module results we still have lying around.
+ // FIXME: This is a total hack based on the fact that erasure doesn't
+ // invalidate iteration for DenseMap.
+ for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
+ E = ModuleAnalysisResults.end();
+ I != E; ++I)
+ if (I->second->invalidate(M))
+ ModuleAnalysisResults.erase(I);
+
+ // Now walk all of the functions for which there are cached results, and
+ // attempt to invalidate each of those as the entire module may have changed.
+ // FIXME: How do we handle functions which have been deleted or RAUWed?
+ SmallVector<void *, 8> InvalidatedPassIDs;
+ for (FunctionAnalysisResultListMapT::iterator
+ FI = FunctionAnalysisResultLists.begin(),
+ FE = FunctionAnalysisResultLists.end();
+ FI != FE; ++FI) {
+ Function *F = FI->first;
+ FunctionAnalysisResultListT &ResultsList = FI->second;
+ for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
+ E = ResultsList.end();
+ I != E;)
+ if (I->second->invalidate(F)) {
+ InvalidatedPassIDs.push_back(I->first);
+ I = ResultsList.erase(I);
+ } else {
+ ++I;
}
-
- Changed |= LocalChanged;
- if (LocalChanged)
- dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
- I->getName());
- dumpPreservedSet(BP);
-
- verifyPreservedAnalysis(BP);
- removeNotPreservedAnalysis(BP);
- recordAvailableAnalysis(BP);
- removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
- }
-
- return doFinalization(F) || Changed;
-}
-
-// Implement doInitialization and doFinalization
-bool BBPassManager::doInitialization(Module &M) {
- bool Changed = false;
-
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
- Changed |= getContainedPass(Index)->doInitialization(M);
-
- return Changed;
-}
-
-bool BBPassManager::doFinalization(Module &M) {
- bool Changed = false;
-
- for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
- Changed |= getContainedPass(Index)->doFinalization(M);
-
- return Changed;
-}
-
-bool BBPassManager::doInitialization(Function &F) {
- bool Changed = false;
-
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- BasicBlockPass *BP = getContainedPass(Index);
- Changed |= BP->doInitialization(F);
+ while (!InvalidatedPassIDs.empty())
+ FunctionAnalysisResults.erase(
+ std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
}
-
- return Changed;
}
-bool BBPassManager::doFinalization(Function &F) {
- bool Changed = false;
+const AnalysisManager::AnalysisResultConcept<Module> &
+AnalysisManager::getResultImpl(void *PassID, Module *M) {
+ assert(M == this->M && "Wrong module used when querying the AnalysisManager");
+ ModuleAnalysisResultMapT::iterator RI;
+ bool Inserted;
+ llvm::tie(RI, Inserted) = ModuleAnalysisResults.insert(std::make_pair(
+ PassID, polymorphic_ptr<AnalysisResultConcept<Module> >()));
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- BasicBlockPass *BP = getContainedPass(Index);
- Changed |= BP->doFinalization(F);
+ if (Inserted) {
+    // We don't have a cached result for this pass. Look up the pass and run
+ // it to produce a result, which we then add to the cache.
+ ModuleAnalysisPassMapT::const_iterator PI =
+ ModuleAnalysisPasses.find(PassID);
+ assert(PI != ModuleAnalysisPasses.end() &&
+ "Analysis passes must be registered prior to being queried!");
+ RI->second = PI->second->run(M);
}
- return Changed;
+ return *RI->second;
}
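
The caching idiom above, reduced to its essentials: insert a placeholder first, and let the returned bool say whether the result still has to be computed, so a single lookup serves both the hit and the miss path. A toy version with std::map:

#include <map>
#include <utility>

static int &getOrCompute(std::map<int, int> &Cache, int Key) {
  std::pair<std::map<int, int>::iterator, bool> R =
      Cache.insert(std::make_pair(Key, 0));
  if (R.second)                      // freshly inserted: compute and store
    R.first->second = Key * Key;     // stand-in for "run the analysis pass"
  return R.first->second;
}
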
+const AnalysisManager::AnalysisResultConcept<Function> &
+AnalysisManager::getResultImpl(void *PassID, Function *F) {
+ assert(F->getParent() == M && "Analyzing a function from another module!");
-//===----------------------------------------------------------------------===//
-// FunctionPassManager implementation
-
-/// Create new Function pass manager
-FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
- FPM = new FunctionPassManagerImpl();
- // FPM is the top level manager.
- FPM->setTopLevelManager(FPM);
-
- AnalysisResolver *AR = new AnalysisResolver(*FPM);
- FPM->setResolver(AR);
-}
-
-FunctionPassManager::~FunctionPassManager() {
- delete FPM;
-}
+ FunctionAnalysisResultMapT::iterator RI;
+ bool Inserted;
+ llvm::tie(RI, Inserted) = FunctionAnalysisResults.insert(std::make_pair(
+ std::make_pair(PassID, F), FunctionAnalysisResultListT::iterator()));
-/// add - Add a pass to the queue of passes to run. This passes
-/// ownership of the Pass to the PassManager. When the
-/// PassManager_X is destroyed, the pass will be destroyed as well, so
-/// there is no need to delete the pass. (TODO delete passes.)
-/// This implies that all passes MUST be allocated with 'new'.
-void FunctionPassManager::add(Pass *P) {
- FPM->add(P);
-}
-
-/// run - Execute all of the passes scheduled for execution. Keep
-/// track of whether any of the passes modifies the function, and if
-/// so, return true.
-///
-bool FunctionPassManager::run(Function &F) {
- if (F.isMaterializable()) {
- std::string errstr;
- if (F.Materialize(&errstr))
- report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+ if (Inserted) {
+    // We don't have a cached result for this pass. Look up the pass and run
+ // it to produce a result, which we then add to the cache.
+ FunctionAnalysisPassMapT::const_iterator PI =
+ FunctionAnalysisPasses.find(PassID);
+ assert(PI != FunctionAnalysisPasses.end() &&
+ "Analysis passes must be registered prior to being queried!");
+ FunctionAnalysisResultListT &ResultList = FunctionAnalysisResultLists[F];
+ ResultList.push_back(std::make_pair(PassID, PI->second->run(F)));
+ RI->second = llvm::prior(ResultList.end());
}
- return FPM->run(F);
-}
-
-/// doInitialization - Run all of the initializers for the function passes.
-///
-bool FunctionPassManager::doInitialization() {
- return FPM->doInitialization(*M);
-}
-
-/// doFinalization - Run all of the finalizers for the function passes.
-///
-bool FunctionPassManager::doFinalization() {
- return FPM->doFinalization(*M);
-}
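
Typical driver-side use of the wrapper defined here, sketched under the assumption that P is a heap-allocated FunctionPass handed over to the manager:

#include "llvm/PassManager.h"
#include "llvm/IR/Module.h"
using namespace llvm;

static bool optimizeModule(Module &M, Pass *P) {
  FunctionPassManager FPM(&M);
  FPM.add(P);                                   // FPM takes ownership of P
  bool Changed = FPM.doInitialization();
  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
    if (!F->isDeclaration())
      Changed |= FPM.run(*F);
  return FPM.doFinalization() || Changed;
}
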
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl implementation
-//
-bool FunctionPassManagerImpl::doInitialization(Module &M) {
- bool Changed = false;
-
- dumpArguments();
- dumpPasses();
-
- SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
- for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
- E = IPV.end(); I != E; ++I) {
- Changed |= (*I)->doInitialization(M);
- }
-
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
- Changed |= getContainedManager(Index)->doInitialization(M);
-
- return Changed;
+ return *RI->second->second;
}
-bool FunctionPassManagerImpl::doFinalization(Module &M) {
- bool Changed = false;
-
- for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
- Changed |= getContainedManager(Index)->doFinalization(M);
-
- SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
- for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
- E = IPV.end(); I != E; ++I) {
- Changed |= (*I)->doFinalization(M);
- }
-
- return Changed;
+void AnalysisManager::invalidateImpl(void *PassID, Module *M) {
+ assert(M == this->M && "Invalidating a pass over a different module!");
+ ModuleAnalysisResults.erase(PassID);
}
-/// cleanup - After running all passes, clean up pass manager cache.
-void FPPassManager::cleanup() {
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- FunctionPass *FP = getContainedPass(Index);
- AnalysisResolver *AR = FP->getResolver();
- assert(AR && "Analysis Resolver is not set");
- AR->clearAnalysisImpls();
- }
-}
+void AnalysisManager::invalidateImpl(void *PassID, Function *F) {
+ assert(F->getParent() == M &&
+ "Invalidating a pass over a function from another module!");
-void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
- if (!wasRun)
+ FunctionAnalysisResultMapT::iterator RI =
+ FunctionAnalysisResults.find(std::make_pair(PassID, F));
+ if (RI == FunctionAnalysisResults.end())
return;
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
- FPPassManager *FPPM = getContainedManager(Index);
- for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
- FPPM->getContainedPass(Index)->releaseMemory();
- }
- }
- wasRun = false;
-}
-
-// Execute all the passes managed by this top level manager.
-// Return true if any function is modified by a pass.
-bool FunctionPassManagerImpl::run(Function &F) {
- bool Changed = false;
- TimingInfo::createTheTimeInfo();
-
- initializeAllAnalysisInfo();
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
- Changed |= getContainedManager(Index)->runOnFunction(F);
-
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
- getContainedManager(Index)->cleanup();
-
- wasRun = true;
- return Changed;
-}
-//===----------------------------------------------------------------------===//
-// FPPassManager implementation
-
-char FPPassManager::ID = 0;
-/// Print passes managed by this manager
-void FPPassManager::dumpPassStructure(unsigned Offset) {
- dbgs().indent(Offset*2) << "FunctionPass Manager\n";
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- FunctionPass *FP = getContainedPass(Index);
- FP->dumpPassStructure(Offset + 1);
- dumpLastUses(FP, Offset+1);
- }
-}
-
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnFunction method. Keep track of whether any of the passes modifies
-/// the function, and if so, return true.
-bool FPPassManager::runOnFunction(Function &F) {
- if (F.isDeclaration())
- return false;
-
- bool Changed = false;
-
- // Collect inherited analysis from Module level pass manager.
- populateInheritedAnalysis(TPM->activeStack);
-
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- FunctionPass *FP = getContainedPass(Index);
- bool LocalChanged = false;
-
- dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
- dumpRequiredSet(FP);
-
- initializeAnalysisImpl(FP);
-
- {
- PassManagerPrettyStackEntry X(FP, F);
- TimeRegion PassTimer(getPassTimer(FP));
-
- LocalChanged |= FP->runOnFunction(F);
- }
-
- Changed |= LocalChanged;
- if (LocalChanged)
- dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
- dumpPreservedSet(FP);
-
- verifyPreservedAnalysis(FP);
- removeNotPreservedAnalysis(FP);
- recordAvailableAnalysis(FP);
- removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
- }
- return Changed;
-}
-
-bool FPPassManager::runOnModule(Module &M) {
- bool Changed = false;
-
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
- Changed |= runOnFunction(*I);
-
- return Changed;
-}
-
-bool FPPassManager::doInitialization(Module &M) {
- bool Changed = false;
-
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
- Changed |= getContainedPass(Index)->doInitialization(M);
-
- return Changed;
-}
-
-bool FPPassManager::doFinalization(Module &M) {
- bool Changed = false;
-
- for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
- Changed |= getContainedPass(Index)->doFinalization(M);
-
- return Changed;
+ FunctionAnalysisResultLists[F].erase(RI->second);
}
-
-//===----------------------------------------------------------------------===//
-// MPPassManager implementation
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnModule method. Keep track of whether any of the passes modifies
-/// the module, and if so, return true.
-bool
-MPPassManager::runOnModule(Module &M) {
- bool Changed = false;
-
- // Initialize on-the-fly passes
- for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
- I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
- I != E; ++I) {
- FunctionPassManagerImpl *FPP = I->second;
- Changed |= FPP->doInitialization(M);
- }
-
- // Initialize module passes
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
- Changed |= getContainedPass(Index)->doInitialization(M);
-
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- ModulePass *MP = getContainedPass(Index);
- bool LocalChanged = false;
-
- dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
- dumpRequiredSet(MP);
-
- initializeAnalysisImpl(MP);
-
- {
- PassManagerPrettyStackEntry X(MP, M);
- TimeRegion PassTimer(getPassTimer(MP));
-
- LocalChanged |= MP->runOnModule(M);
- }
-
- Changed |= LocalChanged;
- if (LocalChanged)
- dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
- M.getModuleIdentifier());
- dumpPreservedSet(MP);
-
- verifyPreservedAnalysis(MP);
- removeNotPreservedAnalysis(MP);
- recordAvailableAnalysis(MP);
- removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
- }
-
- // Finalize module passes
- for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
- Changed |= getContainedPass(Index)->doFinalization(M);
-
- // Finalize on-the-fly passes
- for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
- I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
- I != E; ++I) {
- FunctionPassManagerImpl *FPP = I->second;
-    // We don't know when an on-the-fly pass is run for the last time,
-    // so we need to releaseMemory / finalize here.
- FPP->releaseMemoryOnTheFly();
- Changed |= FPP->doFinalization(M);
- }
-
- return Changed;
-}
-
-/// Add RequiredPass into list of lower level passes required by pass P.
-/// RequiredPass is run on the fly by Pass Manager when P requests it
-/// through getAnalysis interface.
-void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
- assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
- "Unable to handle Pass that requires lower level Analysis pass");
- assert((P->getPotentialPassManagerType() <
- RequiredPass->getPotentialPassManagerType()) &&
- "Unable to handle Pass that requires lower level Analysis pass");
-
- FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
- if (!FPP) {
- FPP = new FunctionPassManagerImpl();
- // FPP is the top level manager.
- FPP->setTopLevelManager(FPP);
-
- OnTheFlyManagers[P] = FPP;
- }
- FPP->add(RequiredPass);
-
- // Register P as the last user of RequiredPass.
- if (RequiredPass) {
- SmallVector<Pass *, 1> LU;
- LU.push_back(RequiredPass);
- FPP->setLastUser(LU, P);
- }
-}
-
-/// Return the function pass corresponding to PassInfo PI that is required by
-/// module pass MP. Instantiate the analysis pass by running its
-/// runOnFunction() on function F.
-Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
- FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
- assert(FPP && "Unable to find on the fly pass");
-
- FPP->releaseMemoryOnTheFly();
- FPP->run(F);
- return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
-}
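
What this looks like from the module pass's side, sketched with a hypothetical CountLoops pass: requiring a function-level analysis and then calling getAnalysis with an explicit Function argument is exactly the request that routes through getOnTheFlyPass above.

#include "llvm/Pass.h"
#include "llvm/IR/Module.h"
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

namespace {
struct CountLoops : public ModulePass {        // hypothetical example pass
  static char ID;
  CountLoops() : ModulePass(ID) {}

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<LoopInfo>();                // a function-level analysis
    AU.setPreservesAll();
  }

  virtual bool runOnModule(Module &M) {
    unsigned TopLevelLoops = 0;
    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
      if (!F->isDeclaration()) {
        // Triggers the on-the-fly manager: LoopInfo is computed for F now.
        LoopInfo &LI = getAnalysis<LoopInfo>(*F);
        TopLevelLoops += LI.end() - LI.begin();
      }
    (void)TopLevelLoops;
    return false;
  }
};
}
char CountLoops::ID = 0;
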
-
-
-//===----------------------------------------------------------------------===//
-// PassManagerImpl implementation
-
-//
-/// run - Execute all of the passes scheduled for execution. Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManagerImpl::run(Module &M) {
- bool Changed = false;
- TimingInfo::createTheTimeInfo();
-
- dumpArguments();
- dumpPasses();
-
- SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
- for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
- E = IPV.end(); I != E; ++I) {
- Changed |= (*I)->doInitialization(M);
- }
-
- initializeAllAnalysisInfo();
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
- Changed |= getContainedManager(Index)->runOnModule(M);
-
- for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
- E = IPV.end(); I != E; ++I) {
- Changed |= (*I)->doFinalization(M);
- }
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// PassManager implementation
-
-/// Create new pass manager
-PassManager::PassManager() {
- PM = new PassManagerImpl();
- // PM is the top level manager
- PM->setTopLevelManager(PM);
-}
-
-PassManager::~PassManager() {
- delete PM;
-}
-
-/// add - Add a pass to the queue of passes to run. This passes ownership of
-/// the Pass to the PassManager. When the PassManager is destroyed, the pass
-/// will be destroyed as well, so there is no need to delete the pass. This
-/// implies that all passes MUST be allocated with 'new'.
-void PassManager::add(Pass *P) {
- PM->add(P);
-}
-
-/// run - Execute all of the passes scheduled for execution. Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManager::run(Module &M) {
- return PM->run(M);
-}
-
-//===----------------------------------------------------------------------===//
-// TimingInfo implementation
-
-bool llvm::TimePassesIsEnabled = false;
-static cl::opt<bool,true>
-EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
- cl::desc("Time each pass, printing elapsed time for each on exit"));
-
-// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
-// a non null value (if the -time-passes option is enabled) or it leaves it
-// null. It may be called multiple times.
-void TimingInfo::createTheTimeInfo() {
- if (!TimePassesIsEnabled || TheTimeInfo) return;
-
- // Constructed the first time this is called, iff -time-passes is enabled.
- // This guarantees that the object will be constructed before static globals,
- // thus it will be destroyed before them.
- static ManagedStatic<TimingInfo> TTI;
- TheTimeInfo = &*TTI;
-}
-
-/// If TimingInfo is enabled then start pass timer.
-Timer *llvm::getPassTimer(Pass *P) {
- if (TheTimeInfo)
- return TheTimeInfo->getPassTimer(P);
- return 0;
-}
-
-//===----------------------------------------------------------------------===//
-// PMStack implementation
-//
-
-// Pop Pass Manager from the stack and clear its analysis info.
-void PMStack::pop() {
-
- PMDataManager *Top = this->top();
- Top->initializeAnalysisInfo();
-
- S.pop_back();
-}
-
-// Push PM on the stack and set its top level manager.
-void PMStack::push(PMDataManager *PM) {
- assert(PM && "Unable to push. Pass Manager expected");
- assert(PM->getDepth()==0 && "Pass Manager depth set too early");
-
- if (!this->empty()) {
- assert(PM->getPassManagerType() > this->top()->getPassManagerType()
- && "pushing bad pass manager to PMStack");
- PMTopLevelManager *TPM = this->top()->getTopLevelManager();
-
- assert(TPM && "Unable to find top level manager");
- TPM->addIndirectPassManager(PM);
- PM->setTopLevelManager(TPM);
- PM->setDepth(this->top()->getDepth()+1);
- } else {
- assert((PM->getPassManagerType() == PMT_ModulePassManager
- || PM->getPassManagerType() == PMT_FunctionPassManager)
- && "pushing bad pass manager to PMStack");
- PM->setDepth(1);
- }
-
- S.push_back(PM);
-}
-
-// Dump content of the pass manager stack.
-void PMStack::dump() const {
- for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
- E = S.end(); I != E; ++I)
- dbgs() << (*I)->getAsPass()->getPassName() << ' ';
-
- if (!S.empty())
- dbgs() << '\n';
-}
-
-/// Find appropriate Module Pass Manager in the PM Stack and
-/// add self into that manager.
-void ModulePass::assignPassManager(PMStack &PMS,
- PassManagerType PreferredType) {
- // Find Module Pass Manager
- while (!PMS.empty()) {
- PassManagerType TopPMType = PMS.top()->getPassManagerType();
- if (TopPMType == PreferredType)
- break; // We found desired pass manager
- else if (TopPMType > PMT_ModulePassManager)
- PMS.pop(); // Pop children pass managers
- else
- break;
- }
- assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
- PMS.top()->add(this);
-}
-
-/// Find appropriate Function Pass Manager or Call Graph Pass Manager
-/// in the PM Stack and add self into that manager.
-void FunctionPass::assignPassManager(PMStack &PMS,
- PassManagerType PreferredType) {
-
- // Find Function Pass Manager
- while (!PMS.empty()) {
- if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
- PMS.pop();
- else
- break;
- }
-
- // Create new Function Pass Manager if needed.
- FPPassManager *FPP;
- if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
- FPP = (FPPassManager *)PMS.top();
- } else {
- assert(!PMS.empty() && "Unable to create Function Pass Manager");
- PMDataManager *PMD = PMS.top();
-
- // [1] Create new Function Pass Manager
- FPP = new FPPassManager();
- FPP->populateInheritedAnalysis(PMS);
-
- // [2] Set up new manager's top level manager
- PMTopLevelManager *TPM = PMD->getTopLevelManager();
- TPM->addIndirectPassManager(FPP);
-
- // [3] Assign manager to manage this new manager. This may create
- // and push new managers into PMS
- FPP->assignPassManager(PMS, PMD->getPassManagerType());
-
- // [4] Push new manager into PMS
- PMS.push(FPP);
- }
-
- // Assign FPP as the manager of this pass.
- FPP->add(this);
-}
-
-/// Find an appropriate BasicBlock Pass Manager in the PM Stack
-/// and add self into that manager.
-void BasicBlockPass::assignPassManager(PMStack &PMS,
- PassManagerType PreferredType) {
- BBPassManager *BBP;
-
- // Basic Pass Manager is a leaf pass manager. It does not handle
- // any other pass manager.
- if (!PMS.empty() &&
- PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
- BBP = (BBPassManager *)PMS.top();
- } else {
- // If leaf manager is not Basic Block Pass manager then create new
- // basic Block Pass manager.
- assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
- PMDataManager *PMD = PMS.top();
-
- // [1] Create new Basic Block Manager
- BBP = new BBPassManager();
-
- // [2] Set up new manager's top level manager
- // Basic Block Pass Manager does not live by itself
- PMTopLevelManager *TPM = PMD->getTopLevelManager();
- TPM->addIndirectPassManager(BBP);
-
- // [3] Assign manager to manage this new manager. This may create
- // and push new managers into PMS
- BBP->assignPassManager(PMS, PreferredType);
-
- // [4] Push new manager into PMS
- PMS.push(BBP);
- }
-
- // Assign BBP as the manager of this pass.
- BBP->add(this);
-}
-
-PassManagerBase::~PassManagerBase() {}
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 46c61fc..432cbc9 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -616,11 +616,7 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
/// getTypeByName - Return the type with the specified name, or null if there
/// is none by that name.
StructType *Module::getTypeByName(StringRef Name) const {
- StringMap<StructType*>::iterator I =
- getContext().pImpl->NamedStructTypes.find(Name);
- if (I != getContext().pImpl->NamedStructTypes.end())
- return I->second;
- return 0;
+ return getContext().pImpl->NamedStructTypes.lookup(Name);
}
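
The reason the simplification above is behavior-preserving: StringMap::lookup returns the mapped value when the key is present and a value-initialized T (a null pointer here) when it is not, so the find()/end()/dereference dance is unnecessary. A tiny illustration:

#include "llvm/ADT/StringMap.h"
using namespace llvm;

static bool lookupDemo() {
  static int Sentinel;
  StringMap<int *> Map;
  Map["present"] = &Sentinel;
  int *Hit  = Map.lookup("present");  // == &Sentinel
  int *Miss = Map.lookup("absent");   // == 0, no explicit miss check needed
  return Hit == &Sentinel && Miss == 0;
}
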
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index d5e6203..689b903 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -44,6 +44,9 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
incorporateType(FI->getType());
+ if (FI->hasPrefixData())
+ incorporateValue(FI->getPrefixData());
+
// First incorporate the arguments.
for (Function::const_arg_iterator AI = FI->arg_begin(),
AE = FI->arg_end(); AI != AE; ++AI)
@@ -91,19 +94,27 @@ void TypeFinder::clear() {
/// incorporateType - This method adds the type to the list of used structures
/// if it's not in there already.
void TypeFinder::incorporateType(Type *Ty) {
- // Check to see if we're already visited this type.
+ // Check to see if we've already visited this type.
if (!VisitedTypes.insert(Ty).second)
return;
- // If this is a structure or opaque type, add a name for the type.
- if (StructType *STy = dyn_cast<StructType>(Ty))
- if (!OnlyNamed || STy->hasName())
- StructTypes.push_back(STy);
-
- // Recursively walk all contained types.
- for (Type::subtype_iterator I = Ty->subtype_begin(),
- E = Ty->subtype_end(); I != E; ++I)
- incorporateType(*I);
+ SmallVector<Type *, 4> TypeWorklist;
+ TypeWorklist.push_back(Ty);
+ do {
+ Ty = TypeWorklist.pop_back_val();
+
+ // If this is a structure or opaque type, add a name for the type.
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ if (!OnlyNamed || STy->hasName())
+ StructTypes.push_back(STy);
+
+ // Add all unvisited subtypes to worklist for processing
+ for (Type::subtype_reverse_iterator I = Ty->subtype_rbegin(),
+ E = Ty->subtype_rend();
+ I != E; ++I)
+ if (VisitedTypes.insert(*I).second)
+ TypeWorklist.push_back(*I);
+ } while (!TypeWorklist.empty());
}
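
The recursion-to-worklist rewrite above follows a standard shape: a node is marked visited when it is pushed, not when it is popped, so nothing is queued twice and deep type graphs cannot overflow the call stack. A generic sketch of that pattern:

#include <cstddef>
#include <set>
#include <vector>

struct Node { std::vector<Node *> Subs; };      // stand-in for llvm::Type

static void visitAll(Node *Root, std::set<Node *> &Visited) {
  std::vector<Node *> Worklist;
  if (Visited.insert(Root).second)
    Worklist.push_back(Root);
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    // ... process N here ...
    for (size_t i = N->Subs.size(); i-- != 0;)  // reverse, like subtype_rbegin
      if (Visited.insert(N->Subs[i]).second)
        Worklist.push_back(N->Subs[i]);
  }
}
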
/// incorporateValue - This method is used to walk operand lists finding types
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 81d7efa..62a3b31 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -365,7 +365,8 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
break;
}
V = GEP->getPointerOperand();
- } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+ } else if (Operator::getOpcode(V) == Instruction::BitCast ||
+ Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (StripKind == PSK_ZeroIndices || GA->mayBeOverridden())
@@ -393,6 +394,42 @@ Value *Value::stripInBoundsConstantOffsets() {
return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
}
+Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+ APInt &Offset) {
+ if (!getType()->isPointerTy())
+ return this;
+
+ assert(Offset.getBitWidth() == DL.getPointerSizeInBits(cast<PointerType>(
+ getType())->getAddressSpace()) &&
+ "The offset must have exactly as many bits as our pointer.");
+
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(this);
+ Value *V = this;
+ do {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ if (!GEP->isInBounds())
+ return V;
+ APInt GEPOffset(Offset);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+ return V;
+ Offset = GEPOffset;
+ V = GEP->getPointerOperand();
+ } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+ V = cast<Operator>(V)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+ V = GA->getAliasee();
+ } else {
+ return V;
+ }
+ assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(V));
+
+ return V;
+}
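
A sketch of how a client might call the new helper: size the APInt to the pointer's width (as the assertion demands), strip the in-bounds GEPs, bitcasts and aliases, and read back the accumulated byte offset. The wrapper name is illustrative only.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static Value *baseAndOffset(Value *Ptr, const DataLayout &DL, int64_t &Bytes) {
  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
  APInt Offset(DL.getPointerSizeInBits(AS), 0);   // width must match pointer
  Value *Base = Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
  Bytes = Offset.getSExtValue();                  // constant byte offset
  return Base;                                    // underlying object
}
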
+
Value *Value::stripInBoundsOffsets() {
return stripPointerCastsAndOffsets<PSK_InBounds>(this);
}
@@ -698,9 +735,5 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
#endif
}
-// Default implementation for CallbackVH.
-void CallbackVH::allUsesReplacedWith(Value *) {}
-
-void CallbackVH::deleted() {
- setValPtr(NULL);
-}
+// Pin the vtable to this file.
+void CallbackVH::anchor() {}
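
For readers unfamiliar with the idiom adopted above: an "anchor" is a deliberately trivial out-of-line virtual method whose only job is to give the class a key function, so the vtable and type info are emitted in one translation unit instead of every file that includes the header. A minimal sketch with a hypothetical Widget class:

// Widget.h (hypothetical)
struct Widget {
  virtual ~Widget() {}
  virtual void anchor();             // declared but deliberately not inline
  virtual int size() const { return 0; }
};

// Widget.cpp (hypothetical)
void Widget::anchor() {}             // key function: vtable emitted here only
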
diff --git a/lib/IR/ValueTypes.cpp b/lib/IR/ValueTypes.cpp
index ba04d60..2d4da95 100644
--- a/lib/IR/ValueTypes.cpp
+++ b/lib/IR/ValueTypes.cpp
@@ -134,6 +134,7 @@ std::string EVT::getEVTString() const {
case MVT::v16i1: return "v16i1";
case MVT::v32i1: return "v32i1";
case MVT::v64i1: return "v64i1";
+ case MVT::v1i8: return "v1i8";
case MVT::v2i8: return "v2i8";
case MVT::v4i8: return "v4i8";
case MVT::v8i8: return "v8i8";
@@ -156,11 +157,15 @@ std::string EVT::getEVTString() const {
case MVT::v4i64: return "v4i64";
case MVT::v8i64: return "v8i64";
case MVT::v16i64: return "v16i64";
+ case MVT::v1f32: return "v1f32";
case MVT::v2f32: return "v2f32";
case MVT::v2f16: return "v2f16";
+ case MVT::v4f16: return "v4f16";
+ case MVT::v8f16: return "v8f16";
case MVT::v4f32: return "v4f32";
case MVT::v8f32: return "v8f32";
case MVT::v16f32: return "v16f32";
+ case MVT::v1f64: return "v1f64";
case MVT::v2f64: return "v2f64";
case MVT::v4f64: return "v4f64";
case MVT::v8f64: return "v8f64";
@@ -197,6 +202,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v16i1: return VectorType::get(Type::getInt1Ty(Context), 16);
case MVT::v32i1: return VectorType::get(Type::getInt1Ty(Context), 32);
case MVT::v64i1: return VectorType::get(Type::getInt1Ty(Context), 64);
+ case MVT::v1i8: return VectorType::get(Type::getInt8Ty(Context), 1);
case MVT::v2i8: return VectorType::get(Type::getInt8Ty(Context), 2);
case MVT::v4i8: return VectorType::get(Type::getInt8Ty(Context), 4);
case MVT::v8i8: return VectorType::get(Type::getInt8Ty(Context), 8);
@@ -220,10 +226,14 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v8i64: return VectorType::get(Type::getInt64Ty(Context), 8);
case MVT::v16i64: return VectorType::get(Type::getInt64Ty(Context), 16);
case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2);
+ case MVT::v4f16: return VectorType::get(Type::getHalfTy(Context), 4);
+ case MVT::v8f16: return VectorType::get(Type::getHalfTy(Context), 8);
+ case MVT::v1f32: return VectorType::get(Type::getFloatTy(Context), 1);
case MVT::v2f32: return VectorType::get(Type::getFloatTy(Context), 2);
case MVT::v4f32: return VectorType::get(Type::getFloatTy(Context), 4);
case MVT::v8f32: return VectorType::get(Type::getFloatTy(Context), 8);
case MVT::v16f32: return VectorType::get(Type::getFloatTy(Context), 16);
+ case MVT::v1f64: return VectorType::get(Type::getDoubleTy(Context), 1);
case MVT::v2f64: return VectorType::get(Type::getDoubleTy(Context), 2);
case MVT::v4f64: return VectorType::get(Type::getDoubleTy(Context), 4);
case MVT::v8f64: return VectorType::get(Type::getDoubleTy(Context), 8);
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 0eda97f..da6b573 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -78,7 +78,7 @@
using namespace llvm;
static cl::opt<bool> DisableDebugInfoVerifier("disable-debug-info-verifier",
- cl::init(false));
+ cl::init(true));
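
For readers tracking the default flip above: cl::init supplies the value used when the option is absent from the command line, and passing -disable-debug-info-verifier=false still re-enables the checks. A small sketch of the same flag shape, with a hypothetical option name:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<bool> MyFlag("my-flag",                 // hypothetical option
                            cl::desc("Example boolean flag"),
                            cl::init(true));           // default when absent

bool myFlagValue() { return MyFlag; }                  // implicit bool access
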
namespace { // Anonymous namespace for class
struct PreVerifier : public FunctionPass {
@@ -167,7 +167,6 @@ namespace {
bool doInitialization(Module &M) {
Mod = &M;
Context = &M.getContext();
- Finder.reset();
DL = getAnalysisIfAvailable<DataLayout>();
@@ -183,10 +182,15 @@ namespace {
Mod = F.getParent();
if (!Context) Context = &F.getContext();
+ Finder.reset();
visit(F);
InstsInThisBlock.clear();
PersonalityFn = 0;
+ if (!DisableDebugInfoVerifier)
+ // Verify Debug Info.
+ verifyDebugInfo();
+
// We must abort before returning back to the pass manager, or else the
// pass manager may try to run other passes on the broken module.
return abortIfBroken();
@@ -214,9 +218,14 @@ namespace {
visitNamedMDNode(*I);
visitModuleFlags(M);
+ visitModuleIdents(M);
- // Verify Debug Info.
- verifyDebugInfo(M);
+ if (!DisableDebugInfoVerifier) {
+ Finder.reset();
+ Finder.processModule(M);
+ // Verify Debug Info.
+ verifyDebugInfo();
+ }
// If the module is broken, abort at this time.
return abortIfBroken();
@@ -258,6 +267,7 @@ namespace {
void visitGlobalAlias(GlobalAlias &GA);
void visitNamedMDNode(NamedMDNode &NMD);
void visitMDNode(MDNode &MD, Function *F);
+ void visitModuleIdents(Module &M);
void visitModuleFlags(Module &M);
void visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*> &SeenIDs,
SmallVectorImpl<MDNode*> &Requirements);
@@ -279,6 +289,7 @@ namespace {
void visitIntToPtrInst(IntToPtrInst &I);
void visitPtrToIntInst(PtrToIntInst &I);
void visitBitCastInst(BitCastInst &I);
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
void visitPHINode(PHINode &PN);
void visitBinaryOperator(BinaryOperator &B);
void visitICmpInst(ICmpInst &IC);
@@ -317,6 +328,8 @@ namespace {
bool VerifyIntrinsicType(Type *Ty,
ArrayRef<Intrinsic::IITDescriptor> &Infos,
SmallVectorImpl<Type*> &ArgTys);
+ bool VerifyIntrinsicIsVarArg(bool isVarArg,
+ ArrayRef<Intrinsic::IITDescriptor> &Infos);
bool VerifyAttributeCount(AttributeSet Attrs, unsigned Params);
void VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
bool isFunction, const Value *V);
@@ -328,7 +341,7 @@ namespace {
void VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy);
void VerifyConstantExprBitcastType(const ConstantExpr *CE);
- void verifyDebugInfo(Module &M);
+ void verifyDebugInfo();
void WriteValue(const Value *V) {
if (!V) return;
@@ -427,10 +440,6 @@ void Verifier::visitGlobalValue(GlobalValue &GV) {
Assert1(GVar && GVar->getType()->getElementType()->isArrayTy(),
"Only global arrays can have appending linkage!", GVar);
}
-
- Assert1(!GV.hasLinkOnceODRAutoHideLinkage() || GV.hasDefaultVisibility(),
- "linkonce_odr_auto_hide can only have default visibility!",
- &GV);
}
void Verifier::visitGlobalVariable(GlobalVariable &GV) {
@@ -527,8 +536,7 @@ void Verifier::visitGlobalVariable(GlobalVariable &GV) {
void Verifier::visitGlobalAlias(GlobalAlias &GA) {
Assert1(!GA.getName().empty(),
"Alias name cannot be empty!", &GA);
- Assert1(GA.hasExternalLinkage() || GA.hasLocalLinkage() ||
- GA.hasWeakLinkage(),
+ Assert1(GlobalAlias::isValidLinkage(GA.getLinkage()),
"Alias should have external or external weak linkage!", &GA);
Assert1(GA.getAliasee(),
"Aliasee cannot be NULL!", &GA);
@@ -612,6 +620,24 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
}
}
+void Verifier::visitModuleIdents(Module &M) {
+ const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident");
+ if (!Idents)
+ return;
+
+  // llvm.ident takes a list of metadata entries. Each entry has exactly one string.
+ // Scan each llvm.ident entry and make sure that this requirement is met.
+ for (unsigned i = 0, e = Idents->getNumOperands(); i != e; ++i) {
+ const MDNode *N = Idents->getOperand(i);
+ Assert1(N->getNumOperands() == 1,
+ "incorrect number of operands in llvm.ident metadata", N);
+ Assert1(isa<MDString>(N->getOperand(0)),
+ ("invalid value for llvm.ident metadata entry operand"
+ "(the operand should be a string)"),
+ N->getOperand(0));
+ }
+}
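
For reference, a minimal producer-side sketch (not part of this patch) of metadata that satisfies the new check: each operand of llvm.ident must be an MDNode wrapping exactly one MDString. The helper name and producer string are illustrative only.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

// Hypothetical helper: record a producer string in the form visitModuleIdents expects.
static void addIdentString(llvm::Module &M, llvm::StringRef Producer) {
  llvm::LLVMContext &Ctx = M.getContext();
  llvm::NamedMDNode *Idents = M.getOrInsertNamedMetadata("llvm.ident");
  llvm::Value *Ops[] = { llvm::MDString::get(Ctx, Producer) }; // exactly one string
  Idents->addOperand(llvm::MDNode::get(Ctx, Ops));             // one MDNode per entry
}
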
+
void Verifier::visitModuleFlags(Module &M) {
const NamedMDNode *Flags = M.getModuleFlagsMetadata();
if (!Flags) return;
@@ -751,7 +777,8 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
I->getKindAsEnum() == Attribute::NoDuplicate ||
I->getKindAsEnum() == Attribute::Builtin ||
I->getKindAsEnum() == Attribute::NoBuiltin ||
- I->getKindAsEnum() == Attribute::Cold) {
+ I->getKindAsEnum() == Attribute::Cold ||
+ I->getKindAsEnum() == Attribute::OptimizeNone) {
if (!isFunction) {
CheckFailed("Attribute '" + I->getAsString() +
"' only applies to functions!", V);
@@ -897,6 +924,21 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
Attrs.hasAttribute(AttributeSet::FunctionIndex,
Attribute::AlwaysInline)),
"Attributes 'noinline and alwaysinline' are incompatible!", V);
+
+ if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeNone)) {
+ Assert1(Attrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoInline),
+ "Attribute 'optnone' requires 'noinline'!", V);
+
+ Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize),
+ "Attributes 'optsize and optnone' are incompatible!", V);
+
+ Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize),
+ "Attributes 'minsize and optnone' are incompatible!", V);
+ }
}
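
As an illustration of the attribute combination these checks enforce, a small sketch (hypothetical helper, not part of this patch) that marks a function in a way the verifier accepts:

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

// Hypothetical helper: 'optnone' must be paired with 'noinline' and must not be
// combined with 'optsize' or 'minsize', so only these two attributes are added.
static void markOptNone(llvm::Function &F) {
  F.addFnAttr(llvm::Attribute::OptimizeNone);
  F.addFnAttr(llvm::Attribute::NoInline);
}
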
void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
@@ -930,11 +972,9 @@ void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
unsigned SrcAS = SrcTy->getPointerAddressSpace();
unsigned DstAS = DestTy->getPointerAddressSpace();
- unsigned SrcASSize = DL->getPointerSizeInBits(SrcAS);
- unsigned DstASSize = DL->getPointerSizeInBits(DstAS);
- Assert1(SrcASSize == DstASSize,
- "Bitcasts between pointers of different address spaces must have "
- "the same size pointers, otherwise use PtrToInt/IntToPtr.", V);
+ Assert1(SrcAS == DstAS,
+ "Bitcasts between pointers of different address spaces is not legal."
+ "Use AddrSpaceCast instead.", V);
}
void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) {
@@ -1152,27 +1192,12 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
// Check to make sure that all of the constants in the switch instruction
// have the same type as the switched-on value.
Type *SwitchTy = SI.getCondition()->getType();
- IntegerType *IntTy = cast<IntegerType>(SwitchTy);
- IntegersSubsetToBB Mapping;
- std::map<IntegersSubset::Range, unsigned> RangeSetMap;
+ SmallPtrSet<ConstantInt*, 32> Constants;
for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
- IntegersSubset CaseRanges = i.getCaseValueEx();
- for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) {
- IntegersSubset::Range r = CaseRanges.getItem(ri);
- Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(),
- "Switch constants must all be same type as switch value!", &SI);
- Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(),
- "Switch constants must all be same type as switch value!", &SI);
- Mapping.add(r);
- RangeSetMap[r] = i.getCaseIndex();
- }
- }
-
- IntegersSubsetToBB::RangeIterator errItem;
- if (!Mapping.verify(errItem)) {
- unsigned CaseIndex = RangeSetMap[errItem->first];
- SwitchInst::CaseIt i(&SI, CaseIndex);
- Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx());
+ Assert1(i.getCaseValue()->getType() == SwitchTy,
+ "Switch constants must all be same type as switch value!", &SI);
+ Assert2(Constants.insert(i.getCaseValue()),
+ "Duplicate integer as switch case", &SI, i.getCaseValue());
}
visitTerminatorInst(SI);
@@ -1435,6 +1460,22 @@ void Verifier::visitBitCastInst(BitCastInst &I) {
visitInstruction(I);
}
+void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
+
+ Assert1(SrcTy->isPtrOrPtrVectorTy(),
+ "AddrSpaceCast source must be a pointer", &I);
+ Assert1(DestTy->isPtrOrPtrVectorTy(),
+ "AddrSpaceCast result must be a pointer", &I);
+ Assert1(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(),
+ "AddrSpaceCast must be between different address spaces", &I);
+ if (SrcTy->isVectorTy())
+ Assert1(SrcTy->getVectorNumElements() == DestTy->getVectorNumElements(),
+ "AddrSpaceCast vector pointer number of elements mismatch", &I);
+ visitInstruction(I);
+}
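
For illustration, a sketch (not part of this patch) of construction code that the new checks accept; the helper name is hypothetical and it expects a scalar pointer operand:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"

// Hypothetical helper: cast Ptr into address space 1. The pointee type is kept,
// and the source and destination address spaces differ, so the rules above hold.
static llvm::Instruction *castToAddrSpace1(llvm::Value *Ptr,
                                           llvm::Instruction *InsertPt) {
  llvm::Type *EltTy = Ptr->getType()->getPointerElementType();
  llvm::Type *DstTy = llvm::PointerType::get(EltTy, /*AddressSpace=*/1);
  return llvm::CastInst::Create(llvm::Instruction::AddrSpaceCast,
                                Ptr, DstTy, "ptr.as1", InsertPt);
}
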
+
/// visitPHINode - Ensure that a PHI node is well formed.
///
void Verifier::visitPHINode(PHINode &PN) {
@@ -1537,14 +1578,6 @@ void Verifier::VerifyCallSite(CallSite CS) {
"Function has metadata parameter but isn't an intrinsic", I);
}
- // If the call site has the 'builtin' attribute, verify that it's applied to a
- // direct call to a function with the 'nobuiltin' attribute.
- if (CS.hasFnAttr(Attribute::Builtin))
- Assert1(CS.getCalledFunction() &&
- CS.getCalledFunction()->hasFnAttribute(Attribute::NoBuiltin),
- "Attribute 'builtin' can only be used in a call to a function with "
- "the 'nobuiltin' attribute.", I);
-
visitInstruction(*I);
}
@@ -2098,7 +2131,7 @@ void Verifier::visitInstruction(Instruction &I) {
if (!DisableDebugInfoVerifier) {
MD = I.getMetadata(LLVMContext::MD_dbg);
- Finder.processLocation(DILocation(MD));
+ Finder.processLocation(*Mod, DILocation(MD));
}
InstsInThisBlock.insert(&I);
@@ -2121,6 +2154,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
switch (D.Kind) {
case IITDescriptor::Void: return !Ty->isVoidTy();
+ case IITDescriptor::VarArg: return true;
case IITDescriptor::MMX: return !Ty->isX86_MMXTy();
case IITDescriptor::Metadata: return !Ty->isMetadataTy();
case IITDescriptor::Half: return !Ty->isHalfTy();
@@ -2185,6 +2219,33 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
llvm_unreachable("unhandled");
}
+/// \brief Verify if the intrinsic has variable arguments.
+/// This method is intended to be called after all the fixed arguments have been
+/// verified.
+///
+/// This method returns true on error and does not print an error message.
+bool
+Verifier::VerifyIntrinsicIsVarArg(bool isVarArg,
+ ArrayRef<Intrinsic::IITDescriptor> &Infos) {
+ using namespace Intrinsic;
+
+ // If there are no descriptors left, then it can't be a vararg.
+ if (Infos.empty())
+    return isVarArg;
+
+ // There should be only one descriptor remaining at this point.
+ if (Infos.size() != 1)
+ return true;
+
+ // Check and verify the descriptor.
+ IITDescriptor D = Infos.front();
+ Infos = Infos.slice(1);
+ if (D.Kind == IITDescriptor::VarArg)
+    return !isVarArg;
+
+ return true;
+}
+
/// visitIntrinsicFunction - Allow intrinsics to be verified in different ways.
///
void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
@@ -2195,7 +2256,7 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
// Verify that the intrinsic prototype lines up with what the .td files
// describe.
FunctionType *IFTy = IF->getFunctionType();
- Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF);
+ bool IsVarArg = IFTy->isVarArg();
SmallVector<Intrinsic::IITDescriptor, 8> Table;
getIntrinsicInfoTableEntries(ID, Table);
@@ -2207,6 +2268,16 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i)
Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys),
"Intrinsic has incorrect argument type!", IF);
+
+ // Verify if the intrinsic call matches the vararg property.
+ if (IsVarArg)
+ Assert1(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+ "Intrinsic was not defined with variable arguments!", IF);
+ else
+ Assert1(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+ "Callsite was not defined with variable arguments!", IF);
+
+ // All descriptors should be absorbed by now.
Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF);
// Now that we have the intrinsic ID and the actual argument types (and we
@@ -2238,13 +2309,13 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
Assert1(MD->getNumOperands() == 1,
"invalid llvm.dbg.declare intrinsic call 2", &CI);
if (!DisableDebugInfoVerifier)
- Finder.processDeclare(cast<DbgDeclareInst>(&CI));
+ Finder.processDeclare(*Mod, cast<DbgDeclareInst>(&CI));
} break;
case Intrinsic::dbg_value: { //llvm.dbg.value
if (!DisableDebugInfoVerifier) {
Assert1(CI.getArgOperand(0) && isa<MDNode>(CI.getArgOperand(0)),
"invalid llvm.dbg.value intrinsic call 1", &CI);
- Finder.processValue(cast<DbgValueInst>(&CI));
+ Finder.processValue(*Mod, cast<DbgValueInst>(&CI));
}
break;
}
@@ -2309,11 +2380,9 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
}
}
-void Verifier::verifyDebugInfo(Module &M) {
+void Verifier::verifyDebugInfo() {
// Verify Debug Info.
if (!DisableDebugInfoVerifier) {
- Finder.processModule(M);
-
for (DebugInfoFinder::iterator I = Finder.compile_unit_begin(),
E = Finder.compile_unit_end(); I != E; ++I)
Assert1(DICompileUnit(*I).Verify(), "DICompileUnit does not Verify!", *I);
@@ -2352,6 +2421,7 @@ bool llvm::verifyFunction(const Function &f, VerifierFailureAction action) {
FunctionPassManager FPM(F.getParent());
Verifier *V = new Verifier(action);
FPM.add(V);
+ FPM.doInitialization();
FPM.run(F);
return V->Broken;
}
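
A caller-side sketch (not part of this patch) showing the public entry points this function and verifyModule expose; F and M are assumed to exist:

#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

// Hypothetical helper: run the verifier, printing diagnostics instead of aborting.
// Both entry points return true if the IR is broken.
static bool checkIR(const llvm::Function &F, const llvm::Module &M) {
  bool BadFn  = llvm::verifyFunction(F, llvm::PrintMessageAction);
  bool BadMod = llvm::verifyModule(M, llvm::PrintMessageAction);
  return BadFn || BadMod;
}
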
diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp
index 656fe18..935e81d 100644
--- a/lib/IRReader/IRReader.cpp
+++ b/lib/IRReader/IRReader.cpp
@@ -11,10 +11,15 @@
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Assembly/Parser.h"
#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/system_error.h"
#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/IRReader.h"
using namespace llvm;
@@ -87,3 +92,30 @@ Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
return ParseIR(File.take(), Err, Context);
}
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef,
+ LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM,
+ char **OutMessage) {
+ SMDiagnostic Diag;
+
+ *OutM = wrap(ParseIR(unwrap(MemBuf), Diag, *unwrap(ContextRef)));
+
+  if (!*OutM) {
+ if (OutMessage) {
+ std::string buf;
+ raw_string_ostream os(buf);
+
+ Diag.print(NULL, os, false);
+ os.flush();
+
+ *OutMessage = strdup(buf.c_str());
+ }
+ return 1;
+ }
+
+ return 0;
+}
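
A minimal client sketch (not part of this patch) for the new C API entry point; the input file name is illustrative. The call consumes the memory buffer, and on error the message string comes from strdup() above, so plain free() is appropriate:

#include <cstdio>
#include <cstdlib>
#include "llvm-c/Core.h"
#include "llvm-c/IRReader.h"

int main() {
  char *Msg = 0;
  LLVMMemoryBufferRef Buf;
  if (LLVMCreateMemoryBufferWithContentsOfFile("input.ll", &Buf, &Msg)) {
    std::fprintf(stderr, "cannot open file: %s\n", Msg);
    LLVMDisposeMessage(Msg);
    return 1;
  }

  LLVMContextRef Ctx = LLVMContextCreate();
  LLVMModuleRef Mod;
  if (LLVMParseIRInContext(Ctx, Buf, &Mod, &Msg)) { // Buf is consumed here
    std::fprintf(stderr, "parse error: %s\n", Msg);
    std::free(Msg);
    LLVMContextDispose(Ctx);
    return 1;
  }

  LLVMDumpModule(Mod);
  LLVMDisposeModule(Mod);
  LLVMContextDispose(Ctx);
  return 0;
}
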
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index ff288bc..00280c8 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = Analysis AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker IR IRReader MC Object Option Support TableGen Target Transforms
+subdirectories = Analysis AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker IR IRReader LTO MC Object Option Support TableGen Target Transforms
[component_0]
type = Group
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
new file mode 100644
index 0000000..8e00bcb
--- /dev/null
+++ b/lib/LTO/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_llvm_library(LLVMLTO
+ LTOModule.cpp
+ LTOCodeGenerator.cpp
+ )
diff --git a/lib/LTO/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
new file mode 100644
index 0000000..38c1170
--- /dev/null
+++ b/lib/LTO/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/LTO/LLVMBuild.txt ----------------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = LTO
+parent = Libraries
+required_libraries = Analysis BitReader BitWriter Core IPO Linker MC MCParser Scalar Support Target Vectorize \ No newline at end of file
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
new file mode 100644
index 0000000..2b3648e
--- /dev/null
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -0,0 +1,521 @@
+//===-LTOCodeGenerator.cpp - LLVM Link Time Optimizer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Link Time Optimization library. This library is
+// intended to be used by a linker to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Linker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/ObjCARC.h"
+using namespace llvm;
+
+const char* LTOCodeGenerator::getVersionString() {
+#ifdef LLVM_VERSION_INFO
+ return PACKAGE_NAME " version " PACKAGE_VERSION ", " LLVM_VERSION_INFO;
+#else
+ return PACKAGE_NAME " version " PACKAGE_VERSION;
+#endif
+}
+
+LTOCodeGenerator::LTOCodeGenerator()
+ : Context(getGlobalContext()), Linker(new Module("ld-temp.o", Context)),
+ TargetMach(NULL), EmitDwarfDebugInfo(false), ScopeRestrictionsDone(false),
+ CodeModel(LTO_CODEGEN_PIC_MODEL_DYNAMIC), NativeObjectFile(NULL) {
+ initializeLTOPasses();
+}
+
+LTOCodeGenerator::~LTOCodeGenerator() {
+ delete TargetMach;
+ delete NativeObjectFile;
+ TargetMach = NULL;
+ NativeObjectFile = NULL;
+
+ Linker.deleteModule();
+
+ for (std::vector<char *>::iterator I = CodegenOptions.begin(),
+ E = CodegenOptions.end();
+ I != E; ++I)
+ free(*I);
+}
+
+// Initialize LTO passes. Please keep this function in sync with
+// PassManagerBuilder::populateLTOPassManager(), and make sure all LTO
+// passes are initialized.
+//
+void LTOCodeGenerator::initializeLTOPasses() {
+ PassRegistry &R = *PassRegistry::getPassRegistry();
+
+ initializeInternalizePassPass(R);
+ initializeIPSCCPPass(R);
+ initializeGlobalOptPass(R);
+ initializeConstantMergePass(R);
+ initializeDAHPass(R);
+ initializeInstCombinerPass(R);
+ initializeSimpleInlinerPass(R);
+ initializePruneEHPass(R);
+ initializeGlobalDCEPass(R);
+ initializeArgPromotionPass(R);
+ initializeJumpThreadingPass(R);
+ initializeSROAPass(R);
+ initializeSROA_DTPass(R);
+ initializeSROA_SSAUpPass(R);
+ initializeFunctionAttrsPass(R);
+ initializeGlobalsModRefPass(R);
+ initializeLICMPass(R);
+ initializeGVNPass(R);
+ initializeMemCpyOptPass(R);
+ initializeDCEPass(R);
+ initializeCFGSimplifyPassPass(R);
+}
+
+bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
+ bool ret = Linker.linkInModule(mod->getLLVVMModule(), &errMsg);
+
+ const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs();
+ for (int i = 0, e = undefs.size(); i != e; ++i)
+ AsmUndefinedRefs[undefs[i]] = 1;
+
+ return !ret;
+}
+
+void LTOCodeGenerator::setTargetOptions(TargetOptions options) {
+ Options.LessPreciseFPMADOption = options.LessPreciseFPMADOption;
+ Options.NoFramePointerElim = options.NoFramePointerElim;
+ Options.AllowFPOpFusion = options.AllowFPOpFusion;
+ Options.UnsafeFPMath = options.UnsafeFPMath;
+ Options.NoInfsFPMath = options.NoInfsFPMath;
+ Options.NoNaNsFPMath = options.NoNaNsFPMath;
+ Options.HonorSignDependentRoundingFPMathOption =
+ options.HonorSignDependentRoundingFPMathOption;
+ Options.UseSoftFloat = options.UseSoftFloat;
+ Options.FloatABIType = options.FloatABIType;
+ Options.NoZerosInBSS = options.NoZerosInBSS;
+ Options.GuaranteedTailCallOpt = options.GuaranteedTailCallOpt;
+ Options.DisableTailCalls = options.DisableTailCalls;
+ Options.StackAlignmentOverride = options.StackAlignmentOverride;
+ Options.TrapFuncName = options.TrapFuncName;
+ Options.PositionIndependentExecutable = options.PositionIndependentExecutable;
+ Options.EnableSegmentedStacks = options.EnableSegmentedStacks;
+ Options.UseInitArray = options.UseInitArray;
+}
+
+void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) {
+ switch (debug) {
+ case LTO_DEBUG_MODEL_NONE:
+ EmitDwarfDebugInfo = false;
+ return;
+
+ case LTO_DEBUG_MODEL_DWARF:
+ EmitDwarfDebugInfo = true;
+ return;
+ }
+ llvm_unreachable("Unknown debug format!");
+}
+
+void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) {
+ switch (model) {
+ case LTO_CODEGEN_PIC_MODEL_STATIC:
+ case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+ case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+ CodeModel = model;
+ return;
+ }
+ llvm_unreachable("Unknown PIC model!");
+}
+
+bool LTOCodeGenerator::writeMergedModules(const char *path,
+ std::string &errMsg) {
+ if (!determineTarget(errMsg))
+ return false;
+
+ // mark which symbols can not be internalized
+ applyScopeRestrictions();
+
+ // create output file
+ std::string ErrInfo;
+ tool_output_file Out(path, ErrInfo, sys::fs::F_Binary);
+ if (!ErrInfo.empty()) {
+ errMsg = "could not open bitcode file for writing: ";
+ errMsg += path;
+ return false;
+ }
+
+ // write bitcode to it
+ WriteBitcodeToFile(Linker.getModule(), Out.os());
+ Out.os().close();
+
+ if (Out.os().has_error()) {
+ errMsg = "could not write bitcode file: ";
+ errMsg += path;
+ Out.os().clear_error();
+ return false;
+ }
+
+ Out.keep();
+ return true;
+}
+
+bool LTOCodeGenerator::compile_to_file(const char** name,
+ bool disableOpt,
+ bool disableInline,
+ bool disableGVNLoadPRE,
+ std::string& errMsg) {
+ // make unique temp .o file to put generated object file
+ SmallString<128> Filename;
+ int FD;
+ error_code EC = sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
+ if (EC) {
+ errMsg = EC.message();
+ return false;
+ }
+
+ // generate object file
+ tool_output_file objFile(Filename.c_str(), FD);
+
+ bool genResult = generateObjectFile(objFile.os(), disableOpt, disableInline,
+ disableGVNLoadPRE, errMsg);
+ objFile.os().close();
+ if (objFile.os().has_error()) {
+ objFile.os().clear_error();
+ sys::fs::remove(Twine(Filename));
+ return false;
+ }
+
+ objFile.keep();
+ if (!genResult) {
+ sys::fs::remove(Twine(Filename));
+ return false;
+ }
+
+ NativeObjectPath = Filename.c_str();
+ *name = NativeObjectPath.c_str();
+ return true;
+}
+
+const void* LTOCodeGenerator::compile(size_t* length,
+ bool disableOpt,
+ bool disableInline,
+ bool disableGVNLoadPRE,
+ std::string& errMsg) {
+ const char *name;
+ if (!compile_to_file(&name, disableOpt, disableInline, disableGVNLoadPRE,
+ errMsg))
+ return NULL;
+
+ // remove old buffer if compile() called twice
+ delete NativeObjectFile;
+
+ // read .o file into memory buffer
+ OwningPtr<MemoryBuffer> BuffPtr;
+ if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
+ errMsg = ec.message();
+ sys::fs::remove(NativeObjectPath);
+ return NULL;
+ }
+ NativeObjectFile = BuffPtr.take();
+
+ // remove temp files
+ sys::fs::remove(NativeObjectPath);
+
+ // return buffer, unless error
+ if (NativeObjectFile == NULL)
+ return NULL;
+ *length = NativeObjectFile->getBufferSize();
+ return NativeObjectFile->getBufferStart();
+}
+
+bool LTOCodeGenerator::determineTarget(std::string &errMsg) {
+ if (TargetMach != NULL)
+ return true;
+
+ std::string TripleStr = Linker.getModule()->getTargetTriple();
+ if (TripleStr.empty())
+ TripleStr = sys::getDefaultTargetTriple();
+ llvm::Triple Triple(TripleStr);
+
+ // create target machine from info for merged modules
+ const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
+ if (march == NULL)
+ return false;
+
+ // The relocation model is actually a static member of TargetMachine and
+ // needs to be set before the TargetMachine is instantiated.
+ Reloc::Model RelocModel = Reloc::Default;
+ switch (CodeModel) {
+ case LTO_CODEGEN_PIC_MODEL_STATIC:
+ RelocModel = Reloc::Static;
+ break;
+ case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+ RelocModel = Reloc::PIC_;
+ break;
+ case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+ RelocModel = Reloc::DynamicNoPIC;
+ break;
+ }
+
+  // Compute subtarget features and create the target machine for the merged module.
+ SubtargetFeatures Features;
+ Features.getDefaultSubtargetFeatures(Triple);
+ std::string FeatureStr = Features.getString();
+ // Set a default CPU for Darwin triples.
+ if (MCpu.empty() && Triple.isOSDarwin()) {
+ if (Triple.getArch() == llvm::Triple::x86_64)
+ MCpu = "core2";
+ else if (Triple.getArch() == llvm::Triple::x86)
+ MCpu = "yonah";
+ }
+
+ TargetMach = march->createTargetMachine(TripleStr, MCpu, FeatureStr, Options,
+ RelocModel, CodeModel::Default,
+ CodeGenOpt::Aggressive);
+ return true;
+}
+
+void LTOCodeGenerator::
+applyRestriction(GlobalValue &GV,
+ const ArrayRef<StringRef> &Libcalls,
+ std::vector<const char*> &MustPreserveList,
+ SmallPtrSet<GlobalValue*, 8> &AsmUsed,
+ Mangler &Mangler) {
+ SmallString<64> Buffer;
+ Mangler.getNameWithPrefix(Buffer, &GV, false);
+
+ if (GV.isDeclaration())
+ return;
+ if (MustPreserveSymbols.count(Buffer))
+ MustPreserveList.push_back(GV.getName().data());
+ if (AsmUndefinedRefs.count(Buffer))
+ AsmUsed.insert(&GV);
+
+ // Conservatively append user-supplied runtime library functions to
+ // llvm.compiler.used. These could be internalized and deleted by
+ // optimizations like -globalopt, causing problems when later optimizations
+ // add new library calls (e.g., llvm.memset => memset and printf => puts).
+ // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
+ if (isa<Function>(GV) &&
+ std::binary_search(Libcalls.begin(), Libcalls.end(), GV.getName()))
+ AsmUsed.insert(&GV);
+}
+
+static void findUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSet<GlobalValue*, 8> &UsedValues) {
+ if (LLVMUsed == 0) return;
+
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+ if (GlobalValue *GV =
+ dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+ UsedValues.insert(GV);
+}
+
+static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls,
+ const TargetLibraryInfo& TLI,
+ const TargetLowering *Lowering)
+{
+ // TargetLibraryInfo has info on C runtime library calls on the current
+ // target.
+ for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
+ I != E; ++I) {
+ LibFunc::Func F = static_cast<LibFunc::Func>(I);
+ if (TLI.has(F))
+ Libcalls.push_back(TLI.getName(F));
+ }
+
+ // TargetLowering has info on library calls that CodeGen expects to be
+ // available, both from the C runtime and compiler-rt.
+ if (Lowering)
+ for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
+ I != E; ++I)
+ if (const char *Name
+ = Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
+ Libcalls.push_back(Name);
+
+ array_pod_sort(Libcalls.begin(), Libcalls.end());
+ Libcalls.erase(std::unique(Libcalls.begin(), Libcalls.end()),
+ Libcalls.end());
+}
+
+void LTOCodeGenerator::applyScopeRestrictions() {
+ if (ScopeRestrictionsDone)
+ return;
+ Module *mergedModule = Linker.getModule();
+
+ // Start off with a verification pass.
+ PassManager passes;
+ passes.add(createVerifierPass());
+
+ // mark which symbols can not be internalized
+ Mangler Mangler(TargetMach);
+ std::vector<const char*> MustPreserveList;
+ SmallPtrSet<GlobalValue*, 8> AsmUsed;
+ std::vector<StringRef> Libcalls;
+ TargetLibraryInfo TLI(Triple(TargetMach->getTargetTriple()));
+ accumulateAndSortLibcalls(Libcalls, TLI, TargetMach->getTargetLowering());
+
+ for (Module::iterator f = mergedModule->begin(),
+ e = mergedModule->end(); f != e; ++f)
+ applyRestriction(*f, Libcalls, MustPreserveList, AsmUsed, Mangler);
+ for (Module::global_iterator v = mergedModule->global_begin(),
+ e = mergedModule->global_end(); v != e; ++v)
+ applyRestriction(*v, Libcalls, MustPreserveList, AsmUsed, Mangler);
+ for (Module::alias_iterator a = mergedModule->alias_begin(),
+ e = mergedModule->alias_end(); a != e; ++a)
+ applyRestriction(*a, Libcalls, MustPreserveList, AsmUsed, Mangler);
+
+ GlobalVariable *LLVMCompilerUsed =
+ mergedModule->getGlobalVariable("llvm.compiler.used");
+ findUsedValues(LLVMCompilerUsed, AsmUsed);
+ if (LLVMCompilerUsed)
+ LLVMCompilerUsed->eraseFromParent();
+
+ if (!AsmUsed.empty()) {
+ llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(Context);
+ std::vector<Constant*> asmUsed2;
+ for (SmallPtrSet<GlobalValue*, 16>::const_iterator i = AsmUsed.begin(),
+ e = AsmUsed.end(); i !=e; ++i) {
+ GlobalValue *GV = *i;
+ Constant *c = ConstantExpr::getBitCast(GV, i8PTy);
+ asmUsed2.push_back(c);
+ }
+
+ llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size());
+ LLVMCompilerUsed =
+ new llvm::GlobalVariable(*mergedModule, ATy, false,
+ llvm::GlobalValue::AppendingLinkage,
+ llvm::ConstantArray::get(ATy, asmUsed2),
+ "llvm.compiler.used");
+
+ LLVMCompilerUsed->setSection("llvm.metadata");
+ }
+
+ passes.add(createInternalizePass(MustPreserveList));
+
+ // apply scope restrictions
+ passes.run(*mergedModule);
+
+ ScopeRestrictionsDone = true;
+}
+
+/// Optimize the merged module with IPO passes and generate the native object file.
+bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
+ bool DisableOpt,
+ bool DisableInline,
+ bool DisableGVNLoadPRE,
+ std::string &errMsg) {
+ if (!this->determineTarget(errMsg))
+ return false;
+
+ Module *mergedModule = Linker.getModule();
+
+ // Mark which symbols can not be internalized
+ this->applyScopeRestrictions();
+
+ // Instantiate the pass manager to organize the passes.
+ PassManager passes;
+
+ // Start off with a verification pass.
+ passes.add(createVerifierPass());
+
+ // Add an appropriate DataLayout instance for this module...
+ passes.add(new DataLayout(*TargetMach->getDataLayout()));
+ TargetMach->addAnalysisPasses(passes);
+
+ // Enabling internalize here would use its AllButMain variant. It
+ // keeps only main if it exists and does nothing for libraries. Instead
+ // we create the pass ourselves with the symbol list provided by the linker.
+ if (!DisableOpt)
+ PassManagerBuilder().populateLTOPassManager(passes,
+ /*Internalize=*/false,
+ !DisableInline,
+ DisableGVNLoadPRE);
+
+ // Make sure everything is still good.
+ passes.add(createVerifierPass());
+
+ PassManager codeGenPasses;
+
+ codeGenPasses.add(new DataLayout(*TargetMach->getDataLayout()));
+ TargetMach->addAnalysisPasses(codeGenPasses);
+
+ formatted_raw_ostream Out(out);
+
+ // If the bitcode files contain ARC code and were compiled with optimization,
+ // the ObjCARCContractPass must be run, so do it unconditionally here.
+ codeGenPasses.add(createObjCARCContractPass());
+
+ if (TargetMach->addPassesToEmitFile(codeGenPasses, Out,
+ TargetMachine::CGFT_ObjectFile)) {
+ errMsg = "target file type not supported";
+ return false;
+ }
+
+ // Run our queue of passes all at once now, efficiently.
+ passes.run(*mergedModule);
+
+ // Run the code generator, and write assembly file
+ codeGenPasses.run(*mergedModule);
+
+ return true;
+}
+
+/// setCodeGenDebugOptions - Set codegen debugging options to aid in debugging
+/// LTO problems.
+void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) {
+ for (std::pair<StringRef, StringRef> o = getToken(options);
+ !o.first.empty(); o = getToken(o.second)) {
+ // ParseCommandLineOptions() expects argv[0] to be program name. Lazily add
+ // that.
+ if (CodegenOptions.empty())
+ CodegenOptions.push_back(strdup("libLLVMLTO"));
+ CodegenOptions.push_back(strdup(o.first.str().c_str()));
+ }
+}
+
+void LTOCodeGenerator::parseCodeGenDebugOptions() {
+ // if options were requested, set them
+ if (!CodegenOptions.empty())
+ cl::ParseCommandLineOptions(CodegenOptions.size(),
+ const_cast<char **>(&CodegenOptions[0]));
+}
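
An end-to-end sketch (not part of this patch) of how a linker might drive this class together with LTOModule; the input names and error handling are illustrative, and the LTOModule objects stay owned by the caller:

#include "llvm/LTO/LTOCodeGenerator.h"
#include "llvm/LTO/LTOModule.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetOptions.h"
#include <cstdio>
#include <string>
using namespace llvm;

int main() {
  InitializeAllTargetInfos();
  InitializeAllTargets();
  InitializeAllTargetMCs();
  InitializeAllAsmPrinters();
  InitializeAllAsmParsers();

  std::string Err;
  TargetOptions Options;
  // "a.o" and "b.o" are assumed to be LLVM bitcode files.
  LTOModule *A = LTOModule::makeLTOModule("a.o", Options, Err);
  LTOModule *B = LTOModule::makeLTOModule("b.o", Options, Err);
  if (!A || !B) { std::fprintf(stderr, "%s\n", Err.c_str()); return 1; }

  LTOCodeGenerator Gen;
  Gen.setTargetOptions(Options);
  if (!Gen.addModule(A, Err) || !Gen.addModule(B, Err)) {
    std::fprintf(stderr, "%s\n", Err.c_str());
    return 1;
  }

  const char *ObjPath = 0;
  if (!Gen.compile_to_file(&ObjPath, /*disableOpt=*/false, /*disableInline=*/false,
                           /*disableGVNLoadPRE=*/false, Err)) {
    std::fprintf(stderr, "%s\n", Err.c_str());
    return 1;
  }
  std::printf("native object written to %s\n", ObjPath);
  delete A;
  delete B;
  return 0;
}
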
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
new file mode 100644
index 0000000..65416be
--- /dev/null
+++ b/lib/LTO/LTOModule.cpp
@@ -0,0 +1,794 @@
+//===-- LTOModule.cpp - LLVM Link Time Optimizer --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Link Time Optimization library. This library is
+// intended to be used by a linker to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+using namespace llvm;
+
+LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
+ : _module(m), _target(t),
+ _context(_target->getMCAsmInfo(), _target->getRegisterInfo(), NULL),
+ _mangler(t) {}
+
+/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
+/// bitcode.
+bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
+ return sys::fs::identify_magic(StringRef((const char *)mem, length)) ==
+ sys::fs::file_magic::bitcode;
+}
+
+bool LTOModule::isBitcodeFile(const char *path) {
+ sys::fs::file_magic type;
+ if (sys::fs::identify_magic(path, type))
+ return false;
+ return type == sys::fs::file_magic::bitcode;
+}
+
+/// isBitcodeFileForTarget - Returns 'true' if the file (or memory contents) is
+/// LLVM bitcode for the specified triple.
+bool LTOModule::isBitcodeFileForTarget(const void *mem, size_t length,
+ const char *triplePrefix) {
+ MemoryBuffer *buffer = makeBuffer(mem, length);
+ if (!buffer)
+ return false;
+ return isTargetMatch(buffer, triplePrefix);
+}
+
+bool LTOModule::isBitcodeFileForTarget(const char *path,
+ const char *triplePrefix) {
+ OwningPtr<MemoryBuffer> buffer;
+ if (MemoryBuffer::getFile(path, buffer))
+ return false;
+ return isTargetMatch(buffer.take(), triplePrefix);
+}
+
+/// isTargetMatch - Returns 'true' if the memory buffer is for the specified
+/// target triple.
+bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) {
+ std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext());
+ delete buffer;
+ return strncmp(Triple.c_str(), triplePrefix, strlen(triplePrefix)) == 0;
+}
+
+/// makeLTOModule - Create an LTOModule. N.B. These methods take ownership of
+/// the buffer.
+LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options,
+ std::string &errMsg) {
+ OwningPtr<MemoryBuffer> buffer;
+ if (error_code ec = MemoryBuffer::getFile(path, buffer)) {
+ errMsg = ec.message();
+ return NULL;
+ }
+ return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
+ size_t size, TargetOptions options,
+ std::string &errMsg) {
+ return makeLTOModule(fd, path, size, 0, options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
+ size_t map_size,
+ off_t offset,
+ TargetOptions options,
+ std::string &errMsg) {
+ OwningPtr<MemoryBuffer> buffer;
+ if (error_code ec =
+ MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) {
+ errMsg = ec.message();
+ return NULL;
+ }
+ return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length,
+ TargetOptions options,
+ std::string &errMsg) {
+ OwningPtr<MemoryBuffer> buffer(makeBuffer(mem, length));
+ if (!buffer)
+ return NULL;
+ return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
+ TargetOptions options,
+ std::string &errMsg) {
+ // parse bitcode buffer
+ OwningPtr<Module> m(getLazyBitcodeModule(buffer, getGlobalContext(),
+ &errMsg));
+ if (!m) {
+ delete buffer;
+ return NULL;
+ }
+
+ std::string TripleStr = m->getTargetTriple();
+ if (TripleStr.empty())
+ TripleStr = sys::getDefaultTargetTriple();
+ llvm::Triple Triple(TripleStr);
+
+ // find machine architecture for this module
+ const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
+ if (!march)
+ return NULL;
+
+ // construct LTOModule, hand over ownership of module and target
+ SubtargetFeatures Features;
+ Features.getDefaultSubtargetFeatures(Triple);
+ std::string FeatureStr = Features.getString();
+ // Set a default CPU for Darwin triples.
+ std::string CPU;
+ if (Triple.isOSDarwin()) {
+ if (Triple.getArch() == llvm::Triple::x86_64)
+ CPU = "core2";
+ else if (Triple.getArch() == llvm::Triple::x86)
+ CPU = "yonah";
+ }
+
+ TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
+ options);
+ m->MaterializeAllPermanently();
+
+ LTOModule *Ret = new LTOModule(m.take(), target);
+ if (Ret->parseSymbols(errMsg)) {
+ delete Ret;
+ return NULL;
+ }
+
+ return Ret;
+}
+
+/// makeBuffer - Create a MemoryBuffer from a memory range.
+MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) {
+ const char *startPtr = (const char*)mem;
+ return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false);
+}
+
+/// objcClassNameFromExpression - Get string that the data pointer points to.
+bool
+LTOModule::objcClassNameFromExpression(const Constant *c, std::string &name) {
+ if (const ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) {
+ Constant *op = ce->getOperand(0);
+ if (GlobalVariable *gvn = dyn_cast<GlobalVariable>(op)) {
+ Constant *cn = gvn->getInitializer();
+ if (ConstantDataArray *ca = dyn_cast<ConstantDataArray>(cn)) {
+ if (ca->isCString()) {
+ name = ".objc_class_name_" + ca->getAsCString().str();
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+/// addObjCClass - Parse i386/ppc ObjC class data structure.
+void LTOModule::addObjCClass(const GlobalVariable *clgv) {
+ const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
+ if (!c) return;
+
+ // second slot in __OBJC,__class is pointer to superclass name
+ std::string superclassName;
+ if (objcClassNameFromExpression(c->getOperand(1), superclassName)) {
+ NameAndAttributes info;
+ StringMap<NameAndAttributes>::value_type &entry =
+ _undefines.GetOrCreateValue(superclassName);
+ if (!entry.getValue().name) {
+ const char *symbolName = entry.getKey().data();
+ info.name = symbolName;
+ info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+ info.isFunction = false;
+ info.symbol = clgv;
+ entry.setValue(info);
+ }
+ }
+
+ // third slot in __OBJC,__class is pointer to class name
+ std::string className;
+ if (objcClassNameFromExpression(c->getOperand(2), className)) {
+ StringSet::value_type &entry = _defines.GetOrCreateValue(className);
+ entry.setValue(1);
+
+ NameAndAttributes info;
+ info.name = entry.getKey().data();
+ info.attributes = LTO_SYMBOL_PERMISSIONS_DATA |
+ LTO_SYMBOL_DEFINITION_REGULAR | LTO_SYMBOL_SCOPE_DEFAULT;
+ info.isFunction = false;
+ info.symbol = clgv;
+ _symbols.push_back(info);
+ }
+}
+
+/// addObjCCategory - Parse i386/ppc ObjC category data structure.
+void LTOModule::addObjCCategory(const GlobalVariable *clgv) {
+ const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
+ if (!c) return;
+
+ // second slot in __OBJC,__category is pointer to target class name
+ std::string targetclassName;
+ if (!objcClassNameFromExpression(c->getOperand(1), targetclassName))
+ return;
+
+ NameAndAttributes info;
+ StringMap<NameAndAttributes>::value_type &entry =
+ _undefines.GetOrCreateValue(targetclassName);
+
+ if (entry.getValue().name)
+ return;
+
+ const char *symbolName = entry.getKey().data();
+ info.name = symbolName;
+ info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+ info.isFunction = false;
+ info.symbol = clgv;
+ entry.setValue(info);
+}
+
+/// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
+void LTOModule::addObjCClassRef(const GlobalVariable *clgv) {
+ std::string targetclassName;
+ if (!objcClassNameFromExpression(clgv->getInitializer(), targetclassName))
+ return;
+
+ NameAndAttributes info;
+ StringMap<NameAndAttributes>::value_type &entry =
+ _undefines.GetOrCreateValue(targetclassName);
+ if (entry.getValue().name)
+ return;
+
+ const char *symbolName = entry.getKey().data();
+ info.name = symbolName;
+ info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+ info.isFunction = false;
+ info.symbol = clgv;
+ entry.setValue(info);
+}
+
+/// addDefinedDataSymbol - Add a data symbol as defined to the list.
+void LTOModule::addDefinedDataSymbol(const GlobalValue *v) {
+ // Add to list of defined symbols.
+ addDefinedSymbol(v, false);
+
+ if (!v->hasSection() /* || !isTargetDarwin */)
+ return;
+
+ // Special case i386/ppc ObjC data structures in magic sections:
+ // The issue is that the old ObjC object format did some strange
+ // contortions to avoid real linker symbols. For instance, the
+ // ObjC class data structure is allocated statically in the executable
+ // that defines that class. That data structures contains a pointer to
+ // its superclass. But instead of just initializing that part of the
+ // struct to the address of its superclass, and letting the static and
+ // dynamic linkers do the rest, the runtime works by having that field
+ // instead point to a C-string that is the name of the superclass.
+ // At runtime the objc initialization updates that pointer and sets
+ // it to point to the actual super class. As far as the linker
+ // knows it is just a pointer to a string. But then someone wanted the
+ // linker to issue errors at build time if the superclass was not found.
+  // So they figured out a way in the mach-o object format to use an absolute
+  // symbol (.objc_class_name_Foo = 0) and a floating reference
+  // (.reference .objc_class_name_Bar) to make the linker error out when
+  // a class was missing.
+ // The following synthesizes the implicit .objc_* symbols for the linker
+ // from the ObjC data structures generated by the front end.
+
+ // special case if this data blob is an ObjC class definition
+ if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
+ if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ addObjCClass(gv);
+ }
+ }
+
+ // special case if this data blob is an ObjC category definition
+ else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
+ if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ addObjCCategory(gv);
+ }
+ }
+
+ // special case if this data blob is the list of referenced classes
+ else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
+ if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+ addObjCClassRef(gv);
+ }
+ }
+}
+
+/// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
+void LTOModule::addDefinedFunctionSymbol(const Function *f) {
+ // add to list of defined symbols
+ addDefinedSymbol(f, true);
+}
+
+static bool canBeHidden(const GlobalValue *GV) {
+ GlobalValue::LinkageTypes L = GV->getLinkage();
+
+ if (L != GlobalValue::LinkOnceODRLinkage)
+ return false;
+
+ if (GV->hasUnnamedAddr())
+ return true;
+
+ GlobalStatus GS;
+ if (GlobalStatus::analyzeGlobal(GV, GS))
+ return false;
+
+ return !GS.IsCompared;
+}
+
+/// addDefinedSymbol - Add a defined symbol to the list.
+void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) {
+ // ignore all llvm.* symbols
+ if (def->getName().startswith("llvm."))
+ return;
+
+ // string is owned by _defines
+ SmallString<64> Buffer;
+ _mangler.getNameWithPrefix(Buffer, def, false);
+
+  // Set the alignment part; use countTrailingZeros() since log2() can have rounding errors.
+ uint32_t align = def->getAlignment();
+ uint32_t attr = align ? countTrailingZeros(def->getAlignment()) : 0;
+
+ // set permissions part
+ if (isFunction) {
+ attr |= LTO_SYMBOL_PERMISSIONS_CODE;
+ } else {
+ const GlobalVariable *gv = dyn_cast<GlobalVariable>(def);
+ if (gv && gv->isConstant())
+ attr |= LTO_SYMBOL_PERMISSIONS_RODATA;
+ else
+ attr |= LTO_SYMBOL_PERMISSIONS_DATA;
+ }
+
+ // set definition part
+ if (def->hasWeakLinkage() || def->hasLinkOnceLinkage() ||
+ def->hasLinkerPrivateWeakLinkage())
+ attr |= LTO_SYMBOL_DEFINITION_WEAK;
+ else if (def->hasCommonLinkage())
+ attr |= LTO_SYMBOL_DEFINITION_TENTATIVE;
+ else
+ attr |= LTO_SYMBOL_DEFINITION_REGULAR;
+
+ // set scope part
+ if (def->hasHiddenVisibility())
+ attr |= LTO_SYMBOL_SCOPE_HIDDEN;
+ else if (def->hasProtectedVisibility())
+ attr |= LTO_SYMBOL_SCOPE_PROTECTED;
+ else if (canBeHidden(def))
+ attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
+ else if (def->hasExternalLinkage() || def->hasWeakLinkage() ||
+ def->hasLinkOnceLinkage() || def->hasCommonLinkage() ||
+ def->hasLinkerPrivateWeakLinkage())
+ attr |= LTO_SYMBOL_SCOPE_DEFAULT;
+ else
+ attr |= LTO_SYMBOL_SCOPE_INTERNAL;
+
+ StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer);
+ entry.setValue(1);
+
+ // fill information structure
+ NameAndAttributes info;
+ StringRef Name = entry.getKey();
+ info.name = Name.data();
+ assert(info.name[Name.size()] == '\0');
+ info.attributes = attr;
+ info.isFunction = isFunction;
+ info.symbol = def;
+
+ // add to table of symbols
+ _symbols.push_back(info);
+}
+
+/// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the
+/// defined list.
+void LTOModule::addAsmGlobalSymbol(const char *name,
+ lto_symbol_attributes scope) {
+ StringSet::value_type &entry = _defines.GetOrCreateValue(name);
+
+ // only add new define if not already defined
+ if (entry.getValue())
+ return;
+
+ entry.setValue(1);
+
+ NameAndAttributes &info = _undefines[entry.getKey().data()];
+
+ if (info.symbol == 0) {
+ // FIXME: This is trying to take care of module ASM like this:
+ //
+ // module asm ".zerofill __FOO, __foo, _bar_baz_qux, 0"
+ //
+ // but is gross and its mother dresses it funny. Have the ASM parser give us
+ // more details for this type of situation so that we're not guessing so
+ // much.
+
+ // fill information structure
+ info.name = entry.getKey().data();
+ info.attributes =
+ LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope;
+ info.isFunction = false;
+ info.symbol = 0;
+
+ // add to table of symbols
+ _symbols.push_back(info);
+ return;
+ }
+
+ if (info.isFunction)
+ addDefinedFunctionSymbol(cast<Function>(info.symbol));
+ else
+ addDefinedDataSymbol(info.symbol);
+
+ _symbols.back().attributes &= ~LTO_SYMBOL_SCOPE_MASK;
+ _symbols.back().attributes |= scope;
+}
+
+/// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to the
+/// undefined list.
+void LTOModule::addAsmGlobalSymbolUndef(const char *name) {
+ StringMap<NameAndAttributes>::value_type &entry =
+ _undefines.GetOrCreateValue(name);
+
+ _asm_undefines.push_back(entry.getKey().data());
+
+ // we already have the symbol
+ if (entry.getValue().name)
+ return;
+
+  uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED;
+ attr |= LTO_SYMBOL_SCOPE_DEFAULT;
+ NameAndAttributes info;
+ info.name = entry.getKey().data();
+ info.attributes = attr;
+ info.isFunction = false;
+ info.symbol = 0;
+
+ entry.setValue(info);
+}
+
+/// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a
+/// list to be resolved later.
+void
+LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
+ // ignore all llvm.* symbols
+ if (decl->getName().startswith("llvm."))
+ return;
+
+ // ignore all aliases
+ if (isa<GlobalAlias>(decl))
+ return;
+
+ SmallString<64> name;
+ _mangler.getNameWithPrefix(name, decl, false);
+
+ StringMap<NameAndAttributes>::value_type &entry =
+ _undefines.GetOrCreateValue(name);
+
+ // we already have the symbol
+ if (entry.getValue().name)
+ return;
+
+ NameAndAttributes info;
+
+ info.name = entry.getKey().data();
+
+ if (decl->hasExternalWeakLinkage())
+ info.attributes = LTO_SYMBOL_DEFINITION_WEAKUNDEF;
+ else
+ info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+
+ info.isFunction = isFunc;
+ info.symbol = decl;
+
+ entry.setValue(info);
+}
+
+namespace {
+ class RecordStreamer : public MCStreamer {
+ public:
+ enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
+
+ private:
+ StringMap<State> Symbols;
+
+ void markDefined(const MCSymbol &Symbol) {
+ State &S = Symbols[Symbol.getName()];
+ switch (S) {
+ case DefinedGlobal:
+ case Global:
+ S = DefinedGlobal;
+ break;
+ case NeverSeen:
+ case Defined:
+ case Used:
+ S = Defined;
+ break;
+ }
+ }
+ void markGlobal(const MCSymbol &Symbol) {
+ State &S = Symbols[Symbol.getName()];
+ switch (S) {
+ case DefinedGlobal:
+ case Defined:
+ S = DefinedGlobal;
+ break;
+
+ case NeverSeen:
+ case Global:
+ case Used:
+ S = Global;
+ break;
+ }
+ }
+ void markUsed(const MCSymbol &Symbol) {
+ State &S = Symbols[Symbol.getName()];
+ switch (S) {
+ case DefinedGlobal:
+ case Defined:
+ case Global:
+ break;
+
+ case NeverSeen:
+ case Used:
+ S = Used;
+ break;
+ }
+ }
+
+    // FIXME: mostly copied from the object streamer.
+ void AddValueSymbols(const MCExpr *Value) {
+ switch (Value->getKind()) {
+ case MCExpr::Target:
+ // FIXME: What should we do in here?
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+ AddValueSymbols(BE->getLHS());
+ AddValueSymbols(BE->getRHS());
+ break;
+ }
+
+ case MCExpr::SymbolRef:
+ markUsed(cast<MCSymbolRefExpr>(Value)->getSymbol());
+ break;
+
+ case MCExpr::Unary:
+ AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
+ break;
+ }
+ }
+
+ public:
+ typedef StringMap<State>::const_iterator const_iterator;
+
+ const_iterator begin() {
+ return Symbols.begin();
+ }
+
+ const_iterator end() {
+ return Symbols.end();
+ }
+
+ RecordStreamer(MCContext &Context) : MCStreamer(Context, 0) {}
+
+ virtual void EmitInstruction(const MCInst &Inst) {
+ // Scan for values.
+ for (unsigned i = Inst.getNumOperands(); i--; )
+ if (Inst.getOperand(i).isExpr())
+ AddValueSymbols(Inst.getOperand(i).getExpr());
+ }
+ virtual void EmitLabel(MCSymbol *Symbol) {
+ Symbol->setSection(*getCurrentSection().first);
+ markDefined(*Symbol);
+ }
+ virtual void EmitDebugLabel(MCSymbol *Symbol) {
+ EmitLabel(Symbol);
+ }
+ virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+ // FIXME: should we handle aliases?
+ markDefined(*Symbol);
+ }
+ virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+ if (Attribute == MCSA_Global)
+ markGlobal(*Symbol);
+ return true;
+ }
+ virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size , unsigned ByteAlignment) {
+ markDefined(*Symbol);
+ }
+ virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
+ markDefined(*Symbol);
+ }
+
+ virtual void EmitBundleAlignMode(unsigned AlignPow2) {}
+ virtual void EmitBundleLock(bool AlignToEnd) {}
+ virtual void EmitBundleUnlock() {}
+
+ // Noop calls.
+ virtual void ChangeSection(const MCSection *Section,
+ const MCExpr *Subsection) {}
+ virtual void InitToTextSection() {}
+ virtual void InitSections() {}
+ virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {}
+ virtual void EmitThumbFunc(MCSymbol *Func) {}
+ virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+ virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
+ virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
+ virtual void EmitCOFFSymbolStorageClass(int StorageClass) {}
+ virtual void EmitCOFFSymbolType(int Type) {}
+ virtual void EndCOFFSymbolDef() {}
+ virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {}
+ virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {}
+ virtual void EmitBytes(StringRef Data) {}
+ virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {}
+ virtual void EmitULEB128Value(const MCExpr *Value) {}
+ virtual void EmitSLEB128Value(const MCExpr *Value) {}
+ virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+ unsigned ValueSize,
+ unsigned MaxBytesToEmit) {}
+ virtual void EmitCodeAlignment(unsigned ByteAlignment,
+ unsigned MaxBytesToEmit) {}
+ virtual bool EmitValueToOffset(const MCExpr *Offset,
+ unsigned char Value ) { return false; }
+ virtual void EmitFileDirective(StringRef Filename) {}
+ virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+ const MCSymbol *LastLabel,
+ const MCSymbol *Label,
+ unsigned PointerSize) {}
+ virtual void FinishImpl() {}
+ virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+ RecordProcEnd(Frame);
+ }
+ };
+} // end anonymous namespace
+
+/// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
+/// defined or undefined lists.
+bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
+ const std::string &inlineAsm = _module->getModuleInlineAsm();
+ if (inlineAsm.empty())
+ return false;
+
+ OwningPtr<RecordStreamer> Streamer(new RecordStreamer(_context));
+ MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(inlineAsm);
+ SourceMgr SrcMgr;
+ SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+ OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
+ _context, *Streamer,
+ *_target->getMCAsmInfo()));
+ const Target &T = _target->getTarget();
+ OwningPtr<MCInstrInfo> MCII(T.createMCInstrInfo());
+ OwningPtr<MCSubtargetInfo>
+ STI(T.createMCSubtargetInfo(_target->getTargetTriple(),
+ _target->getTargetCPU(),
+ _target->getTargetFeatureString()));
+ OwningPtr<MCTargetAsmParser> TAP(T.createMCAsmParser(*STI, *Parser.get(), *MCII));
+ if (!TAP) {
+ errMsg = "target " + std::string(T.getName()) +
+ " does not define AsmParser.";
+ return true;
+ }
+
+ Parser->setTargetParser(*TAP);
+ if (Parser->Run(false))
+ return true;
+
+ for (RecordStreamer::const_iterator i = Streamer->begin(),
+ e = Streamer->end(); i != e; ++i) {
+ StringRef Key = i->first();
+ RecordStreamer::State Value = i->second;
+ if (Value == RecordStreamer::DefinedGlobal)
+ addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_DEFAULT);
+ else if (Value == RecordStreamer::Defined)
+ addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_INTERNAL);
+ else if (Value == RecordStreamer::Global ||
+ Value == RecordStreamer::Used)
+ addAsmGlobalSymbolUndef(Key.data());
+ }
+
+ return false;
+}
+
+/// isDeclaration - Return 'true' if the global value is a declaration.
+static bool isDeclaration(const GlobalValue &V) {
+ if (V.hasAvailableExternallyLinkage())
+ return true;
+
+ if (V.isMaterializable())
+ return false;
+
+ return V.isDeclaration();
+}
+
+/// parseSymbols - Parse the symbols from the module and module-level ASM and add
+/// them to either the defined or undefined lists.
+bool LTOModule::parseSymbols(std::string &errMsg) {
+ // add functions
+ for (Module::iterator f = _module->begin(), e = _module->end(); f != e; ++f) {
+ if (isDeclaration(*f))
+ addPotentialUndefinedSymbol(f, true);
+ else
+ addDefinedFunctionSymbol(f);
+ }
+
+ // add data
+ for (Module::global_iterator v = _module->global_begin(),
+ e = _module->global_end(); v != e; ++v) {
+ if (isDeclaration(*v))
+ addPotentialUndefinedSymbol(v, false);
+ else
+ addDefinedDataSymbol(v);
+ }
+
+ // add asm globals
+ if (addAsmGlobalSymbols(errMsg))
+ return true;
+
+ // add aliases
+ for (Module::alias_iterator a = _module->alias_begin(),
+ e = _module->alias_end(); a != e; ++a) {
+ if (isDeclaration(*a->getAliasedGlobal()))
+ // Is an alias to a declaration.
+ addPotentialUndefinedSymbol(a, false);
+ else
+ addDefinedDataSymbol(a);
+ }
+
+ // make symbols for all undefines
+ for (StringMap<NameAndAttributes>::iterator u =_undefines.begin(),
+ e = _undefines.end(); u != e; ++u) {
+ // If this symbol also has a definition, then don't make an undefine because
+ // it is a tentative definition.
+ if (_defines.count(u->getKey())) continue;
+ NameAndAttributes info = u->getValue();
+ _symbols.push_back(info);
+ }
+
+ return false;
+}
diff --git a/lib/LTO/Makefile b/lib/LTO/Makefile
new file mode 100644
index 0000000..55e2a5e
--- /dev/null
+++ b/lib/LTO/Makefile
@@ -0,0 +1,15 @@
+##===- lib/LTO/Makefile ------------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMLTO
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index ce02c37..8f2200e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -22,8 +22,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
-
-#include <ctype.h>
+#include <cctype>
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -706,7 +705,11 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
if (DstGV->getVisibility() != SrcGV->getVisibility())
return emitError(
"Appending variables with different visibility need to be linked!");
-
+
+ if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr())
+ return emitError(
+ "Appending variables with different unnamed_addr need to be linked!");
+
if (DstGV->getSection() != SrcGV->getSection())
return emitError(
"Appending variables with different section name need to be linked!");
@@ -748,6 +751,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
GlobalValue *DGV = getLinkedToGlobal(SGV);
llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+ bool HasUnnamedAddr = SGV->hasUnnamedAddr();
if (DGV) {
// Concatenation of appending linkage variables is magic and handled later.
@@ -762,6 +766,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
return true;
NewVisibility = NV;
+ HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
// If we're not linking from the source, then keep the definition that we
// have.
@@ -770,10 +775,11 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant())
DGVar->setConstant(true);
-
- // Set calculated linkage and visibility.
+
+ // Set calculated linkage, visibility and unnamed_addr.
DGV->setLinkage(NewLinkage);
DGV->setVisibility(*NewVisibility);
+ DGV->setUnnamedAddr(HasUnnamedAddr);
// Make sure to remember this mapping.
ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
@@ -799,6 +805,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
copyGVAttributes(NewDGV, SGV);
if (NewVisibility)
NewDGV->setVisibility(*NewVisibility);
+ NewDGV->setUnnamedAddr(HasUnnamedAddr);
if (DGV) {
DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
@@ -815,6 +822,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
bool ModuleLinker::linkFunctionProto(Function *SF) {
GlobalValue *DGV = getLinkedToGlobal(SF);
llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+ bool HasUnnamedAddr = SF->hasUnnamedAddr();
if (DGV) {
GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
@@ -823,11 +831,13 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
return true;
NewVisibility = NV;
+ HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
if (!LinkFromSrc) {
// Set calculated linkage
DGV->setLinkage(NewLinkage);
DGV->setVisibility(*NewVisibility);
+ DGV->setUnnamedAddr(HasUnnamedAddr);
// Make sure to remember this mapping.
ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
@@ -855,6 +865,7 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
copyGVAttributes(NewDF, SF);
if (NewVisibility)
NewDF->setVisibility(*NewVisibility);
+ NewDF->setUnnamedAddr(HasUnnamedAddr);
if (DGV) {
// Any uses of DF need to change to NewDF, with cast.
@@ -1250,6 +1261,13 @@ bool ModuleLinker::run() {
// Skip if not linking from source.
if (DoNotLinkFromSource.count(SF)) continue;
+ Function *DF = cast<Function>(ValueMap[SF]);
+ if (SF->hasPrefixData()) {
+ // Link in the prefix data.
+ DF->setPrefixData(MapValue(
+ SF->getPrefixData(), ValueMap, RF_None, &TypeMap, &ValMaterializer));
+ }
+
// Skip if no body (function is external) or materialize.
if (SF->isDeclaration()) {
if (!SF->isMaterializable())
@@ -1258,7 +1276,7 @@ bool ModuleLinker::run() {
return true;
}
- linkFunctionBody(cast<Function>(ValueMap[SF]), SF);
+ linkFunctionBody(DF, SF);
SF->Dematerialize();
}
@@ -1286,6 +1304,14 @@ bool ModuleLinker::run() {
continue;
Function *DF = cast<Function>(ValueMap[SF]);
+ if (SF->hasPrefixData()) {
+ // Link in the prefix data.
+ DF->setPrefixData(MapValue(SF->getPrefixData(),
+ ValueMap,
+ RF_None,
+ &TypeMap,
+ &ValMaterializer));
+ }
// Materialize if necessary.
if (SF->isDeclaration()) {
@@ -1326,6 +1352,11 @@ Linker::Linker(Module *M) : Composite(M) {
Linker::~Linker() {
}
+void Linker::deleteModule() {
+ delete Composite;
+ Composite = NULL;
+}
+
bool Linker::linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg) {
ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, Mode);
if (TheLinker.run()) {
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 89e2aaf..fa844ef 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMMC
MCAsmInfo.cpp
MCAsmInfoCOFF.cpp
MCAsmInfoDarwin.cpp
+ MCAsmInfoELF.cpp
MCAsmStreamer.cpp
MCAssembler.cpp
MCAtom.cpp
@@ -25,6 +26,7 @@ add_llvm_library(LLVMMC
MCMachOStreamer.cpp
MCMachObjectTargetWriter.cpp
MCModule.cpp
+ MCModuleYAML.cpp
MCNullStreamer.cpp
MCObjectFileInfo.cpp
MCObjectDisassembler.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 2db59ac..9899bb2 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -73,10 +73,6 @@ class ELFObjectWriter : public MCObjectWriter {
// Support lexicographic sorting.
bool operator<(const ELFSymbolData &RHS) const {
- if (MCELF::GetType(*SymbolData) == ELF::STT_FILE)
- return true;
- if (MCELF::GetType(*RHS.SymbolData) == ELF::STT_FILE)
- return false;
return SymbolData->getSymbol().getName() <
RHS.SymbolData->getSymbol().getName();
}
@@ -98,6 +94,7 @@ class ELFObjectWriter : public MCObjectWriter {
/// @{
SmallString<256> StringTable;
+ std::vector<uint64_t> FileSymbolData;
std::vector<ELFSymbolData> LocalSymbolData;
std::vector<ELFSymbolData> ExternalSymbolData;
std::vector<ELFSymbolData> UndefinedSymbolData;
@@ -551,7 +548,7 @@ void ELFObjectWriter::WriteSymbol(MCDataFragment *SymtabF,
uint8_t Type = MCELF::GetType(Data);
uint8_t Info = (Binding << ELF_STB_Shift) | (Type << ELF_STT_Shift);
- // Other and Visibility share the same byte with Visability using the lower
+ // Other and Visibility share the same byte with Visibility using the lower
// 2 bits
uint8_t Visibility = MCELF::GetVisibility(OrigData);
uint8_t Other = MCELF::getOther(OrigData) <<
@@ -590,8 +587,15 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
// The first entry is the undefined symbol entry.
WriteSymbolEntry(SymtabF, ShndxF, 0, 0, 0, 0, 0, 0, false);
+ for (unsigned i = 0, e = FileSymbolData.size(); i != e; ++i) {
+ WriteSymbolEntry(SymtabF, ShndxF, FileSymbolData[i],
+ ELF::STT_FILE | ELF::STB_LOCAL, 0, 0,
+ ELF::STV_DEFAULT, ELF::SHN_ABS, true);
+ }
+
// Write the symbol table entries.
- LastLocalSymbolIndex = LocalSymbolData.size() + 1;
+ LastLocalSymbolIndex = FileSymbolData.size() + LocalSymbolData.size() + 1;
+
for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i) {
ELFSymbolData &MSD = LocalSymbolData[i];
WriteSymbol(SymtabF, ShndxF, MSD, Layout);
@@ -880,6 +884,20 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
// FIXME: We could optimize suffixes in strtab in the same way we
// optimize them in shstrtab.
+ for (MCAssembler::const_file_name_iterator it = Asm.file_names_begin(),
+ ie = Asm.file_names_end();
+ it != ie;
+ ++it) {
+ StringRef Name = *it;
+ uint64_t &Entry = StringIndexMap[Name];
+ if (!Entry) {
+ Entry = StringTable.size();
+ StringTable += Name;
+ StringTable += '\x00';
+ }
+ FileSymbolData.push_back(Entry);
+ }
+
// Add the data for the symbols.
for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
ie = Asm.symbol_end(); it != ie; ++it) {
@@ -964,7 +982,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
// Set the symbol indices. Local symbols must come before all other
// symbols with non-local bindings.
- unsigned Index = 1;
+ unsigned Index = FileSymbolData.size() + 1;
for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
LocalSymbolData[i].SymbolData->setIndex(Index++);
@@ -1005,8 +1023,8 @@ void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm,
unsigned Flags = 0;
StringRef Group = "";
if (Section.getFlags() & ELF::SHF_GROUP) {
- Flags = ELF::SHF_GROUP;
- Group = Section.getGroup()->getName();
+ Flags = ELF::SHF_GROUP;
+ Group = Section.getGroup()->getName();
}
const MCSectionELF *RelaSection =
@@ -1073,7 +1091,7 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
else if (entry.Index < 0)
entry.Index = getSymbolIndexInSymbolTable(Asm, entry.Symbol);
else
- entry.Index += LocalSymbolData.size();
+ entry.Index += FileSymbolData.size() + LocalSymbolData.size();
if (is64Bit()) {
String64(*F, entry.r_offset);
if (TargetObjectWriter->isN64()) {
@@ -1104,11 +1122,10 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
}
}
-static int compareBySuffix(const void *a, const void *b) {
- const MCSectionELF *secA = *static_cast<const MCSectionELF* const *>(a);
- const MCSectionELF *secB = *static_cast<const MCSectionELF* const *>(b);
- const StringRef &NameA = secA->getSectionName();
- const StringRef &NameB = secB->getSectionName();
+static int compareBySuffix(const MCSectionELF *const *a,
+ const MCSectionELF *const *b) {
+ const StringRef &NameA = (*a)->getSectionName();
+ const StringRef &NameB = (*b)->getSectionName();
const unsigned sizeA = NameA.size();
const unsigned sizeB = NameB.size();
const unsigned len = std::min(sizeA, sizeB);
@@ -1299,10 +1316,12 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
// Remove ".rel" and ".rela" prefixes.
unsigned SecNameLen = (Section.getType() == ELF::SHT_REL) ? 4 : 5;
StringRef SectionName = Section.getSectionName().substr(SecNameLen);
+ StringRef GroupName =
+ Section.getGroup() ? Section.getGroup()->getName() : "";
- InfoSection = Asm.getContext().getELFSection(SectionName,
- ELF::SHT_PROGBITS, 0,
- SectionKind::getReadOnly());
+ InfoSection = Asm.getContext().getELFSection(SectionName, ELF::SHT_PROGBITS,
+ 0, SectionKind::getReadOnly(),
+ 0, GroupName);
sh_info = SectionIndexMap.lookup(InfoSection);
break;
}
@@ -1352,11 +1371,12 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
SectionKind::getText()));
} else if (SecName.startswith(".ARM.exidx")) {
- sh_link = SectionIndexMap.lookup(
- Asm.getContext().getELFSection(SecName.substr(sizeof(".ARM.exidx") - 1),
- ELF::SHT_PROGBITS,
- ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
- SectionKind::getText()));
+ StringRef GroupName =
+ Section.getGroup() ? Section.getGroup()->getName() : "";
+ sh_link = SectionIndexMap.lookup(Asm.getContext().getELFSection(
+ SecName.substr(sizeof(".ARM.exidx") - 1), ELF::SHT_PROGBITS,
+ ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, SectionKind::getText(), 0,
+ GroupName));
}
}
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 84e4075..28f1c95 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -35,7 +35,7 @@ MCAsmInfo::MCAsmInfo() {
LinkerRequiresNonEmptyDwarfLines = false;
MaxInstLength = 4;
MinInstAlignment = 1;
- PCSymbol = "$";
+ DollarIsPC = false;
SeparatorString = ";";
CommentColumn = 40;
CommentString = "#";
@@ -50,10 +50,7 @@ MCAsmInfo::MCAsmInfo() {
Code32Directive = ".code32";
Code64Directive = ".code64";
AssemblerDialect = 0;
- AllowQuotesInName = false;
- AllowNameToStartWithDigit = false;
- AllowPeriodsInName = true;
- AllowUTF8 = true;
+ AllowAtInName = false;
UseDataRegionDirectives = false;
ZeroDirective = "\t.zero\t";
AsciiDirective = "\t.ascii\t";
@@ -76,8 +73,8 @@ MCAsmInfo::MCAsmInfo() {
LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
HasDotTypeDotSizeDirective = true;
HasSingleParameterDotFile = true;
+ HasIdentDirective = false;
HasNoDeadStrip = false;
- HasSymbolResolver = false;
WeakRefDirective = 0;
WeakDefDirective = 0;
LinkOnceDirective = 0;
@@ -87,7 +84,6 @@ MCAsmInfo::MCAsmInfo() {
HasLEB128 = false;
SupportsDebugInformation = false;
ExceptionsType = ExceptionHandling::None;
- DwarfUsesInlineInfoSection = false;
DwarfUsesRelocationsAcrossSections = true;
DwarfRegNumForCFI = false;
HasMicrosoftFastStdCallMangling = false;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 33350d9..9d9f98e 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -43,7 +43,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
void MCAsmInfoMicrosoft::anchor() { }
MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
- AllowQuotesInName = true;
}
void MCAsmInfoGNUCOFF::anchor() { }
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index a0e3eba..704c816 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -26,7 +26,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
GlobalPrefix = "_";
PrivateGlobalPrefix = "L";
LinkerPrivateGlobalPrefix = "l";
- AllowQuotesInName = true;
HasSingleParameterDotFile = false;
HasSubsectionsViaSymbols = true;
@@ -58,7 +57,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
- HasSymbolResolver = true;
DwarfUsesRelocationsAcrossSections = false;
}
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
new file mode 100644
index 0000000..8cf4e4f
--- /dev/null
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -0,0 +1,23 @@
+//===-- MCAsmInfoELF.cpp - ELF asm properties -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm statements
+// should take in general on ELF-based targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoELF.h"
+using namespace llvm;
+
+void MCAsmInfoELF::anchor() { }
+
+MCAsmInfoELF::MCAsmInfoELF() {
+ HasIdentDirective = true;
+ WeakRefDirective = "\t.weak\t";
+}
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 781e400..ca49f8f 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
@@ -65,17 +66,15 @@ private:
virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame);
public:
- MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
- bool isVerboseAsm, bool useLoc, bool useCFI,
- bool useDwarfDirectory,
- MCInstPrinter *printer, MCCodeEmitter *emitter,
- MCAsmBackend *asmbackend,
- bool showInst)
- : MCStreamer(SK_AsmStreamer, Context), OS(os), MAI(Context.getAsmInfo()),
- InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
- CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
- ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
- UseDwarfDirectory(useDwarfDirectory) {
+ MCAsmStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+ formatted_raw_ostream &os, bool isVerboseAsm, bool useLoc,
+ bool useCFI, bool useDwarfDirectory, MCInstPrinter *printer,
+ MCCodeEmitter *emitter, MCAsmBackend *asmbackend, bool showInst)
+ : MCStreamer(Context, TargetStreamer), OS(os), MAI(Context.getAsmInfo()),
+ InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
+ CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
+ ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
+ UseDwarfDirectory(useDwarfDirectory) {
if (InstPrinter && IsVerboseAsm)
InstPrinter->setCommentStream(CommentStream);
}
@@ -154,7 +153,7 @@ public:
virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label);
- virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+ virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
@@ -214,6 +213,7 @@ public:
unsigned Isa, unsigned Discriminator,
StringRef FileName);
+ virtual void EmitIdent(StringRef IdentString);
virtual void EmitCFISections(bool EH, bool Debug);
virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
virtual void EmitCFIDefCfaOffset(int64_t Offset);
@@ -229,6 +229,7 @@ public:
virtual void EmitCFISignalFrame();
virtual void EmitCFIUndefined(int64_t Register);
virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
+ virtual void EmitCFIWindowSave();
virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
virtual void EmitWin64EHEndProc();
@@ -245,17 +246,6 @@ public:
virtual void EmitWin64EHPushFrame(bool Code);
virtual void EmitWin64EHEndProlog();
- virtual void EmitFnStart();
- virtual void EmitFnEnd();
- virtual void EmitCantUnwind();
- virtual void EmitPersonality(const MCSymbol *Personality);
- virtual void EmitHandlerData();
- virtual void EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
- virtual void EmitPad(int64_t Offset);
- virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool);
-
- virtual void EmitTCEntry(const MCSymbol &S);
-
virtual void EmitInstruction(const MCInst &Inst);
virtual void EmitBundleAlignMode(unsigned AlignPow2);
@@ -265,15 +255,9 @@ public:
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
- virtual void EmitRawText(StringRef String);
+ virtual void EmitRawTextImpl(StringRef String);
virtual void FinishImpl();
-
- /// @}
-
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_AsmStreamer;
- }
};
} // end anonymous namespace.
@@ -436,7 +420,7 @@ void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
}
-void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) {
switch (Attribute) {
case MCSA_Invalid: llvm_unreachable("Invalid symbol attribute");
@@ -447,11 +431,12 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_ELF_TypeCommon: /// .type _foo, STT_COMMON # aka @common
case MCSA_ELF_TypeNoType: /// .type _foo, STT_NOTYPE # aka @notype
case MCSA_ELF_TypeGnuUniqueObject: /// .type _foo, @gnu_unique_object
- assert(MAI->hasDotTypeDotSizeDirective() && "Symbol Attr not supported");
+ if (!MAI->hasDotTypeDotSizeDirective())
+ return false; // Symbol attribute not supported
OS << "\t.type\t" << *Symbol << ','
<< ((MAI->getCommentString()[0] != '@') ? '@' : '%');
switch (Attribute) {
- default: llvm_unreachable("Unknown ELF .type");
+ default: return false;
case MCSA_ELF_TypeFunction: OS << "function"; break;
case MCSA_ELF_TypeIndFunction: OS << "gnu_indirect_function"; break;
case MCSA_ELF_TypeObject: OS << "object"; break;
@@ -461,7 +446,7 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_ELF_TypeGnuUniqueObject: OS << "gnu_unique_object"; break;
}
EmitEOL();
- return;
+ return true;
case MCSA_Global: // .globl/.global
OS << MAI->getGlobalDirective();
FlagMap[Symbol] |= EHGlobal;
@@ -491,6 +476,8 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
OS << *Symbol;
EmitEOL();
+
+ return true;
}
void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -530,6 +517,9 @@ void MCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
+ // Common symbols do not belong to any actual section.
+ AssignSection(Symbol, NULL);
+
OS << "\t.comm\t" << *Symbol << ',' << Size;
if (ByteAlignment != 0) {
if (MAI->getCOMMDirectiveAlignmentIsInBytes())
@@ -546,6 +536,9 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
/// @param Size - The size of the common symbol.
void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlign) {
+ // Common symbols do not belong to any actual section.
+ AssignSection(Symbol, NULL);
+
OS << "\t.lcomm\t" << *Symbol << ',' << Size;
if (ByteAlign > 1) {
switch (MAI->getLCOMMDirectiveAlignmentType()) {
@@ -565,6 +558,9 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
+ if (Symbol)
+ AssignSection(Symbol, Section);
+
// Note: a .zerofill directive does not switch sections.
OS << ".zerofill ";
@@ -585,6 +581,8 @@ void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
// e.g. _a.
void MCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
+ AssignSection(Symbol, Section);
+
assert(Symbol != NULL && "Symbol shouldn't be NULL!");
// Instead of using the Section we'll just use the shortcut.
// This is a mach-o specific directive and section.
@@ -654,7 +652,6 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
OS << MAI->getAsciiDirective();
}
- OS << ' ';
PrintQuotedString(Data, OS);
EmitEOL();
}
@@ -884,6 +881,13 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
EmitEOL();
}
+void MCAsmStreamer::EmitIdent(StringRef IdentString) {
+ assert(MAI->hasIdentDirective() && ".ident directive not supported");
+ OS << "\t.ident\t";
+ PrintQuotedString(IdentString, OS);
+ EmitEOL();
+}
+
void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
MCStreamer::EmitCFISections(EH, Debug);
@@ -1085,6 +1089,16 @@ void MCAsmStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
EmitEOL();
}
+void MCAsmStreamer::EmitCFIWindowSave() {
+ MCStreamer::EmitCFIWindowSave();
+
+ if (!UseCFI)
+ return;
+
+ OS << "\t.cfi_window_save";
+ EmitEOL();
+}
+
void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
MCStreamer::EmitWin64EHStartProc(Symbol);
@@ -1290,73 +1304,6 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
}
}
-void MCAsmStreamer::EmitFnStart() {
- OS << "\t.fnstart";
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitFnEnd() {
- OS << "\t.fnend";
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitCantUnwind() {
- OS << "\t.cantunwind";
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitHandlerData() {
- OS << "\t.handlerdata";
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) {
- OS << "\t.personality " << Personality->getName();
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
- OS << "\t.setfp\t";
- InstPrinter->printRegName(OS, FpReg);
- OS << ", ";
- InstPrinter->printRegName(OS, SpReg);
- if (Offset)
- OS << ", #" << Offset;
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitPad(int64_t Offset) {
- OS << "\t.pad\t#" << Offset;
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
- bool isVector) {
- assert(RegList.size() && "RegList should not be empty");
- if (isVector)
- OS << "\t.vsave\t{";
- else
- OS << "\t.save\t{";
-
- InstPrinter->printRegName(OS, RegList[0]);
-
- for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
- OS << ", ";
- InstPrinter->printRegName(OS, RegList[i]);
- }
-
- OS << "}";
- EmitEOL();
-}
-
-void MCAsmStreamer::EmitTCEntry(const MCSymbol &S) {
- OS << "\t.tc ";
- OS << S.getName();
- OS << "[TC],";
- OS << S.getName();
- EmitEOL();
-}
-
void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
assert(getCurrentSection().first &&
"Cannot emit contents before setting section!");
@@ -1399,7 +1346,7 @@ void MCAsmStreamer::EmitBundleUnlock() {
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
-void MCAsmStreamer::EmitRawText(StringRef String) {
+void MCAsmStreamer::EmitRawTextImpl(StringRef String) {
if (!String.empty() && String.back() == '\n')
String = String.substr(0, String.size()-1);
OS << String;
@@ -1418,14 +1365,16 @@ void MCAsmStreamer::FinishImpl() {
MCGenDwarfInfo::Emit(this, LineSectionSymbol);
if (!UseCFI)
- EmitFrames(false);
+ EmitFrames(AsmBackend.get(), false);
}
+
MCStreamer *llvm::createAsmStreamer(MCContext &Context,
+ MCTargetStreamer *TargetStreamer,
formatted_raw_ostream &OS,
- bool isVerboseAsm, bool useLoc,
- bool useCFI, bool useDwarfDirectory,
- MCInstPrinter *IP, MCCodeEmitter *CE,
- MCAsmBackend *MAB, bool ShowInst) {
- return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI,
- useDwarfDirectory, IP, CE, MAB, ShowInst);
+ bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool useDwarfDirectory, MCInstPrinter *IP,
+ MCCodeEmitter *CE, MCAsmBackend *MAB,
+ bool ShowInst) {
+ return new MCAsmStreamer(Context, TargetStreamer, OS, isVerboseAsm, useLoc,
+ useCFI, useDwarfDirectory, IP, CE, MAB, ShowInst);
}
diff --git a/lib/MC/MCAtom.cpp b/lib/MC/MCAtom.cpp
index 2626b39..bc353cd 100644
--- a/lib/MC/MCAtom.cpp
+++ b/lib/MC/MCAtom.cpp
@@ -14,6 +14,9 @@
using namespace llvm;
+// Pin the vtable to this file.
+void MCAtom::anchor() {}
+
void MCAtom::remap(uint64_t NewBegin, uint64_t NewEnd) {
Parent->remap(this, NewBegin, NewEnd);
}
@@ -44,7 +47,7 @@ void MCAtom::remapForSplit(uint64_t SplitPt,
void MCDataAtom::addData(const MCData &D) {
Data.push_back(D);
- if (Data.size() > Begin - End)
+ if (Data.size() > End + 1 - Begin)
remap(Begin, End + 1);
}
@@ -72,8 +75,8 @@ MCDataAtom *MCDataAtom::split(uint64_t SplitPt) {
// MCTextAtom
void MCTextAtom::addInst(const MCInst &I, uint64_t Size) {
- if (NextInstAddress > End)
- remap(Begin, NextInstAddress);
+ if (NextInstAddress + Size - 1 > End)
+ remap(Begin, NextInstAddress + Size - 1);
Insts.push_back(MCDecodedInst(I, NextInstAddress, Size));
NextInstAddress += Size;
}
@@ -106,5 +109,6 @@ MCTextAtom *MCTextAtom::split(uint64_t SplitPt) {
std::copy(I, Insts.end(), std::back_inserter(RightAtom->Insts));
Insts.erase(I, Insts.end());
+ Parent->splitBasicBlocksForAtom(this, RightAtom);
return RightAtom;
}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 6e4d82b..3b45d16 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -25,12 +25,16 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
+
+#include <map>
+
using namespace llvm;
-typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
-typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
-typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
+typedef std::pair<std::string, std::string> SectionGroupPair;
+typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
+typedef std::map<SectionGroupPair, const MCSectionELF *> ELFUniqueMapTy;
+typedef std::map<SectionGroupPair, const MCSectionCOFF *> COFFUniqueMapTy;
MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
const MCObjectFileInfo *mofi, const SourceMgr *mgr,
@@ -249,8 +253,9 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)ELFUniquingMap;
// Do the lookup, if we have a hit, return it.
- StringMapEntry<const MCSectionELF*> &Entry = Map.GetOrCreateValue(Section);
- if (Entry.getValue()) return Entry.getValue();
+ std::pair<ELFUniqueMapTy::iterator, bool> Entry = Map.insert(
+ std::make_pair(SectionGroupPair(Section, Group), (MCSectionELF *)0));
+ if (!Entry.second) return Entry.first->second;
// Possibly refine the entry size first.
if (!EntrySize) {
@@ -261,9 +266,9 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
if (!Group.empty())
GroupSym = GetOrCreateSymbol(Group);
- MCSectionELF *Result = new (*this) MCSectionELF(Entry.getKey(), Type, Flags,
- Kind, EntrySize, GroupSym);
- Entry.setValue(Result);
+ MCSectionELF *Result = new (*this) MCSectionELF(
+ Entry.first->first.first, Type, Flags, Kind, EntrySize, GroupSym);
+ Entry.first->second = Result;
return Result;
}
@@ -274,32 +279,51 @@ const MCSectionELF *MCContext::CreateELFGroupSection() {
return Result;
}
-const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
- unsigned Characteristics,
- SectionKind Kind, int Selection,
- const MCSectionCOFF *Assoc) {
+const MCSectionCOFF *
+MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
+ SectionKind Kind, StringRef COMDATSymName,
+ int Selection, const MCSectionCOFF *Assoc) {
if (COFFUniquingMap == 0)
COFFUniquingMap = new COFFUniqueMapTy();
COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
// Do the lookup, if we have a hit, return it.
- StringMapEntry<const MCSectionCOFF*> &Entry = Map.GetOrCreateValue(Section);
- if (Entry.getValue()) return Entry.getValue();
- MCSectionCOFF *Result = new (*this) MCSectionCOFF(Entry.getKey(),
- Characteristics,
- Selection, Assoc, Kind);
+ SectionGroupPair P(Section, COMDATSymName);
+ std::pair<COFFUniqueMapTy::iterator, bool> Entry =
+ Map.insert(std::make_pair(P, (MCSectionCOFF *)0));
+ COFFUniqueMapTy::iterator Iter = Entry.first;
+ if (!Entry.second)
+ return Iter->second;
- Entry.setValue(Result);
+ const MCSymbol *COMDATSymbol = NULL;
+ if (!COMDATSymName.empty())
+ COMDATSymbol = GetOrCreateSymbol(COMDATSymName);
+
+ MCSectionCOFF *Result =
+ new (*this) MCSectionCOFF(Iter->first.first, Characteristics,
+ COMDATSymbol, Selection, Assoc, Kind);
+
+ Iter->second = Result;
return Result;
}
+const MCSectionCOFF *
+MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
+ SectionKind Kind) {
+ return getCOFFSection(Section, Characteristics, Kind, "", 0);
+}
+
const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) {
if (COFFUniquingMap == 0)
COFFUniquingMap = new COFFUniqueMapTy();
COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
- return Map.lookup(Section);
+ SectionGroupPair P(Section, "");
+ COFFUniqueMapTy::iterator Iter = Map.find(P);
+ if (Iter == Map.end())
+ return 0;
+ return Iter->second;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index ecc7aff..a0066c8 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolizer.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/TargetRegistry.h"
@@ -101,6 +102,7 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
if (!DC)
return 0;
+ DC->setCPU(CPU);
return DC;
}
@@ -143,6 +145,112 @@ public:
};
} // end anonymous namespace
+/// \brief Emits the comments that are stored in the \p DC comment stream.
+/// Each comment in the comment stream must end with a newline.
+static void emitComments(LLVMDisasmContext *DC,
+ formatted_raw_ostream &FormattedOS) {
+ // Flush the stream before taking its content.
+ DC->CommentStream.flush();
+ StringRef Comments = DC->CommentsToEmit.str();
+ // Get the default information for printing a comment.
+ const MCAsmInfo *MAI = DC->getAsmInfo();
+ const char *CommentBegin = MAI->getCommentString();
+ unsigned CommentColumn = MAI->getCommentColumn();
+ bool IsFirst = true;
+ while (!Comments.empty()) {
+ if (!IsFirst)
+ FormattedOS << '\n';
+ // Emit a line of comments.
+ FormattedOS.PadToColumn(CommentColumn);
+ size_t Position = Comments.find('\n');
+ FormattedOS << CommentBegin << ' ' << Comments.substr(0, Position);
+ // Move after the newline character.
+ Comments = Comments.substr(Position+1);
+ IsFirst = false;
+ }
+ FormattedOS.flush();
+
+ // Tell the comment stream that the vector changed underneath it.
+ DC->CommentsToEmit.clear();
+ DC->CommentStream.resync();
+}
+
+/// \brief Gets latency information for \p Inst from the itinerary
+/// scheduling model, based on \p DC information.
+/// \return The maximum expected latency over all the operands or -1
+/// if no information is available.
+static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+ const int NoInformationAvailable = -1;
+
+ // Check if we have a CPU to get the itinerary information.
+ if (DC->getCPU().empty())
+ return NoInformationAvailable;
+
+ // Get itinerary information.
+ const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+ InstrItineraryData IID = STI->getInstrItineraryForCPU(DC->getCPU());
+ // Get the scheduling class of the requested instruction.
+ const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+ unsigned SCClass = Desc.getSchedClass();
+
+ int Latency = 0;
+ for (unsigned OpIdx = 0, OpIdxEnd = Inst.getNumOperands(); OpIdx != OpIdxEnd;
+ ++OpIdx)
+ Latency = std::max(Latency, IID.getOperandCycle(SCClass, OpIdx));
+
+ return Latency;
+}
+
+/// \brief Gets latency information for \p Inst, based on \p DC information.
+/// \return The maximum expected latency over all the definitions or -1
+/// if no information is available.
+static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+ // Try to compute scheduling information.
+ const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+ const MCSchedModel *SCModel = STI->getSchedModel();
+ const int NoInformationAvailable = -1;
+
+ // Check if we have a scheduling model for instructions.
+ if (!SCModel || !SCModel->hasInstrSchedModel())
+ // Try to fall back to the itinerary model if we do not have a
+ // scheduling model.
+ return getItineraryLatency(DC, Inst);
+
+ // Get the scheduling class of the requested instruction.
+ const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+ unsigned SCClass = Desc.getSchedClass();
+ const MCSchedClassDesc *SCDesc = SCModel->getSchedClassDesc(SCClass);
+ // Resolving the variant SchedClass requires an MI to pass to
+ // SubTargetInfo::resolveSchedClass.
+ if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
+ return NoInformationAvailable;
+
+ // Compute output latency.
+ int Latency = 0;
+ for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+ DefIdx != DefEnd; ++DefIdx) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc,
+ DefIdx);
+ Latency = std::max(Latency, WLEntry->Cycles);
+ }
+
+ return Latency;
+}
+
+
+/// \brief Emits latency information in DC->CommentStream for \p Inst, based
+/// on the information available in \p DC.
+static void emitLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+ int Latency = getLatency(DC, Inst);
+
+ // Report only interesting latency.
+ if (Latency < 2)
+ return;
+
+ DC->CommentStream << "Latency: " << Latency << '\n';
+}
+
//
// LLVMDisasmInstruction() disassembles a single instruction using the
// disassembler context specified in the parameter DC. The bytes of the
@@ -167,8 +275,10 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
const MCDisassembler *DisAsm = DC->getDisAsm();
MCInstPrinter *IP = DC->getIP();
MCDisassembler::DecodeStatus S;
+ SmallVector<char, 64> InsnStr;
+ raw_svector_ostream Annotations(InsnStr);
S = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
- /*REMOVE*/ nulls(), DC->CommentStream);
+ /*REMOVE*/ nulls(), Annotations);
switch (S) {
case MCDisassembler::Fail:
case MCDisassembler::SoftFail:
@@ -176,17 +286,18 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
return 0;
case MCDisassembler::Success: {
- DC->CommentStream.flush();
- StringRef Comments = DC->CommentsToEmit.str();
+ Annotations.flush();
+ StringRef AnnotationsStr = Annotations.str();
SmallVector<char, 64> InsnStr;
raw_svector_ostream OS(InsnStr);
- IP->printInst(&Inst, OS, Comments);
- OS.flush();
+ formatted_raw_ostream FormattedOS(OS);
+ IP->printInst(&Inst, FormattedOS, AnnotationsStr);
- // Tell the comment stream that the vector changed underneath it.
- DC->CommentsToEmit.clear();
- DC->CommentStream.resync();
+ if (DC->getOptions() & LLVMDisassembler_Option_PrintLatency)
+ emitLatency(DC, Inst);
+
+ emitComments(DC, FormattedOS);
assert(OutStringSize != 0 && "Output buffer cannot be zero size");
size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
@@ -208,12 +319,14 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
MCInstPrinter *IP = DC->getIP();
IP->setUseMarkup(1);
+ DC->addOptions(LLVMDisassembler_Option_UseMarkup);
Options &= ~LLVMDisassembler_Option_UseMarkup;
}
if (Options & LLVMDisassembler_Option_PrintImmHex){
LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
MCInstPrinter *IP = DC->getIP();
IP->setPrintImmHex(1);
+ DC->addOptions(LLVMDisassembler_Option_PrintImmHex);
Options &= ~LLVMDisassembler_Option_PrintImmHex;
}
if (Options & LLVMDisassembler_Option_AsmPrinterVariant){
@@ -229,8 +342,21 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
AsmPrinterVariant, *MAI, *MII, *MRI, *STI);
if (IP) {
DC->setIP(IP);
+ DC->addOptions(LLVMDisassembler_Option_AsmPrinterVariant);
Options &= ~LLVMDisassembler_Option_AsmPrinterVariant;
}
}
+ if (Options & LLVMDisassembler_Option_SetInstrComments) {
+ LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+ MCInstPrinter *IP = DC->getIP();
+ IP->setCommentStream(DC->CommentStream);
+ DC->addOptions(LLVMDisassembler_Option_SetInstrComments);
+ Options &= ~LLVMDisassembler_Option_SetInstrComments;
+ }
+ if (Options & LLVMDisassembler_Option_PrintLatency) {
+ LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+ DC->addOptions(LLVMDisassembler_Option_PrintLatency);
+ Options &= ~LLVMDisassembler_Option_PrintLatency;
+ }
return (Options == 0);
}
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index 6eb59d0..4855af2 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -73,6 +73,10 @@ private:
llvm::OwningPtr<const llvm::MCDisassembler> DisAsm;
// The instruction printer for the target architecture.
llvm::OwningPtr<llvm::MCInstPrinter> IP;
+ // The options used to set up the disassembler.
+ uint64_t Options;
+ // The CPU string.
+ std::string CPU;
public:
// Comment stream and backing vector.
@@ -90,6 +94,7 @@ public:
MCInstPrinter *iP) : TripleName(tripleName),
DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo),
SymbolLookUp(symbolLookUp), TheTarget(theTarget),
+ Options(0),
CommentStream(CommentsToEmit) {
MAI.reset(mAI);
MRI.reset(mRI);
@@ -114,6 +119,10 @@ public:
const MCSubtargetInfo *getSubtargetInfo() const { return MSI.get(); }
MCInstPrinter *getIP() { return IP.get(); }
void setIP(MCInstPrinter *NewIP) { IP.reset(NewIP); }
+ uint64_t getOptions() const { return Options; }
+ void addOptions(uint64_t Options) { this->Options |= Options; }
+ StringRef getCPU() const { return CPU; }
+ void setCPU(const char *CPU) { this->CPU = CPU; }
};
} // namespace llvm
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 195bbfe..1e5c2e3 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -873,9 +873,7 @@ namespace {
void setSectionStart(const MCSymbol *Label) { SectionStart = Label; }
- /// EmitCompactUnwind - Emit the unwind information in a compact way. If
- /// we're successful, return 'true'. Otherwise, return 'false' and it will
- /// emit the normal CIE and FDE.
+ /// EmitCompactUnwind - Emit the unwind information in a compact way.
void EmitCompactUnwind(MCStreamer &streamer,
const MCDwarfFrameInfo &frame);
@@ -889,7 +887,7 @@ namespace {
const MCSymbol &cieStart,
const MCDwarfFrameInfo &frame);
void EmitCFIInstructions(MCStreamer &streamer,
- const std::vector<MCCFIInstruction> &Instrs,
+ ArrayRef<MCCFIInstruction> Instrs,
MCSymbol *BaseLabel);
void EmitCFIInstruction(MCStreamer &Streamer,
const MCCFIInstruction &Instr);
@@ -961,6 +959,10 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
Streamer.EmitULEB128IntValue(Reg2);
return;
}
+ case MCCFIInstruction::OpWindowSave: {
+ Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
+ return;
+ }
case MCCFIInstruction::OpUndefined: {
unsigned Reg = Instr.getRegister();
if (VerboseAsm) {
@@ -1091,7 +1093,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
/// EmitFrameMoves - Emit frame instructions to describe the layout of the
/// frame.
void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
- const std::vector<MCCFIInstruction> &Instrs,
+ ArrayRef<MCCFIInstruction> Instrs,
MCSymbol *BaseLabel) {
for (unsigned i = 0, N = Instrs.size(); i < N; ++i) {
const MCCFIInstruction &Instr = Instrs[i];
@@ -1113,9 +1115,7 @@ void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
}
}
-/// EmitCompactUnwind - Emit the unwind information in a compact way. If we're
-/// successful, return 'true'. Otherwise, return 'false' and it will emit the
-/// normal CIE and FDE.
+/// EmitCompactUnwind - Emit the unwind information in a compact way.
void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
const MCDwarfFrameInfo &Frame) {
MCContext &Context = Streamer.getContext();
@@ -1419,9 +1419,10 @@ namespace llvm {
};
}
-void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer,
- bool UsingCFI,
- bool IsEH) {
+void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB,
+ bool UsingCFI, bool IsEH) {
+ Streamer.generateCompactUnwindEncodings(MAB);
+
MCContext &Context = Streamer.getContext();
const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
FrameEmitterImpl Emitter(UsingCFI, IsEH);
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index 560cdbc..ebb189e 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -36,8 +36,8 @@ unsigned MCELF::GetBinding(const MCSymbolData &SD) {
void MCELF::SetType(MCSymbolData &SD, unsigned Type) {
assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
- Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
- Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+ Type == ELF::STT_COMMON || Type == ELF::STT_TLS ||
+ Type == ELF::STT_GNU_IFUNC);
uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STT_Shift);
SD.setFlags(OtherFlags | (Type << ELF_STT_Shift));
@@ -47,8 +47,7 @@ unsigned MCELF::GetType(const MCSymbolData &SD) {
uint32_t Type = (SD.getFlags() & (0xf << ELF_STT_Shift)) >> ELF_STT_Shift;
assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
- Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
- Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+ Type == ELF::STT_COMMON || Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
return Type;
}
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index ec7397d..0c39e4a 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -42,9 +42,9 @@ const MCSymbol *MCELFObjectTargetWriter::undefinedExplicitRelSym(const MCValue &
// ELF doesn't require relocations to be in any order. We sort by the r_offset,
// just to match gnu as for easier comparison. The use type and index is an
// arbitrary way of making the sort deterministic.
-static int cmpRel(const void *AP, const void *BP) {
- const ELFRelocationEntry &A = *(const ELFRelocationEntry *)AP;
- const ELFRelocationEntry &B = *(const ELFRelocationEntry *)BP;
+static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) {
+ const ELFRelocationEntry &A = *AP;
+ const ELFRelocationEntry &B = *BP;
if (A.r_offset != B.r_offset)
return B.r_offset - A.r_offset;
if (B.Type != A.Type)
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 6e5ff50..e806cb9 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELF.h"
@@ -96,6 +97,9 @@ void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
}
void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+ // Let the target do whatever target specific stuff it needs to do.
+ getAssembler().getBackend().handleAssemblerFlag(Flag);
+ // Do any generic stuff we need to do.
switch (Flag) {
case MCAF_SyntaxUnified: return; // no-op here.
case MCAF_Code16: return; // Change parsing mode; no-op here.
@@ -148,8 +152,8 @@ static unsigned CombineSymbolTypes(unsigned T1, unsigned T2) {
return T2;
}
-void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
- MCSymbolAttr Attribute) {
+bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+ MCSymbolAttr Attribute) {
// Indirect symbols are handled differently, to match how 'as' handles
// them. This makes writing matching .o files easier.
if (Attribute == MCSA_IndirectSymbol) {
@@ -159,7 +163,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
ISD.Symbol = Symbol;
ISD.SectionData = getCurrentSectionData();
getAssembler().getIndirectSymbols().push_back(ISD);
- return;
+ return true;
}
// Adding a symbol attribute always introduces the symbol, note that an
@@ -182,7 +186,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_WeakDefAutoPrivate:
case MCSA_Invalid:
case MCSA_IndirectSymbol:
- llvm_unreachable("Invalid symbol attribute for ELF!");
+ return false;
case MCSA_NoDeadStrip:
case MCSA_ELF_TypeGnuUniqueObject:
@@ -251,6 +255,8 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
MCELF::SetVisibility(SD, ELF::STV_INTERNAL);
break;
}
+
+ return true;
}
void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -270,7 +276,8 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
ELF::SHF_WRITE |
ELF::SHF_ALLOC,
SectionKind::getBSS());
- Symbol->setSection(*Section);
+
+ AssignSection(Symbol, Section);
struct LocalCommon L = {&SD, Size, ByteAlignment};
LocalCommons.push_back(L);
@@ -313,20 +320,29 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
ValueSize, MaxBytesToEmit);
}
-
-// Add a symbol for the file name of this module. This is the second
-// entry in the module's symbol table (the first being the null symbol).
+// Add a symbol for the file name of this module. These symbols start after
+// the null symbol and don't count as normal symbols, i.e. a non-STT_FILE
+// symbol with the same name may appear.
void MCELFStreamer::EmitFileDirective(StringRef Filename) {
- MCSymbol *Symbol = getAssembler().getContext().GetOrCreateSymbol(Filename);
- Symbol->setSection(*getCurrentSection().first);
- Symbol->setAbsolute();
-
- MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-
- SD.setFlags(ELF_STT_File | ELF_STB_Local | ELF_STV_Default);
+ getAssembler().addFileName(Filename);
+}
+
+void MCELFStreamer::EmitIdent(StringRef IdentString) {
+ const MCSection *Comment = getAssembler().getContext().getELFSection(
+ ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS,
+ SectionKind::getReadOnly(), 1, "");
+ PushSection();
+ SwitchSection(Comment);
+ if (!SeenIdent) {
+ EmitIntValue(0, 1);
+ SeenIdent = true;
+ }
+ EmitBytes(IdentString);
+ EmitIntValue(0, 1);
+ PopSection();
}
-void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
+void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
switch (expr->getKind()) {
case MCExpr::Target:
cast<MCTargetExpr>(expr)->fixELFSymbolsInTLSFixups(getAssembler());
@@ -525,9 +541,7 @@ void MCELFStreamer::EmitBundleUnlock() {
SD->setBundleLockState(MCSectionData::NotBundleLocked);
}
-void MCELFStreamer::FinishImpl() {
- EmitFrames(true);
-
+void MCELFStreamer::Flush() {
for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(),
e = LocalCommons.end();
i != e; ++i) {
@@ -548,17 +562,23 @@ void MCELFStreamer::FinishImpl() {
SectData.setAlignment(ByteAlignment);
}
- this->MCObjectStreamer::FinishImpl();
+ LocalCommons.clear();
}
-void MCELFStreamer::EmitTCEntry(const MCSymbol &S) {
- // Creates a R_PPC64_TOC relocation
- MCObjectStreamer::EmitSymbolValue(&S, 8);
+
+void MCELFStreamer::FinishImpl() {
+ EmitFrames(NULL, true);
+
+ Flush();
+
+ this->MCObjectStreamer::FinishImpl();
}
-MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_ostream &OS, MCCodeEmitter *CE,
- bool RelaxAll, bool NoExecStack) {
- MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+MCStreamer *llvm::createELFStreamer(MCContext &Context,
+ MCTargetStreamer *Streamer,
+ MCAsmBackend &MAB, raw_ostream &OS,
+ MCCodeEmitter *CE, bool RelaxAll,
+ bool NoExecStack) {
+ MCELFStreamer *S = new MCELFStreamer(Context, Streamer, MAB, OS, CE);
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
if (NoExecStack)
diff --git a/lib/MC/MCExternalSymbolizer.cpp b/lib/MC/MCExternalSymbolizer.cpp
index 47ef6c4..ca368b2 100644
--- a/lib/MC/MCExternalSymbolizer.cpp
+++ b/lib/MC/MCExternalSymbolizer.cpp
@@ -63,6 +63,8 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
}
if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
cStream << "symbol stub for: " << ReferenceName;
+ else if(ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_Message)
+ cStream << "Objc message: " << ReferenceName;
if (!Name && !IsBranch)
return false;
}
@@ -132,6 +134,8 @@ bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
// literal pool's entry if the referenced address is that of a symbol. Or it
// will return a pointer to a literal 'C' string if the referenced address of
// the literal pool's entry is an address into a section with C string literals.
+// Or if the reference is to an Objective-C data structure, it will return a
+// specific reference type for it and a string.
void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
int64_t Value,
uint64_t Address) {
@@ -139,9 +143,26 @@ void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
const char *ReferenceName;
(void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
- if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
- ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
- cStream << "literal pool for: " << ReferenceName;
+ if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+ cStream << "literal pool symbol address: " << ReferenceName;
+ else if(ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+ cStream << "literal pool for: \"" << ReferenceName << "\"";
+ else if(ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+ cStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+ else if(ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message)
+ cStream << "Objc message: " << ReferenceName;
+ else if(ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+ cStream << "Objc message ref: " << ReferenceName;
+ else if(ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+ cStream << "Objc selector ref: " << ReferenceName;
+ else if(ReferenceType ==
+ LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+ cStream << "Objc class ref: " << ReferenceName;
}
}
diff --git a/lib/MC/MCFunction.cpp b/lib/MC/MCFunction.cpp
index 2665d3e..767e1e0 100644
--- a/lib/MC/MCFunction.cpp
+++ b/lib/MC/MCFunction.cpp
@@ -9,15 +9,15 @@
#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCAtom.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCModule.h"
#include <algorithm>
using namespace llvm;
// MCFunction
-MCFunction::MCFunction(StringRef Name)
- : Name(Name)
+MCFunction::MCFunction(StringRef Name, MCModule *Parent)
+ : Name(Name), ParentModule(Parent)
{}
MCFunction::~MCFunction() {
@@ -26,18 +26,32 @@ MCFunction::~MCFunction() {
}
MCBasicBlock &MCFunction::createBlock(const MCTextAtom &TA) {
- Blocks.push_back(new MCBasicBlock(TA, this));
- return *Blocks.back();
+ MCBasicBlock *MCBB = new MCBasicBlock(TA, this);
+ Blocks.push_back(MCBB);
+ return *MCBB;
+}
+
+MCBasicBlock *MCFunction::find(uint64_t StartAddr) {
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ if ((*I)->getInsts()->getBeginAddr() == StartAddr)
+ return *I;
+ return 0;
+}
+
+const MCBasicBlock *MCFunction::find(uint64_t StartAddr) const {
+ return const_cast<MCFunction *>(this)->find(StartAddr);
}
// MCBasicBlock
MCBasicBlock::MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent)
- : Insts(&Insts), Parent(Parent)
-{}
+ : Insts(&Insts), Parent(Parent) {
+ getParent()->getParent()->trackBBForAtom(&Insts, this);
+}
void MCBasicBlock::addSuccessor(const MCBasicBlock *MCBB) {
- Successors.push_back(MCBB);
+ if (!isSuccessor(MCBB))
+ Successors.push_back(MCBB);
}
bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
@@ -46,10 +60,22 @@ bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
}
void MCBasicBlock::addPredecessor(const MCBasicBlock *MCBB) {
- Predecessors.push_back(MCBB);
+ if (!isPredecessor(MCBB))
+ Predecessors.push_back(MCBB);
}
bool MCBasicBlock::isPredecessor(const MCBasicBlock *MCBB) const {
return std::find(Predecessors.begin(), Predecessors.end(),
MCBB) != Predecessors.end();
}
+
+void MCBasicBlock::splitBasicBlock(MCBasicBlock *SplitBB) {
+ assert(Insts->getEndAddr() + 1 == SplitBB->Insts->getBeginAddr() &&
+ "Splitting unrelated basic blocks!");
+ SplitBB->addPredecessor(this);
+ assert(SplitBB->Successors.empty() &&
+ "Split basic block shouldn't already have successors!");
+ SplitBB->Successors = Successors;
+ Successors.clear();
+ addSuccessor(SplitBB);
+}
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 6a452c8..ba71245 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -31,9 +31,13 @@ void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
if (!Annot.empty()) {
- if (CommentStream)
+ if (CommentStream) {
(*CommentStream) << Annot;
- else
+ // By definition (see MCInstPrinter.h), CommentStream must end with
+ // a newline after each comment.
+ if (Annot.back() != '\n')
+ (*CommentStream) << '\n';
+ } else
OS << " " << MAI.getCommentString() << " " << Annot;
}
}
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 0729b7a..2924dcd 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -1,3 +1,4 @@
+//===-- MCMachOStreamer.cpp - MachO Streamer ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -36,7 +37,7 @@ private:
public:
MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
MCCodeEmitter *Emitter)
- : MCObjectStreamer(SK_MachOStreamer, Context, MAB, OS, Emitter) {}
+ : MCObjectStreamer(Context, 0, MAB, OS, Emitter) {}
/// @name MCStreamer Interface
/// @{
@@ -51,7 +52,7 @@ public:
virtual void EmitLinkerOptions(ArrayRef<std::string> Options);
virtual void EmitDataRegion(MCDataRegionType Kind);
virtual void EmitThumbFunc(MCSymbol *Func);
- virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+ virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment);
@@ -81,16 +82,14 @@ public:
// FIXME: Just ignore the .file; it isn't important enough to fail the
// entire assembly.
- //report_fatal_error("unsupported directive: '.file'");
+ // report_fatal_error("unsupported directive: '.file'");
}
- virtual void FinishImpl();
-
- /// @}
-
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_MachOStreamer;
+ virtual void EmitIdent(StringRef IdentString) {
+ llvm_unreachable("macho doesn't support this directive");
}
+
+ virtual void FinishImpl();
};
} // end anonymous namespace.
@@ -122,7 +121,7 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
// isSymbolLinkerVisible uses the section.
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
// We have to create a new fragment if this is an atom defining symbol,
// fragments cannot span atoms.
if (getAssembler().isSymbolLinkerVisible(*Symbol))
@@ -217,7 +216,7 @@ void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
SD.setFlags(SD.getFlags() | SF_ThumbFunc);
}
-void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) {
// Indirect symbols are handled differently, to match how 'as' handles
// them. This makes writing matching .o files easier.
@@ -228,7 +227,7 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
ISD.Symbol = Symbol;
ISD.SectionData = getCurrentSectionData();
getAssembler().getIndirectSymbols().push_back(ISD);
- return;
+ return true;
}
// Adding a symbol attribute always introduces the symbol, note that an
@@ -257,7 +256,7 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_Protected:
case MCSA_Weak:
case MCSA_Local:
- llvm_unreachable("Invalid symbol attribute for Mach-O!");
+ return false;
case MCSA_Global:
SD.setExternal(true);
@@ -309,6 +308,8 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
SD.setFlags(SD.getFlags() | SF_WeakDefinition | SF_WeakReference);
break;
}
+
+ return true;
}
void MCMachOStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -324,6 +325,8 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
// FIXME: Darwin 'as' does appear to allow redef of a .comm by itself.
assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+ AssignSection(Symbol, NULL);
+
MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
SD.setExternal(true);
SD.setCommon(Size, ByteAlignment);
@@ -360,7 +363,7 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
MCFragment *F = new MCFillFragment(0, 0, Size, &SectData);
SD.setFragment(F);
- Symbol->setSection(*Section);
+ AssignSection(Symbol, Section);
// Update the maximum alignment on the zero fill section if necessary.
if (ByteAlignment > SectData.getAlignment())
@@ -393,7 +396,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
}
void MCMachOStreamer::FinishImpl() {
- EmitFrames(true);
+ EmitFrames(&getAssembler().getBackend(), true);
// We have to set the fragment atom associations so we can relax properly for
// Mach-O.
diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCModule.cpp
index 5890b4b..7e9e18a 100644
--- a/lib/MC/MCModule.cpp
+++ b/lib/MC/MCModule.cpp
@@ -18,6 +18,10 @@ static bool AtomComp(const MCAtom *L, uint64_t Addr) {
return L->getEndAddr() < Addr;
}
+static bool AtomCompInv(uint64_t Addr, const MCAtom *R) {
+ return Addr < R->getEndAddr();
+}
+
void MCModule::map(MCAtom *NewAtom) {
uint64_t Begin = NewAtom->Begin;
@@ -54,9 +58,13 @@ void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
assert(*I == Atom && "Previous atom mapping was invalid!");
Atoms.erase(I);
+ // FIXME: special case NewBegin == Atom->Begin
+
// Insert the new mapping.
AtomListTy::iterator NewI = std::lower_bound(atom_begin(), atom_end(),
NewBegin, AtomComp);
+ assert((NewI == atom_end() || (*NewI)->getBeginAddr() > Atom->End)
+ && "Offset range already occupied!");
Atoms.insert(NewI, Atom);
// Update the atom internal bounds.
@@ -73,18 +81,55 @@ const MCAtom *MCModule::findAtomContaining(uint64_t Addr) const {
}
MCAtom *MCModule::findAtomContaining(uint64_t Addr) {
- AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
- Addr, AtomComp);
- if (I != atom_end() && (*I)->getBeginAddr() <= Addr)
+ return const_cast<MCAtom*>(
+ const_cast<const MCModule *>(this)->findAtomContaining(Addr));
+}
+
+const MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) const {
+ AtomListTy::const_iterator I = std::upper_bound(atom_begin(), atom_end(),
+ Addr, AtomCompInv);
+ if (I != atom_end())
return *I;
return 0;
}
-MCFunction *MCModule::createFunction(const StringRef &Name) {
- Functions.push_back(new MCFunction(Name));
+MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) {
+ return const_cast<MCAtom*>(
+ const_cast<const MCModule *>(this)->findFirstAtomAfter(Addr));
+}
+
+MCFunction *MCModule::createFunction(StringRef Name) {
+ Functions.push_back(new MCFunction(Name, this));
return Functions.back();
}
+static bool CompBBToAtom(MCBasicBlock *BB, const MCTextAtom *Atom) {
+ return BB->getInsts() < Atom;
+}
+
+void MCModule::splitBasicBlocksForAtom(const MCTextAtom *TA,
+ const MCTextAtom *NewTA) {
+ BBsByAtomTy::iterator
+ I = std::lower_bound(BBsByAtom.begin(), BBsByAtom.end(),
+ TA, CompBBToAtom);
+ for (; I != BBsByAtom.end() && (*I)->getInsts() == TA; ++I) {
+ MCBasicBlock *BB = *I;
+ MCBasicBlock *NewBB = &BB->getParent()->createBlock(*NewTA);
+ BB->splitBasicBlock(NewBB);
+ }
+}
+
+void MCModule::trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BB) {
+ assert(Atom == BB->getInsts() && "Text atom doesn't back the basic block!");
+ BBsByAtomTy::iterator I = std::lower_bound(BBsByAtom.begin(),
+ BBsByAtom.end(),
+ Atom, CompBBToAtom);
+ for (; I != BBsByAtom.end() && (*I)->getInsts() == Atom; ++I)
+ if (*I == BB)
+ return;
+ BBsByAtom.insert(I, BB);
+}
+
MCModule::~MCModule() {
for (AtomListTy::iterator AI = atom_begin(),
AE = atom_end();
diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCModuleYAML.cpp
new file mode 100644
index 0000000..e2de578
--- /dev/null
+++ b/lib/MC/MCModuleYAML.cpp
@@ -0,0 +1,461 @@
+//===- MCModuleYAML.cpp - MCModule YAMLIO implementation ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of MCModule.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCModuleYAML.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Object/YAML.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+
+namespace llvm {
+
+namespace {
+
+// This class is used to map opcode and register names to enum values.
+//
+// There are at least 3 obvious ways to do this:
+// 1- Generate an MII/MRI method using a tablegen StringMatcher
+// 2- Write an MII/MRI method using std::lower_bound and the assumption that
+// the enums are sorted (starting at a fixed value).
+// 3- Do the matching manually as is done here.
+//
+// Why 3?
+// 1- A StringMatcher function for thousands of entries would incur
+// a non-negligible binary size overhead.
+// 2- The lower_bound comparators would be somewhat involved and aren't
+// obviously reusable (see LessRecordRegister in llvm/TableGen/Record.h)
+// 3- This isn't actually something useful outside tests (but the same argument
+// can be made against having {MII,MRI}::getName).
+//
+// If this becomes useful outside this specific situation, feel free to do
+// the Right Thing (tm) and move the functionality to MII/MRI.
+//
+class InstrRegInfoHolder {
+ typedef StringMap<unsigned, BumpPtrAllocator> EnumValByNameTy;
+ EnumValByNameTy InstEnumValueByName;
+ EnumValByNameTy RegEnumValueByName;
+
+public:
+ const MCInstrInfo &MII;
+ const MCRegisterInfo &MRI;
+ InstrRegInfoHolder(const MCInstrInfo &MII, const MCRegisterInfo &MRI)
+ : InstEnumValueByName(NextPowerOf2(MII.getNumOpcodes())),
+ RegEnumValueByName(NextPowerOf2(MRI.getNumRegs())), MII(MII), MRI(MRI) {
+ for (int i = 0, e = MII.getNumOpcodes(); i != e; ++i)
+ InstEnumValueByName[MII.getName(i)] = i;
+ for (int i = 0, e = MRI.getNumRegs(); i != e; ++i)
+ RegEnumValueByName[MRI.getName(i)] = i;
+ }
+
+ bool matchRegister(StringRef Name, unsigned &Reg) {
+ EnumValByNameTy::const_iterator It = RegEnumValueByName.find(Name);
+ if (It == RegEnumValueByName.end())
+ return false;
+ Reg = It->getValue();
+ return true;
+ }
+ bool matchOpcode(StringRef Name, unsigned &Opc) {
+ EnumValByNameTy::const_iterator It = InstEnumValueByName.find(Name);
+ if (It == InstEnumValueByName.end())
+ return false;
+ Opc = It->getValue();
+ return true;
+ }
+};
+
+} // end unnamed namespace
+
+namespace MCModuleYAML {
+
+LLVM_YAML_STRONG_TYPEDEF(unsigned, OpcodeEnum)
+
+struct Operand {
+ MCOperand MCOp;
+};
+
+struct Inst {
+ OpcodeEnum Opcode;
+ std::vector<Operand> Operands;
+ uint64_t Size;
+};
+
+struct Atom {
+ MCAtom::AtomKind Type;
+ yaml::Hex64 StartAddress;
+ uint64_t Size;
+
+ std::vector<Inst> Insts;
+ object::yaml::BinaryRef Data;
+};
+
+struct BasicBlock {
+ yaml::Hex64 Address;
+ std::vector<yaml::Hex64> Preds;
+ std::vector<yaml::Hex64> Succs;
+};
+
+struct Function {
+ StringRef Name;
+ std::vector<BasicBlock> BasicBlocks;
+};
+
+struct Module {
+ std::vector<Atom> Atoms;
+ std::vector<Function> Functions;
+};
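+
+// Serialized through the traits below, a module looks roughly like the
+// following sketch. The opcode mnemonic ("RETQ") and function name ("main")
+// are purely illustrative and target-specific; operands are written as
+// R<regname> or I<immediate>.
+//
+//   Atoms:
+//     - StartAddress: 0x0
+//       Size:         1
+//       Type:         Text
+//       Content:
+//         - Inst: RETQ
+//           Size: 1
+//           Ops:  [ ]
+//   Functions:
+//     - Name:        main
+//       BasicBlocks:
+//         - Address: 0x0
+//           Preds:   [ ]
+//           Succs:   [ ]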
+
+} // end namespace MCModuleYAML
+} // end namespace llvm
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::Hex64)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::MCModuleYAML::Operand)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Inst)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Atom)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::BasicBlock)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Function)
+
+namespace llvm {
+
+namespace yaml {
+
+template <> struct ScalarEnumerationTraits<MCAtom::AtomKind> {
+ static void enumeration(IO &IO, MCAtom::AtomKind &Kind);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Atom> {
+ static void mapping(IO &IO, MCModuleYAML::Atom &A);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Inst> {
+ static void mapping(IO &IO, MCModuleYAML::Inst &I);
+};
+
+template <> struct MappingTraits<MCModuleYAML::BasicBlock> {
+ static void mapping(IO &IO, MCModuleYAML::BasicBlock &BB);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Function> {
+ static void mapping(IO &IO, MCModuleYAML::Function &Fn);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Module> {
+ static void mapping(IO &IO, MCModuleYAML::Module &M);
+};
+
+template <> struct ScalarTraits<MCModuleYAML::Operand> {
+ static void output(const MCModuleYAML::Operand &, void *,
+ llvm::raw_ostream &);
+ static StringRef input(StringRef, void *, MCModuleYAML::Operand &);
+};
+
+template <> struct ScalarTraits<MCModuleYAML::OpcodeEnum> {
+ static void output(const MCModuleYAML::OpcodeEnum &, void *,
+ llvm::raw_ostream &);
+ static StringRef input(StringRef, void *, MCModuleYAML::OpcodeEnum &);
+};
+
+void ScalarEnumerationTraits<MCAtom::AtomKind>::enumeration(
+ IO &IO, MCAtom::AtomKind &Value) {
+ IO.enumCase(Value, "Text", MCAtom::TextAtom);
+ IO.enumCase(Value, "Data", MCAtom::DataAtom);
+}
+
+void MappingTraits<MCModuleYAML::Atom>::mapping(IO &IO, MCModuleYAML::Atom &A) {
+ IO.mapRequired("StartAddress", A.StartAddress);
+ IO.mapRequired("Size", A.Size);
+ IO.mapRequired("Type", A.Type);
+ if (A.Type == MCAtom::TextAtom)
+ IO.mapRequired("Content", A.Insts);
+ else if (A.Type == MCAtom::DataAtom)
+ IO.mapRequired("Content", A.Data);
+}
+
+void MappingTraits<MCModuleYAML::Inst>::mapping(IO &IO, MCModuleYAML::Inst &I) {
+ IO.mapRequired("Inst", I.Opcode);
+ IO.mapRequired("Size", I.Size);
+ IO.mapRequired("Ops", I.Operands);
+}
+
+void
+MappingTraits<MCModuleYAML::BasicBlock>::mapping(IO &IO,
+ MCModuleYAML::BasicBlock &BB) {
+ IO.mapRequired("Address", BB.Address);
+ IO.mapRequired("Preds", BB.Preds);
+ IO.mapRequired("Succs", BB.Succs);
+}
+
+void MappingTraits<MCModuleYAML::Function>::mapping(IO &IO,
+ MCModuleYAML::Function &F) {
+ IO.mapRequired("Name", F.Name);
+ IO.mapRequired("BasicBlocks", F.BasicBlocks);
+}
+
+void MappingTraits<MCModuleYAML::Module>::mapping(IO &IO,
+ MCModuleYAML::Module &M) {
+ IO.mapRequired("Atoms", M.Atoms);
+ IO.mapOptional("Functions", M.Functions);
+}
+
+void
+ScalarTraits<MCModuleYAML::Operand>::output(const MCModuleYAML::Operand &Val,
+ void *Ctx, raw_ostream &Out) {
+ InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+
+ // FIXME: Doesn't support FPImm and expr/inst, but do these make sense?
+ if (Val.MCOp.isImm())
+ Out << "I" << Val.MCOp.getImm();
+ else if (Val.MCOp.isReg())
+ Out << "R" << IRI->MRI.getName(Val.MCOp.getReg());
+ else
+ llvm_unreachable("Trying to output invalid MCOperand!");
+}
+
+StringRef
+ScalarTraits<MCModuleYAML::Operand>::input(StringRef Scalar, void *Ctx,
+ MCModuleYAML::Operand &Val) {
+ InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+ char Type = 0;
+ if (Scalar.size() >= 1)
+ Type = Scalar.front();
+ if (Type != 'R' && Type != 'I')
+ return "Operand must start with 'R' (register) or 'I' (immediate).";
+ if (Type == 'R') {
+ unsigned Reg;
+ if (!IRI->matchRegister(Scalar.substr(1), Reg))
+ return "Invalid register name.";
+ Val.MCOp = MCOperand::CreateReg(Reg);
+ } else if (Type == 'I') {
+ int64_t RIVal;
+ if (Scalar.substr(1).getAsInteger(10, RIVal))
+ return "Invalid immediate value.";
+ Val.MCOp = MCOperand::CreateImm(RIVal);
+ } else {
+ Val.MCOp = MCOperand();
+ }
+ return StringRef();
+}
+
+void ScalarTraits<MCModuleYAML::OpcodeEnum>::output(
+ const MCModuleYAML::OpcodeEnum &Val, void *Ctx, raw_ostream &Out) {
+ InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+ Out << IRI->MII.getName(Val);
+}
+
+StringRef
+ScalarTraits<MCModuleYAML::OpcodeEnum>::input(StringRef Scalar, void *Ctx,
+ MCModuleYAML::OpcodeEnum &Val) {
+ InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+ unsigned Opc;
+ if (!IRI->matchOpcode(Scalar, Opc))
+ return "Invalid instruction opcode.";
+ Val = Opc;
+ return "";
+}
+
+} // end namespace yaml
+
+namespace {
+
+class MCModule2YAML {
+ const MCModule &MCM;
+ MCModuleYAML::Module YAMLModule;
+ void dumpAtom(const MCAtom *MCA);
+ void dumpFunction(const MCFunction *MCF);
+ void dumpBasicBlock(const MCBasicBlock *MCBB);
+
+public:
+ MCModule2YAML(const MCModule &MCM);
+ MCModuleYAML::Module &getYAMLModule();
+};
+
+class YAML2MCModule {
+ MCModule &MCM;
+
+public:
+ YAML2MCModule(MCModule &MCM);
+ StringRef parse(const MCModuleYAML::Module &YAMLModule);
+};
+
+} // end unnamed namespace
+
+MCModule2YAML::MCModule2YAML(const MCModule &MCM) : MCM(MCM), YAMLModule() {
+ for (MCModule::const_atom_iterator AI = MCM.atom_begin(), AE = MCM.atom_end();
+ AI != AE; ++AI)
+ dumpAtom(*AI);
+ for (MCModule::const_func_iterator FI = MCM.func_begin(), FE = MCM.func_end();
+ FI != FE; ++FI)
+ dumpFunction(*FI);
+}
+
+void MCModule2YAML::dumpAtom(const MCAtom *MCA) {
+ YAMLModule.Atoms.resize(YAMLModule.Atoms.size() + 1);
+ MCModuleYAML::Atom &A = YAMLModule.Atoms.back();
+ A.Type = MCA->getKind();
+ A.StartAddress = MCA->getBeginAddr();
+ A.Size = MCA->getEndAddr() - MCA->getBeginAddr() + 1;
+ if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(MCA)) {
+ const size_t InstCount = TA->size();
+ A.Insts.resize(InstCount);
+ for (size_t i = 0; i != InstCount; ++i) {
+ const MCDecodedInst &MCDI = TA->at(i);
+ A.Insts[i].Opcode = MCDI.Inst.getOpcode();
+ A.Insts[i].Size = MCDI.Size;
+ const unsigned OpCount = MCDI.Inst.getNumOperands();
+ A.Insts[i].Operands.resize(OpCount);
+ for (unsigned oi = 0; oi != OpCount; ++oi)
+ A.Insts[i].Operands[oi].MCOp = MCDI.Inst.getOperand(oi);
+ }
+ } else if (const MCDataAtom *DA = dyn_cast<MCDataAtom>(MCA)) {
+ A.Data = DA->getData();
+ } else {
+ llvm_unreachable("Unknown atom type.");
+ }
+}
+
+void MCModule2YAML::dumpFunction(const MCFunction *MCF) {
+ YAMLModule.Functions.resize(YAMLModule.Functions.size() + 1);
+ MCModuleYAML::Function &F = YAMLModule.Functions.back();
+ F.Name = MCF->getName();
+ for (MCFunction::const_iterator BBI = MCF->begin(), BBE = MCF->end();
+ BBI != BBE; ++BBI) {
+ const MCBasicBlock *MCBB = *BBI;
+ F.BasicBlocks.resize(F.BasicBlocks.size() + 1);
+ MCModuleYAML::BasicBlock &BB = F.BasicBlocks.back();
+ BB.Address = MCBB->getInsts()->getBeginAddr();
+ for (MCBasicBlock::pred_const_iterator PI = MCBB->pred_begin(),
+ PE = MCBB->pred_end();
+ PI != PE; ++PI)
+ BB.Preds.push_back((*PI)->getInsts()->getBeginAddr());
+ for (MCBasicBlock::succ_const_iterator SI = MCBB->succ_begin(),
+ SE = MCBB->succ_end();
+ SI != SE; ++SI)
+ BB.Succs.push_back((*SI)->getInsts()->getBeginAddr());
+ }
+}
+
+MCModuleYAML::Module &MCModule2YAML::getYAMLModule() { return YAMLModule; }
+
+YAML2MCModule::YAML2MCModule(MCModule &MCM) : MCM(MCM) {}
+
+StringRef YAML2MCModule::parse(const MCModuleYAML::Module &YAMLModule) {
+ typedef std::vector<MCModuleYAML::Atom>::const_iterator AtomIt;
+ typedef std::vector<MCModuleYAML::Inst>::const_iterator InstIt;
+ typedef std::vector<MCModuleYAML::Operand>::const_iterator OpIt;
+
+ typedef DenseMap<uint64_t, MCTextAtom *> AddrToTextAtomTy;
+ AddrToTextAtomTy TAByAddr;
+
+ for (AtomIt AI = YAMLModule.Atoms.begin(), AE = YAMLModule.Atoms.end();
+ AI != AE; ++AI) {
+ uint64_t StartAddress = AI->StartAddress;
+ if (AI->Size == 0)
+ return "Atoms can't be empty!";
+ uint64_t EndAddress = StartAddress + AI->Size - 1;
+ switch (AI->Type) {
+ case MCAtom::TextAtom: {
+ MCTextAtom *TA = MCM.createTextAtom(StartAddress, EndAddress);
+ TAByAddr[StartAddress] = TA;
+ for (InstIt II = AI->Insts.begin(), IE = AI->Insts.end(); II != IE;
+ ++II) {
+ MCInst MI;
+ MI.setOpcode(II->Opcode);
+ for (OpIt OI = II->Operands.begin(), OE = II->Operands.end(); OI != OE;
+ ++OI)
+ MI.addOperand(OI->MCOp);
+ TA->addInst(MI, II->Size);
+ }
+ break;
+ }
+ case MCAtom::DataAtom: {
+ MCDataAtom *DA = MCM.createDataAtom(StartAddress, EndAddress);
+ SmallVector<char, 64> Data;
+ raw_svector_ostream OS(Data);
+ AI->Data.writeAsBinary(OS);
+ OS.flush();
+ for (size_t i = 0, e = Data.size(); i != e; ++i)
+ DA->addData((uint8_t)Data[i]);
+ break;
+ }
+ }
+ }
+
+ typedef std::vector<MCModuleYAML::Function>::const_iterator FuncIt;
+ typedef std::vector<MCModuleYAML::BasicBlock>::const_iterator BBIt;
+ typedef std::vector<yaml::Hex64>::const_iterator AddrIt;
+ for (FuncIt FI = YAMLModule.Functions.begin(),
+ FE = YAMLModule.Functions.end();
+ FI != FE; ++FI) {
+ MCFunction *MCFN = MCM.createFunction(FI->Name);
+ for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
+ BBI != BBE; ++BBI) {
+ AddrToTextAtomTy::const_iterator It = TAByAddr.find(BBI->Address);
+ if (It == TAByAddr.end())
+ return "Basic block start address doesn't match any text atom!";
+ MCFN->createBlock(*It->second);
+ }
+ for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
+ BBI != BBE; ++BBI) {
+ MCBasicBlock *MCBB = MCFN->find(BBI->Address);
+ if (!MCBB)
+ return "Couldn't find matching basic block in function.";
+ for (AddrIt PI = BBI->Preds.begin(), PE = BBI->Preds.end(); PI != PE;
+ ++PI) {
+ MCBasicBlock *Pred = MCFN->find(*PI);
+ if (!Pred)
+ return "Couldn't find predecessor basic block.";
+ MCBB->addPredecessor(Pred);
+ }
+ for (AddrIt SI = BBI->Succs.begin(), SE = BBI->Succs.end(); SI != SE;
+ ++SI) {
+ MCBasicBlock *Succ = MCFN->find(*SI);
+ if (!Succ)
+          return "Couldn't find successor basic block.";
+ MCBB->addSuccessor(Succ);
+ }
+ }
+ }
+ return "";
+}
+
+StringRef mcmodule2yaml(raw_ostream &OS, const MCModule &MCM,
+ const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
+ MCModule2YAML Dumper(MCM);
+ InstrRegInfoHolder IRI(MII, MRI);
+ yaml::Output YOut(OS, (void *)&IRI);
+ YOut << Dumper.getYAMLModule();
+ return "";
+}
+
+StringRef yaml2mcmodule(OwningPtr<MCModule> &MCM, StringRef YamlContent,
+ const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
+ MCM.reset(new MCModule);
+ YAML2MCModule Parser(*MCM);
+ MCModuleYAML::Module YAMLModule;
+ InstrRegInfoHolder IRI(MII, MRI);
+ yaml::Input YIn(YamlContent, (void *)&IRI);
+ YIn >> YAMLModule;
+ if (error_code ec = YIn.error())
+ return ec.message();
+ StringRef err = Parser.parse(YAMLModule);
+ if (!err.empty())
+ return err;
+ return "";
+}
+
+} // end namespace llvm
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 530c646..9b9c4aa 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -19,7 +19,7 @@ namespace {
class MCNullStreamer : public MCStreamer {
public:
- MCNullStreamer(MCContext &Context) : MCStreamer(SK_NullStreamer, Context) {}
+ MCNullStreamer(MCContext &Context) : MCStreamer(Context, 0) {}
/// @name MCStreamer Interface
/// @{
@@ -37,7 +37,7 @@ namespace {
virtual void EmitLabel(MCSymbol *Symbol) {
assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
assert(getCurrentSection().first &&"Cannot emit before setting section!");
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
}
virtual void EmitDebugLabel(MCSymbol *Symbol) {
EmitLabel(Symbol);
@@ -52,7 +52,9 @@ namespace {
const MCSymbol *Label,
unsigned PointerSize) {}
- virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){}
+ virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){
+ return true;
+ }
virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
@@ -107,13 +109,6 @@ namespace {
virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
RecordProcEnd(Frame);
}
-
- /// @}
-
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_NullStreamer;
- }
-
};
}
diff --git a/lib/MC/MCObjectDisassembler.cpp b/lib/MC/MCObjectDisassembler.cpp
index 1ea6eed..16a110f 100644
--- a/lib/MC/MCObjectDisassembler.cpp
+++ b/lib/MC/MCObjectDisassembler.cpp
@@ -8,8 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCObjectDisassembler.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
@@ -18,12 +18,15 @@
#include "llvm/MC/MCFunction.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCObjectSymbolizer.h"
+#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/StringRefMemoryObject.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
-#include <set>
using namespace llvm;
using namespace object;
@@ -31,10 +34,55 @@ using namespace object;
MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
const MCDisassembler &Dis,
const MCInstrAnalysis &MIA)
- : Obj(Obj), Dis(Dis), MIA(MIA) {}
+ : Obj(Obj), Dis(Dis), MIA(MIA), MOS(0) {}
-MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
+uint64_t MCObjectDisassembler::getEntrypoint() {
+ error_code ec;
+ for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
+ SI != SE; SI.increment(ec)) {
+ if (ec)
+ break;
+ StringRef Name;
+ SI->getName(Name);
+ if (Name == "main" || Name == "_main") {
+ uint64_t Entrypoint;
+ SI->getAddress(Entrypoint);
+ return getEffectiveLoadAddr(Entrypoint);
+ }
+ }
+ return 0;
+}
+
+ArrayRef<uint64_t> MCObjectDisassembler::getStaticInitFunctions() {
+ return ArrayRef<uint64_t>();
+}
+
+ArrayRef<uint64_t> MCObjectDisassembler::getStaticExitFunctions() {
+ return ArrayRef<uint64_t>();
+}
+
+MemoryObject *MCObjectDisassembler::getRegionFor(uint64_t Addr) {
+ // FIXME: Keep track of object sections.
+ return FallbackRegion.get();
+}
+
+uint64_t MCObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
+ return Addr;
+}
+
+uint64_t MCObjectDisassembler::getOriginalLoadAddr(uint64_t Addr) {
+ return Addr;
+}
+
+MCModule *MCObjectDisassembler::buildEmptyModule() {
MCModule *Module = new MCModule;
+ Module->Entrypoint = getEntrypoint();
+ return Module;
+}
+
+MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
+ MCModule *Module = buildEmptyModule();
+
buildSectionAtoms(Module);
if (withCFG)
buildCFG(Module);
@@ -58,9 +106,10 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
uint64_t SecSize; SI->getSize(SecSize);
if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
continue;
+ StartAddr = getEffectiveLoadAddr(StartAddr);
StringRef Contents; SI->getContents(Contents);
- StringRefMemoryObject memoryObject(Contents);
+ StringRefMemoryObject memoryObject(Contents, StartAddr);
// We don't care about things like non-file-backed sections yet.
if (Contents.size() != SecSize || !SecSize)
@@ -70,19 +119,31 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
StringRef SecName; SI->getName(SecName);
if (isText) {
- MCTextAtom *Text = Module->createTextAtom(StartAddr, EndAddr);
- Text->setName(SecName);
+ MCTextAtom *Text = 0;
+ MCDataAtom *InvalidData = 0;
+
uint64_t InstSize;
for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
+ const uint64_t CurAddr = StartAddr + Index;
MCInst Inst;
- if (Dis.getInstruction(Inst, InstSize, memoryObject, Index,
- nulls(), nulls()))
+ if (Dis.getInstruction(Inst, InstSize, memoryObject, CurAddr, nulls(),
+ nulls())) {
+ if (!Text) {
+ Text = Module->createTextAtom(CurAddr, CurAddr);
+ Text->setName(SecName);
+ }
Text->addInst(Inst, InstSize);
- else
- // We don't care about splitting mixed atoms either.
- llvm_unreachable("Couldn't disassemble instruction in atom.");
+ InvalidData = 0;
+ } else {
+ assert(InstSize && "getInstruction() consumed no bytes");
+ if (!InvalidData) {
+ Text = 0;
+ InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1);
+ }
+ for (uint64_t I = 0; I < InstSize; ++I)
+ InvalidData->addData(Contents[Index+I]);
+ }
}
-
} else {
MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
Data->setName(SecName);
@@ -94,13 +155,16 @@ void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
namespace {
struct BBInfo;
- typedef std::set<BBInfo*> BBInfoSetTy;
+ typedef SmallPtrSet<BBInfo*, 2> BBInfoSetTy;
struct BBInfo {
MCTextAtom *Atom;
MCBasicBlock *BB;
BBInfoSetTy Succs;
BBInfoSetTy Preds;
+ MCObjectDisassembler::AddressSetTy SuccAddrs;
+
+ BBInfo() : Atom(0), BB(0) {}
void addSucc(BBInfo &Succ) {
Succs.insert(&Succ);
@@ -109,13 +173,33 @@ namespace {
};
}
+static void RemoveDupsFromAddressVector(MCObjectDisassembler::AddressSetTy &V) {
+ std::sort(V.begin(), V.end());
+ V.erase(std::unique(V.begin(), V.end()), V.end());
+}
+
void MCObjectDisassembler::buildCFG(MCModule *Module) {
typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
BBInfoByAddrTy BBInfos;
- typedef std::set<uint64_t> AddressSetTy;
AddressSetTy Splits;
AddressSetTy Calls;
+ error_code ec;
+ for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
+ SI != SE; SI.increment(ec)) {
+ if (ec)
+ break;
+ SymbolRef::Type SymType;
+ SI->getType(SymType);
+ if (SymType == SymbolRef::ST_Function) {
+ uint64_t SymAddr;
+ SI->getAddress(SymAddr);
+ SymAddr = getEffectiveLoadAddr(SymAddr);
+ Calls.push_back(SymAddr);
+ Splits.push_back(SymAddr);
+ }
+ }
+
assert(Module->func_begin() == Module->func_end()
&& "Module already has a CFG!");
@@ -125,21 +209,24 @@ void MCObjectDisassembler::buildCFG(MCModule *Module) {
AI != AE; ++AI) {
MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
if (!TA) continue;
- Calls.insert(TA->getBeginAddr());
+ Calls.push_back(TA->getBeginAddr());
BBInfos[TA->getBeginAddr()].Atom = TA;
for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
II != IE; ++II) {
if (MIA.isTerminator(II->Inst))
- Splits.insert(II->Address + II->Size);
+ Splits.push_back(II->Address + II->Size);
uint64_t Target;
if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
if (MIA.isCall(II->Inst))
- Calls.insert(Target);
- Splits.insert(Target);
+ Calls.push_back(Target);
+ Splits.push_back(Target);
}
}
}
+ RemoveDupsFromAddressVector(Splits);
+ RemoveDupsFromAddressVector(Calls);
+
// Split text atoms into basic block atoms.
for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
SI != SE; ++SI) {
@@ -185,8 +272,8 @@ void MCObjectDisassembler::buildCFG(MCModule *Module) {
// Create MCBBs.
SmallSetVector<BBInfo*, 16> Worklist;
Worklist.insert(&BBI);
- for (size_t WI = 0; WI < Worklist.size(); ++WI) {
- BBInfo *BBI = Worklist[WI];
+ for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+ BBInfo *BBI = Worklist[wi];
if (!BBI->Atom)
continue;
BBI->BB = &MCFN.createBlock(*BBI->Atom);
@@ -200,17 +287,298 @@ void MCObjectDisassembler::buildCFG(MCModule *Module) {
}
// Set preds/succs.
- for (size_t WI = 0; WI < Worklist.size(); ++WI) {
- BBInfo *BBI = Worklist[WI];
+ for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+ BBInfo *BBI = Worklist[wi];
MCBasicBlock *MCBB = BBI->BB;
if (!MCBB)
continue;
for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
- SI != SE; ++SI)
- MCBB->addSuccessor((*SI)->BB);
+ SI != SE; ++SI)
+ if ((*SI)->BB)
+ MCBB->addSuccessor((*SI)->BB);
for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
- PI != PE; ++PI)
- MCBB->addPredecessor((*PI)->BB);
+ PI != PE; ++PI)
+ if ((*PI)->BB)
+ MCBB->addPredecessor((*PI)->BB);
+ }
+ }
+}
+
+// Basic idea of the disassembly + discovery:
+//
+// start with the wanted address, insert it in the worklist
+// while worklist not empty, take next address in the worklist:
+// - check if atom exists there
+// - if middle of atom:
+// - split basic blocks referencing the atom
+// - look for an already encountered BBInfo (using a map<atom, bbinfo>)
+// - if there is, split it (new one, fallthrough, move succs, etc..)
+// - if start of atom: nothing else to do
+// - if no atom: create new atom and new bbinfo
+// - look at the last instruction in the atom, add succs to worklist
+// for all elements in the worklist:
+// - create basic block, update preds/succs, etc..
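+//
+// Concretely: a request for a block in the middle of an already-discovered
+// text atom splits that atom at the requested address, hands the original
+// block's successors to the new block, and leaves the original block with a
+// single fallthrough successor to the split point.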
+//
+MCBasicBlock *MCObjectDisassembler::getBBAt(MCModule *Module, MCFunction *MCFN,
+ uint64_t BBBeginAddr,
+ AddressSetTy &CallTargets,
+ AddressSetTy &TailCallTargets) {
+ typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
+ typedef SmallSetVector<uint64_t, 16> AddrWorklistTy;
+ BBInfoByAddrTy BBInfos;
+ AddrWorklistTy Worklist;
+
+ Worklist.insert(BBBeginAddr);
+ for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+ const uint64_t BeginAddr = Worklist[wi];
+ BBInfo *BBI = &BBInfos[BeginAddr];
+
+ MCTextAtom *&TA = BBI->Atom;
+ assert(!TA && "Discovered basic block already has an associated atom!");
+
+ // Look for an atom at BeginAddr.
+ if (MCAtom *A = Module->findAtomContaining(BeginAddr)) {
+ // FIXME: We don't care about mixed atoms, see above.
+ TA = cast<MCTextAtom>(A);
+
+ // The found atom doesn't begin at BeginAddr, we have to split it.
+ if (TA->getBeginAddr() != BeginAddr) {
+ // FIXME: Handle overlapping atoms: middle-starting instructions, etc..
+ MCTextAtom *NewTA = TA->split(BeginAddr);
+
+ // Look for an already encountered basic block that needs splitting
+ BBInfoByAddrTy::iterator It = BBInfos.find(TA->getBeginAddr());
+ if (It != BBInfos.end() && It->second.Atom) {
+ BBI->SuccAddrs = It->second.SuccAddrs;
+ It->second.SuccAddrs.clear();
+ It->second.SuccAddrs.push_back(BeginAddr);
+ }
+ TA = NewTA;
+ }
+ BBI->Atom = TA;
+ } else {
+ // If we didn't find an atom, then we have to disassemble to create one!
+
+ MemoryObject *Region = getRegionFor(BeginAddr);
+ if (!Region)
+ llvm_unreachable(("Couldn't find suitable region for disassembly at " +
+ utostr(BeginAddr)).c_str());
+
+ uint64_t InstSize;
+ uint64_t EndAddr = Region->getBase() + Region->getExtent();
+
+ // We want to stop before the next atom and have a fallthrough to it.
+ if (MCTextAtom *NextAtom =
+ cast_or_null<MCTextAtom>(Module->findFirstAtomAfter(BeginAddr)))
+ EndAddr = std::min(EndAddr, NextAtom->getBeginAddr());
+
+ for (uint64_t Addr = BeginAddr; Addr < EndAddr; Addr += InstSize) {
+ MCInst Inst;
+ if (Dis.getInstruction(Inst, InstSize, *Region, Addr, nulls(),
+ nulls())) {
+ if (!TA)
+ TA = Module->createTextAtom(Addr, Addr);
+ TA->addInst(Inst, InstSize);
+ } else {
+ // We don't care about splitting mixed atoms either.
+ llvm_unreachable("Couldn't disassemble instruction in atom.");
+ }
+
+ uint64_t BranchTarget;
+ if (MIA.evaluateBranch(Inst, Addr, InstSize, BranchTarget)) {
+ if (MIA.isCall(Inst))
+ CallTargets.push_back(BranchTarget);
+ }
+
+ if (MIA.isTerminator(Inst))
+ break;
+ }
+ BBI->Atom = TA;
+ }
+
+ assert(TA && "Couldn't disassemble atom, none was created!");
+ assert(TA->begin() != TA->end() && "Empty atom!");
+
+ MemoryObject *Region = getRegionFor(TA->getBeginAddr());
+ assert(Region && "Couldn't find region for already disassembled code!");
+ uint64_t EndRegion = Region->getBase() + Region->getExtent();
+
+ // Now we have a basic block atom, add successors.
+ // Add the fallthrough block.
+ if ((MIA.isConditionalBranch(TA->back().Inst) ||
+ !MIA.isTerminator(TA->back().Inst)) &&
+ (TA->getEndAddr() + 1 < EndRegion)) {
+ BBI->SuccAddrs.push_back(TA->getEndAddr() + 1);
+ Worklist.insert(TA->getEndAddr() + 1);
+ }
+
+ // If the terminator is a branch, add the target block.
+ if (MIA.isBranch(TA->back().Inst)) {
+ uint64_t BranchTarget;
+ if (MIA.evaluateBranch(TA->back().Inst, TA->back().Address,
+ TA->back().Size, BranchTarget)) {
+ StringRef ExtFnName;
+ if (MOS)
+ ExtFnName =
+ MOS->findExternalFunctionAt(getOriginalLoadAddr(BranchTarget));
+ if (!ExtFnName.empty()) {
+ TailCallTargets.push_back(BranchTarget);
+ CallTargets.push_back(BranchTarget);
+ } else {
+ BBI->SuccAddrs.push_back(BranchTarget);
+ Worklist.insert(BranchTarget);
+ }
+ }
+ }
+ }
+
+ for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
+ const uint64_t BeginAddr = Worklist[wi];
+ BBInfo *BBI = &BBInfos[BeginAddr];
+
+ assert(BBI->Atom && "Found a basic block without an associated atom!");
+
+ // Look for a basic block at BeginAddr.
+ BBI->BB = MCFN->find(BeginAddr);
+ if (BBI->BB) {
+ // FIXME: check that the succs/preds are the same
+ continue;
+ }
+ // If there was none, we have to create one from the atom.
+ BBI->BB = &MCFN->createBlock(*BBI->Atom);
+ }
+
+ for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
+ const uint64_t BeginAddr = Worklist[wi];
+ BBInfo *BBI = &BBInfos[BeginAddr];
+ MCBasicBlock *BB = BBI->BB;
+
+ RemoveDupsFromAddressVector(BBI->SuccAddrs);
+ for (AddressSetTy::const_iterator SI = BBI->SuccAddrs.begin(),
+ SE = BBI->SuccAddrs.end();
+         SI != SE; ++SI) {
+ MCBasicBlock *Succ = BBInfos[*SI].BB;
+ BB->addSuccessor(Succ);
+ Succ->addPredecessor(BB);
+ }
+ }
+
+ assert(BBInfos[Worklist[0]].BB &&
+ "No basic block created at requested address?");
+
+ return BBInfos[Worklist[0]].BB;
+}
+
+MCFunction *
+MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr,
+ AddressSetTy &CallTargets,
+ AddressSetTy &TailCallTargets) {
+ // First, check if this is an external function.
+ StringRef ExtFnName;
+ if (MOS)
+ ExtFnName = MOS->findExternalFunctionAt(getOriginalLoadAddr(BeginAddr));
+ if (!ExtFnName.empty())
+ return Module->createFunction(ExtFnName);
+
+ // If it's not, look for an existing function.
+ for (MCModule::func_iterator FI = Module->func_begin(),
+ FE = Module->func_end();
+ FI != FE; ++FI) {
+ if ((*FI)->empty())
+ continue;
+ // FIXME: MCModule should provide a findFunctionByAddr()
+ if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr)
+ return *FI;
+ }
+
+ // Finally, just create a new one.
+ MCFunction *MCFN = Module->createFunction("");
+ getBBAt(Module, MCFN, BeginAddr, CallTargets, TailCallTargets);
+ return MCFN;
+}
+
+// MachO MCObjectDisassembler implementation.
+
+MCMachOObjectDisassembler::MCMachOObjectDisassembler(
+ const MachOObjectFile &MOOF, const MCDisassembler &Dis,
+ const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
+ uint64_t HeaderLoadAddress)
+ : MCObjectDisassembler(MOOF, Dis, MIA), MOOF(MOOF),
+ VMAddrSlide(VMAddrSlide), HeaderLoadAddress(HeaderLoadAddress) {
+
+ error_code ec;
+ for (section_iterator SI = MOOF.begin_sections(), SE = MOOF.end_sections();
+ SI != SE; SI.increment(ec)) {
+ if (ec)
+ break;
+ StringRef Name;
+ SI->getName(Name);
+ // FIXME: We should use the S_ section type instead of the name.
+ if (Name == "__mod_init_func") {
+ DEBUG(dbgs() << "Found __mod_init_func section!\n");
+ SI->getContents(ModInitContents);
+ } else if (Name == "__mod_exit_func") {
+ DEBUG(dbgs() << "Found __mod_exit_func section!\n");
+ SI->getContents(ModExitContents);
+ }
+ }
+}
+
+// FIXME: Only do the translations for addresses actually inside the object.
+uint64_t MCMachOObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
+ return Addr + VMAddrSlide;
+}
+
+uint64_t
+MCMachOObjectDisassembler::getOriginalLoadAddr(uint64_t EffectiveAddr) {
+ return EffectiveAddr - VMAddrSlide;
+}
+
+uint64_t MCMachOObjectDisassembler::getEntrypoint() {
+ uint64_t EntryFileOffset = 0;
+
+ // Look for LC_MAIN.
+ {
+ uint32_t LoadCommandCount = MOOF.getHeader().ncmds;
+ MachOObjectFile::LoadCommandInfo Load = MOOF.getFirstLoadCommandInfo();
+ for (unsigned I = 0;; ++I) {
+ if (Load.C.cmd == MachO::LC_MAIN) {
+ EntryFileOffset =
+ ((const MachO::entry_point_command *)Load.Ptr)->entryoff;
+ break;
+ }
+
+ if (I == LoadCommandCount - 1)
+ break;
+ else
+ Load = MOOF.getNextLoadCommandInfo(Load);
}
}
+
+ // If we didn't find anything, default to the common implementation.
+ // FIXME: Maybe we could also look at LC_UNIXTHREAD and friends?
+  if (!EntryFileOffset)
+ return MCObjectDisassembler::getEntrypoint();
+
+ return EntryFileOffset + HeaderLoadAddress;
+}
+
+ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticInitFunctions() {
+ // FIXME: We only handle 64bit mach-o
+ assert(MOOF.is64Bit());
+
+ size_t EntrySize = 8;
+ size_t EntryCount = ModInitContents.size() / EntrySize;
+ return ArrayRef<uint64_t>(
+ reinterpret_cast<const uint64_t *>(ModInitContents.data()), EntryCount);
+}
+
+ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticExitFunctions() {
+ // FIXME: We only handle 64bit mach-o
+ assert(MOOF.is64Bit());
+
+ size_t EntrySize = 8;
+ size_t EntryCount = ModExitContents.size() / EntrySize;
+ return ArrayRef<uint64_t>(
+ reinterpret_cast<const uint64_t *>(ModExitContents.data()), EntryCount);
}
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index bcf52d2..8ef4a0a 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -39,6 +39,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
= Ctx->getMachOSection("__DATA", "__data", 0,
SectionKind::getDataRel());
+  // BSSSection might otherwise be left uninitialized on MSVC, so clear it.
+ BSSSection = 0;
+
TLSDataSection // .tdata
= Ctx->getMachOSection("__DATA", "__thread_data",
MCSectionMachO::S_THREAD_LOCAL_REGULAR,
@@ -199,6 +202,14 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
Ctx->getMachOSection("__DWARF", "__debug_pubtypes",
MCSectionMachO::S_ATTR_DEBUG,
SectionKind::getMetadata());
+ DwarfGnuPubNamesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_gnu_pubn",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfGnuPubTypesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_gnu_pubt",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
DwarfStrSection =
Ctx->getMachOSection("__DWARF", "__debug_str",
MCSectionMachO::S_ATTR_DEBUG,
@@ -223,6 +234,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
Ctx->getMachOSection("__DWARF", "__debug_inlined",
MCSectionMachO::S_ATTR_DEBUG,
SectionKind::getMetadata());
+ StackMapSection =
+ Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0,
+ SectionKind::getMetadata());
TLSExtraDataSection = TLSTLVSection;
}
@@ -435,6 +449,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
DwarfPubTypesSection =
Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
SectionKind::getMetadata());
+ DwarfGnuPubNamesSection =
+ Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfGnuPubTypesSection =
+ Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
DwarfStrSection =
Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
ELF::SHF_MERGE | ELF::SHF_STRINGS,
@@ -496,6 +516,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
// COFF
+ BSSSection =
+ Ctx->getCOFFSection(".bss",
+ COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getBSS());
TextSection =
Ctx->getCOFFSection(".text",
COFF::IMAGE_SCN_CNT_CODE |
@@ -585,6 +611,16 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
COFF::IMAGE_SCN_MEM_DISCARDABLE |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata());
+ DwarfGnuPubNamesSection =
+ Ctx->getCOFFSection(".debug_gnu_pubnames",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfGnuPubTypesSection =
+ Ctx->getCOFFSection(".debug_gnu_pubtypes",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
DwarfStrSection =
Ctx->getCOFFSection(".debug_str",
COFF::IMAGE_SCN_MEM_DISCARDABLE |
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 36a923a..bc14c2a 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -22,19 +22,22 @@
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context,
+ MCTargetStreamer *TargetStreamer,
MCAsmBackend &TAB, raw_ostream &OS,
MCCodeEmitter *Emitter_)
- : MCStreamer(Kind, Context),
+ : MCStreamer(Context, TargetStreamer),
Assembler(new MCAssembler(Context, TAB, *Emitter_,
*TAB.createObjectWriter(OS), OS)),
CurSectionData(0) {}
-MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context,
+ MCTargetStreamer *TargetStreamer,
MCAsmBackend &TAB, raw_ostream &OS,
MCCodeEmitter *Emitter_,
MCAssembler *_Assembler)
- : MCStreamer(Kind, Context), Assembler(_Assembler), CurSectionData(0) {}
+ : MCStreamer(Context, TargetStreamer), Assembler(_Assembler),
+ CurSectionData(0) {}
MCObjectStreamer::~MCObjectStreamer() {
delete &Assembler->getBackend();
@@ -302,6 +305,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
}
void MCObjectStreamer::EmitBytes(StringRef Data) {
+ MCLineEntry::Make(this, getCurrentSection().first);
getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
}
diff --git a/lib/MC/MCObjectSymbolizer.cpp b/lib/MC/MCObjectSymbolizer.cpp
index aa7648e..b9131d1 100644
--- a/lib/MC/MCObjectSymbolizer.cpp
+++ b/lib/MC/MCObjectSymbolizer.cpp
@@ -15,7 +15,7 @@
#include "llvm/MC/MCRelocationInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Object/MachO.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -26,100 +26,127 @@ using namespace object;
namespace {
class MCMachObjectSymbolizer : public MCObjectSymbolizer {
+ const MachOObjectFile *MOOF;
+ // __TEXT;__stubs support.
+ uint64_t StubsStart;
+ uint64_t StubsCount;
+ uint64_t StubSize;
+ uint64_t StubsIndSymIndex;
+
public:
MCMachObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
- const object::MachOObjectFile *MachOOF)
- : MCObjectSymbolizer(Ctx, RelInfo, MachOOF)
- {}
+ const MachOObjectFile *MOOF);
+
+ StringRef findExternalFunctionAt(uint64_t Addr) LLVM_OVERRIDE;
void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
- int64_t Value, uint64_t Address) {
- AddrToRelocMap::iterator RI = AddrToReloc.find(Address);
- if (RI != AddrToReloc.end()) {
- const MCExpr *RelExpr = RelInfo->createExprForRelocation(RI->second);
- if (!RelExpr || RelExpr->EvaluateAsAbsolute(Value) == false)
- return;
- }
- uint64_t Addr = Value;
- SortedSectionList::const_iterator SI = findSectionContaining(Addr);
- if (SI != SortedSections.end()) {
- const SectionRef &S = *SI;
- StringRef Name; S.getName(Name);
- uint64_t SAddr; S.getAddress(SAddr);
- if (Name == "__cstring") {
- StringRef Contents;
- S.getContents(Contents);
- Contents = Contents.substr(Addr - SAddr);
- cStream << " ## literal pool for: "
- << Contents.substr(0, Contents.find_first_of(0));
- }
- }
- }
+ int64_t Value,
+ uint64_t Address) LLVM_OVERRIDE;
};
} // End unnamed namespace
-//===- MCObjectSymbolizer -------------------------------------------------===//
-MCObjectSymbolizer::MCObjectSymbolizer(MCContext &Ctx,
- OwningPtr<MCRelocationInfo> &RelInfo,
- const ObjectFile *Obj)
- : MCSymbolizer(Ctx, RelInfo), Obj(Obj), SortedSections(), AddrToReloc() {
+MCMachObjectSymbolizer::
+MCMachObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
+ const MachOObjectFile *MOOF)
+ : MCObjectSymbolizer(Ctx, RelInfo, MOOF), MOOF(MOOF),
+ StubsStart(0), StubsCount(0), StubSize(0), StubsIndSymIndex(0) {
+
error_code ec;
- for (section_iterator SI = Obj->begin_sections(),
- SE = Obj->end_sections();
- SI != SE;
- SI.increment(ec)) {
+ for (section_iterator SI = MOOF->begin_sections(), SE = MOOF->end_sections();
+ SI != SE; SI.increment(ec)) {
if (ec) break;
-
- section_iterator RelSecI = SI->getRelocatedSection();
- if (RelSecI == Obj->end_sections())
- continue;
-
- uint64_t StartAddr; RelSecI->getAddress(StartAddr);
- uint64_t Size; RelSecI->getSize(Size);
- bool RequiredForExec; RelSecI->isRequiredForExecution(RequiredForExec);
- if (RequiredForExec == false || Size == 0)
- continue;
- insertSection(*SI);
- for (relocation_iterator RI = SI->begin_relocations(),
- RE = SI->end_relocations();
- RI != RE;
- RI.increment(ec)) {
- if (ec) break;
- // FIXME: libObject is inconsistent regarding error handling. The
- // overwhelming majority of methods always return object_error::success,
- // and assert for simple errors.. Here, ELFObjectFile::getRelocationOffset
- // asserts when the file type isn't ET_REL.
- // This workaround handles x86-64 elf, the only one that has a relocinfo.
- uint64_t Offset;
- if (Obj->isELF()) {
- const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj);
- if (ELFObj == 0)
- break;
- if (ELFObj->getElfHeader()->e_type == ELF::ET_REL) {
- RI->getOffset(Offset);
- Offset += StartAddr;
- } else {
- RI->getAddress(Offset);
- }
+ StringRef Name; SI->getName(Name);
+ if (Name == "__stubs") {
+ SectionRef StubsSec = *SI;
+ if (MOOF->is64Bit()) {
+ MachO::section_64 S = MOOF->getSection64(StubsSec.getRawDataRefImpl());
+ StubsIndSymIndex = S.reserved1;
+ StubSize = S.reserved2;
} else {
- RI->getOffset(Offset);
- Offset += StartAddr;
+ MachO::section S = MOOF->getSection(StubsSec.getRawDataRefImpl());
+ StubsIndSymIndex = S.reserved1;
+ StubSize = S.reserved2;
}
- // At a specific address, only keep the first relocation.
- if (AddrToReloc.find(Offset) == AddrToReloc.end())
- AddrToReloc[Offset] = *RI;
+ assert(StubSize && "Mach-O stub entry size can't be zero!");
+ StubsSec.getAddress(StubsStart);
+ StubsSec.getSize(StubsCount);
+ StubsCount /= StubSize;
+ }
+ }
+}
+
+StringRef MCMachObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
+ // FIXME: also, this can all be done at the very beginning, by iterating over
+ // all stubs and creating the calls to outside functions. Is it worth it
+ // though?
+ if (!StubSize)
+ return StringRef();
+ uint64_t StubIdx = (Addr - StubsStart) / StubSize;
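+  // Note that if Addr is below StubsStart, the unsigned subtraction wraps
+  // around and the resulting index also fails the bounds check below.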
+ if (StubIdx >= StubsCount)
+ return StringRef();
+
+ uint32_t SymtabIdx =
+ MOOF->getIndirectSymbolTableEntry(MOOF->getDysymtabLoadCommand(), StubIdx);
+
+ StringRef SymName;
+ symbol_iterator SI = MOOF->begin_symbols();
+ error_code ec;
+ for (uint32_t i = 0; i != SymtabIdx; ++i) {
+ SI.increment(ec);
+ }
+  assert(SI != MOOF->end_symbols() && "Stub wasn't found in the symbol table!");
+  SI->getName(SymName);
+ assert(SymName.front() == '_' && "Mach-O symbol doesn't start with '_'!");
+ return SymName.substr(1);
+}
+
+void MCMachObjectSymbolizer::
+tryAddingPcLoadReferenceComment(raw_ostream &cStream, int64_t Value,
+ uint64_t Address) {
+ if (const RelocationRef *R = findRelocationAt(Address)) {
+ const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R);
+ if (!RelExpr || RelExpr->EvaluateAsAbsolute(Value) == false)
+ return;
+ }
+ uint64_t Addr = Value;
+ if (const SectionRef *S = findSectionContaining(Addr)) {
+ StringRef Name; S->getName(Name);
+ uint64_t SAddr; S->getAddress(SAddr);
+ if (Name == "__cstring") {
+ StringRef Contents;
+ S->getContents(Contents);
+ Contents = Contents.substr(Addr - SAddr);
+ cStream << " ## literal pool for: "
+ << Contents.substr(0, Contents.find_first_of(0));
}
}
}
+//===- MCObjectSymbolizer -------------------------------------------------===//
+
+MCObjectSymbolizer::MCObjectSymbolizer(MCContext &Ctx,
+ OwningPtr<MCRelocationInfo> &RelInfo,
+ const ObjectFile *Obj)
+ : MCSymbolizer(Ctx, RelInfo), Obj(Obj), SortedSections(), AddrToReloc() {
+}
+
bool MCObjectSymbolizer::
tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
int64_t Value, uint64_t Address, bool IsBranch,
uint64_t Offset, uint64_t InstSize) {
- AddrToRelocMap::iterator RI = AddrToReloc.find(Address + Offset);
- if (RI != AddrToReloc.end()) {
- if (const MCExpr *RelExpr = RelInfo->createExprForRelocation(RI->second)) {
+ if (IsBranch) {
+ StringRef ExtFnName = findExternalFunctionAt((uint64_t)Value);
+ if (!ExtFnName.empty()) {
+ MCSymbol *Sym = Ctx.GetOrCreateSymbol(ExtFnName);
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+ return true;
+ }
+ }
+
+ if (const RelocationRef *R = findRelocationAt(Address + Offset)) {
+ if (const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R)) {
MI.addOperand(MCOperand::CreateExpr(RelExpr));
return true;
}
@@ -133,10 +160,8 @@ tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
uint64_t UValue = Value;
// FIXME: map instead of looping each time?
error_code ec;
- for (symbol_iterator SI = Obj->begin_symbols(),
- SE = Obj->end_symbols();
- SI != SE;
- SI.increment(ec)) {
+ for (symbol_iterator SI = Obj->begin_symbols(), SE = Obj->end_symbols();
+ SI != SE; SI.increment(ec)) {
if (ec) break;
uint64_t SymAddr; SI->getAddress(SymAddr);
uint64_t SymSize; SI->getSize(SymSize);
@@ -166,13 +191,16 @@ tryAddingPcLoadReferenceComment(raw_ostream &cStream,
int64_t Value, uint64_t Address) {
}
+StringRef MCObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
+ return StringRef();
+}
+
MCObjectSymbolizer *
MCObjectSymbolizer::createObjectSymbolizer(MCContext &Ctx,
OwningPtr<MCRelocationInfo> &RelInfo,
const ObjectFile *Obj) {
- if (const MachOObjectFile *MachOOF = dyn_cast<MachOObjectFile>(Obj)) {
- return new MCMachObjectSymbolizer(Ctx, RelInfo, MachOOF);
- }
+ if (const MachOObjectFile *MOOF = dyn_cast<MachOObjectFile>(Obj))
+ return new MCMachObjectSymbolizer(Ctx, RelInfo, MOOF);
return new MCObjectSymbolizer(Ctx, RelInfo, Obj);
}
@@ -183,32 +211,100 @@ static bool SectionStartsBefore(const SectionRef &S, uint64_t Addr) {
return SAddr < Addr;
}
-MCObjectSymbolizer::SortedSectionList::const_iterator
-MCObjectSymbolizer::findSectionContaining(uint64_t Addr) const {
- SortedSectionList::const_iterator
+const SectionRef *MCObjectSymbolizer::findSectionContaining(uint64_t Addr) {
+ if (SortedSections.empty())
+ buildSectionList();
+
+ SortedSectionList::iterator
EndIt = SortedSections.end(),
It = std::lower_bound(SortedSections.begin(), EndIt,
Addr, SectionStartsBefore);
if (It == EndIt)
- return It;
+ return 0;
uint64_t SAddr; It->getAddress(SAddr);
uint64_t SSize; It->getSize(SSize);
if (Addr >= SAddr + SSize)
- return EndIt;
- return It;
+ return 0;
+ return &*It;
+}
+
+const RelocationRef *MCObjectSymbolizer::findRelocationAt(uint64_t Addr) {
+ if (AddrToReloc.empty())
+ buildRelocationByAddrMap();
+
+ AddrToRelocMap::const_iterator RI = AddrToReloc.find(Addr);
+ if (RI == AddrToReloc.end())
+ return 0;
+ return &RI->second;
+}
+
+void MCObjectSymbolizer::buildSectionList() {
+ error_code ec;
+ for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+ SI != SE; SI.increment(ec)) {
+ if (ec) break;
+
+ bool RequiredForExec; SI->isRequiredForExecution(RequiredForExec);
+ if (RequiredForExec == false)
+ continue;
+ uint64_t SAddr; SI->getAddress(SAddr);
+ uint64_t SSize; SI->getSize(SSize);
+ SortedSectionList::iterator It = std::lower_bound(SortedSections.begin(),
+ SortedSections.end(),
+ SAddr,
+ SectionStartsBefore);
+ if (It != SortedSections.end()) {
+ uint64_t FoundSAddr; It->getAddress(FoundSAddr);
+ if (FoundSAddr < SAddr + SSize)
+ llvm_unreachable("Inserting overlapping sections");
+ }
+ SortedSections.insert(It, *SI);
+ }
}
-void MCObjectSymbolizer::insertSection(SectionRef Sec) {
- uint64_t SAddr; Sec.getAddress(SAddr);
- uint64_t SSize; Sec.getSize(SSize);
- SortedSectionList::iterator It = std::lower_bound(SortedSections.begin(),
- SortedSections.end(),
- SAddr,
- SectionStartsBefore);
- if (It != SortedSections.end()) {
- uint64_t FoundSAddr; It->getAddress(FoundSAddr);
- if (FoundSAddr < SAddr + SSize)
- llvm_unreachable("Inserting overlapping sections");
+void MCObjectSymbolizer::buildRelocationByAddrMap() {
+ error_code ec;
+ for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+ SI != SE; SI.increment(ec)) {
+ if (ec) break;
+
+ section_iterator RelSecI = SI->getRelocatedSection();
+ if (RelSecI == Obj->end_sections())
+ continue;
+
+ uint64_t StartAddr; RelSecI->getAddress(StartAddr);
+ uint64_t Size; RelSecI->getSize(Size);
+ bool RequiredForExec; RelSecI->isRequiredForExecution(RequiredForExec);
+ if (RequiredForExec == false || Size == 0)
+ continue;
+ for (relocation_iterator RI = SI->begin_relocations(),
+ RE = SI->end_relocations();
+ RI != RE;
+ RI.increment(ec)) {
+ if (ec) break;
+ // FIXME: libObject is inconsistent regarding error handling. The
+ // overwhelming majority of methods always return object_error::success,
+      // and assert for simple errors. Here, ELFObjectFile::getRelocationOffset
+ // asserts when the file type isn't ET_REL.
+ // This workaround handles x86-64 elf, the only one that has a relocinfo.
+ uint64_t Offset;
+ if (Obj->isELF()) {
+ const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj);
+ if (ELFObj == 0)
+ break;
+ if (ELFObj->getELFFile()->getHeader()->e_type == ELF::ET_REL) {
+ RI->getOffset(Offset);
+ Offset += StartAddr;
+ } else {
+ RI->getAddress(Offset);
+ }
+ } else {
+ RI->getOffset(Offset);
+ Offset += StartAddr;
+ }
+ // At a specific address, only keep the first relocation.
+ if (AddrToReloc.find(Offset) == AddrToReloc.end())
+ AddrToReloc[Offset] = *RI;
+ }
}
- SortedSections.insert(It, Sec);
}
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index c1c594a..b49dd01 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -91,9 +91,56 @@ AsmToken AsmLexer::LexFloatLiteral() {
StringRef(TokStart, CurPtr - TokStart));
}
-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
+/// while making sure there are enough actual digits around for the constant to
+/// be valid.
+///
+/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
+/// before we get here.
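+///
+/// For example, "0x1.8p3", "0x.8p-2" and "0x4p0" are accepted, while "0xp3"
+/// (no significand digits) and "0x1.8" (no exponent) are rejected.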
+AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
+ assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
+ "unexpected parse state in floating hex");
+ bool NoFracDigits = true;
+
+ // Skip the fractional part if there is one
+ if (*CurPtr == '.') {
+ ++CurPtr;
+
+ const char *FracStart = CurPtr;
+ while (isxdigit(*CurPtr))
+ ++CurPtr;
+
+ NoFracDigits = CurPtr == FracStart;
+ }
+
+ if (NoIntDigits && NoFracDigits)
+ return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+ "expected at least one significand digit");
+
+ // Make sure we do have some kind of proper exponent part
+ if (*CurPtr != 'p' && *CurPtr != 'P')
+ return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+ "expected exponent part 'p'");
+ ++CurPtr;
+
+ if (*CurPtr == '+' || *CurPtr == '-')
+ ++CurPtr;
+
+ // N.b. exponent digits are *not* hex
+ const char *ExpStart = CurPtr;
+ while (isdigit(*CurPtr))
+ ++CurPtr;
+
+ if (CurPtr == ExpStart)
+ return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+ "expected at least one exponent digit");
+
+ return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
+}
+
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
static bool IsIdentifierChar(char c) {
- return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@';
+ return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@' || c == '?';
}
AsmToken AsmLexer::LexIdentifier() {
// Check for floating point literals.
@@ -265,7 +312,12 @@ AsmToken AsmLexer::LexDigit() {
while (isxdigit(CurPtr[0]))
++CurPtr;
- // Requires at least one hex digit.
+ // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
+ // diagnosed by LexHexFloatLiteral).
+ if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
+ return LexHexFloatLiteral(NumStart == CurPtr);
+
+ // Otherwise requires at least one hex digit.
if (CurPtr == NumStart)
return ReturnError(CurPtr-2, "invalid hexadecimal number");
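
The hex floating-point lexing added above accepts forms such as "0x1p4", "0x.8p-1" and "0x1.8P+2", and rejects "0xp3" (no significand digits) and "0x1.8" (no exponent). A rough standalone sketch of the same acceptance rules, assuming a NUL-terminated token (isHexFloatLiteral is a hypothetical helper, not part of the lexer):

#include <cctype>

// Returns true if S is a complete hex float literal: "0x", optional hex
// integer digits, optional "." plus hex fraction digits, then a mandatory
// 'p'/'P' exponent with an optional sign and at least one *decimal* digit.
// At least one significand digit (integer or fraction) must be present.
static bool isHexFloatLiteral(const char *S) {
  if (S[0] != '0' || (S[1] != 'x' && S[1] != 'X'))
    return false;
  const char *P = S + 2;
  bool HasDigits = false;
  while (isxdigit((unsigned char)*P)) { ++P; HasDigits = true; }   // integer part
  if (*P == '.') {
    ++P;
    while (isxdigit((unsigned char)*P)) { ++P; HasDigits = true; } // fraction part
  }
  if (!HasDigits)                  // e.g. "0xp3": no significand digits
    return false;
  if (*P != 'p' && *P != 'P')      // exponent part is mandatory
    return false;
  ++P;
  if (*P == '+' || *P == '-')
    ++P;
  if (!isdigit((unsigned char)*P)) // need at least one exponent digit
    return false;
  while (isdigit((unsigned char)*P))
    ++P;
  return *P == '\0';
}
// e.g. isHexFloatLiteral("0x1.8p-3") == true, isHexFloatLiteral("0xp3") == false
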
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index dd0d181..a91bd93 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -94,13 +94,13 @@ public:
};
struct ParseStatementInfo {
- /// ParsedOperands - The parsed operands from the last parsed statement.
+ /// \brief The parsed operands from the last parsed statement.
SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
- /// Opcode - The opcode from the last parsed instruction.
+ /// \brief The opcode from the last parsed instruction.
unsigned Opcode;
- /// Error - Was there an error parsing the inline assembly?
+ /// \brief Was there an error parsing the inline assembly?
bool ParseError;
SmallVectorImpl<AsmRewrite> *AsmRewrites;
@@ -138,18 +138,18 @@ private:
AsmCond TheCondState;
std::vector<AsmCond> TheCondStack;
- /// ExtensionDirectiveMap - maps directive names to handler methods in parser
+ /// \brief Maps directive names to handler methods in parser
/// extensions. Extensions register themselves in this map by calling
/// addDirectiveHandler.
StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
- /// MacroMap - Map of currently defined macros.
+ /// \brief Map of currently defined macros.
StringMap<MCAsmMacro*> MacroMap;
- /// ActiveMacros - Stack of active macro instantiations.
+ /// \brief Stack of active macro instantiations.
std::vector<MacroInstantiation*> ActiveMacros;
- /// MacroLikeBodies - List of bodies of anonymous macros.
+ /// \brief List of bodies of anonymous macros.
std::deque<MCAsmMacro> MacroLikeBodies;
/// Boolean tracking whether macro substitution is enabled.
@@ -165,7 +165,7 @@ private:
int CppHashBuf;
/// When generating dwarf for assembly source files we need to calculate the
/// logical line number based on the last parsed cpp hash file line comment
- /// and current line. Since this is slow and messes up the SourceMgr's
+ /// and current line. Since this is slow and messes up the SourceMgr's
/// cache we save the last info we queried with SrcMgr.FindLineNumber().
SMLoc LastQueryIDLoc;
int LastQueryBuffer;
@@ -174,10 +174,10 @@ private:
  /// AssemblerDialect. ~0U means unset value and use the value provided by MAI.
unsigned AssemblerDialect;
- /// IsDarwin - is Darwin compatibility enabled?
+ /// \brief Is Darwin compatibility enabled?
bool IsDarwin;
- /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+ /// \brief Are we parsing ms-style inline assembly?
bool ParsingInlineAsm;
public:
@@ -235,7 +235,7 @@ public:
virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc);
virtual bool parseAbsoluteExpression(int64_t &Res);
- /// parseIdentifier - Parse an identifier or string (as a quoted identifier)
+ /// \brief Parse an identifier or string (as a quoted identifier)
/// and set \p Res to the identifier contents.
virtual bool parseIdentifier(StringRef &Res);
virtual void eatToEndOfStatement();
@@ -245,11 +245,11 @@ public:
private:
- bool ParseStatement(ParseStatementInfo &Info);
- void EatToEndOfLine();
- bool ParseCppHashLineFilenameComment(const SMLoc &L);
+ bool parseStatement(ParseStatementInfo &Info);
+ void eatToEndOfLine();
+ bool parseCppHashLineFilenameComment(const SMLoc &L);
- void CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
+ void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
MCAsmMacroParameters Parameters);
bool expandMacro(raw_svector_ostream &OS, StringRef Body,
const MCAsmMacroParameters &Parameters,
@@ -257,55 +257,56 @@ private:
const SMLoc &L);
/// \brief Are macros enabled in the parser?
- bool MacrosEnabled() {return MacrosEnabledFlag;}
+ bool areMacrosEnabled() {return MacrosEnabledFlag;}
/// \brief Control a flag in the parser that enables or disables macros.
- void SetMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
+ void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
/// \brief Lookup a previously defined macro.
/// \param Name Macro name.
/// \returns Pointer to macro. NULL if no such macro was defined.
- const MCAsmMacro* LookupMacro(StringRef Name);
+ const MCAsmMacro* lookupMacro(StringRef Name);
/// \brief Define a new macro with the given name and information.
- void DefineMacro(StringRef Name, const MCAsmMacro& Macro);
+ void defineMacro(StringRef Name, const MCAsmMacro& Macro);
/// \brief Undefine a macro. If no such macro was defined, it's a no-op.
- void UndefineMacro(StringRef Name);
+ void undefineMacro(StringRef Name);
/// \brief Are we inside a macro instantiation?
- bool InsideMacroInstantiation() {return !ActiveMacros.empty();}
+ bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
- /// \brief Handle entry to macro instantiation.
+ /// \brief Handle entry to macro instantiation.
///
/// \param M The macro.
/// \param NameLoc Instantiation location.
- bool HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
+ bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
/// \brief Handle exit from macro instantiation.
- void HandleMacroExit();
+ void handleMacroExit();
/// \brief Extract AsmTokens for a macro argument. If the argument delimiter
/// is initially unknown, set it to AsmToken::Eof. It will be set to the
/// correct delimiter by the method.
- bool ParseMacroArgument(MCAsmMacroArgument &MA,
+ bool parseMacroArgument(MCAsmMacroArgument &MA,
AsmToken::TokenKind &ArgumentDelimiter);
/// \brief Parse all macro arguments for a given macro.
- bool ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
+ bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
- void PrintMacroInstantiations();
- void PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
+ void printMacroInstantiations();
+ void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
ArrayRef<SMRange> Ranges = None) const {
SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
}
static void DiagHandler(const SMDiagnostic &Diag, void *Context);
- /// EnterIncludeFile - Enter the specified file. This returns true on failure.
- bool EnterIncludeFile(const std::string &Filename);
- /// ProcessIncbinFile - Process the specified file for the .incbin directive.
+ /// \brief Enter the specified file. This returns true on failure.
+ bool enterIncludeFile(const std::string &Filename);
+
+ /// \brief Process the specified file for the .incbin directive.
/// This returns true on failure.
- bool ProcessIncbinFile(const std::string &Filename);
+ bool processIncbinFile(const std::string &Filename);
/// \brief Reset the current lexer position to that given by \p Loc. The
/// current token is not set; clients should ensure Lex() is called
@@ -313,7 +314,7 @@ private:
///
/// \param InBuffer If not -1, should be the known buffer id that contains the
/// location.
- void JumpToLoc(SMLoc Loc, int InBuffer=-1);
+ void jumpToLoc(SMLoc Loc, int InBuffer=-1);
  /// \brief Parse up to the end of statement and return the contents from the
/// current token until the end of the statement; the current token on exit
@@ -322,17 +323,16 @@ private:
/// \brief Parse until the end of a statement or a comma is encountered,
/// return the contents from the current token up to the end or comma.
- StringRef ParseStringToComma();
+ StringRef parseStringToComma();
- bool ParseAssignment(StringRef Name, bool allow_redef,
+ bool parseAssignment(StringRef Name, bool allow_redef,
bool NoDeadStrip = false);
- bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
- bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
- bool ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
- bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
+ bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
+ bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
+ bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
- bool ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
+ bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
// Generic (target and platform independent) directive parsing.
enum DirectiveKind {
@@ -342,7 +342,7 @@ private:
DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW,
DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR,
DK_BUNDLE_ALIGN_MODE, DK_BUNDLE_LOCK, DK_BUNDLE_UNLOCK,
- DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL, DK_INDIRECT_SYMBOL,
+ DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL,
DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER, DK_PRIVATE_EXTERN,
DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT,
@@ -355,112 +355,113 @@ private:
DK_CFI_OFFSET, DK_CFI_REL_OFFSET, DK_CFI_PERSONALITY, DK_CFI_LSDA,
DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE,
DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED,
- DK_CFI_REGISTER,
+ DK_CFI_REGISTER, DK_CFI_WINDOW_SAVE,
DK_MACROS_ON, DK_MACROS_OFF, DK_MACRO, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
DK_SLEB128, DK_ULEB128
};
- /// DirectiveKindMap - Maps directive name --> DirectiveKind enum, for
+ /// \brief Maps directive name --> DirectiveKind enum, for
/// directives parsed by this class.
StringMap<DirectiveKind> DirectiveKindMap;
// ".ascii", ".asciz", ".string"
- bool ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
- bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ...
- bool ParseDirectiveRealValue(const fltSemantics &); // ".single", ...
- bool ParseDirectiveFill(); // ".fill"
- bool ParseDirectiveZero(); // ".zero"
+ bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+ bool parseDirectiveValue(unsigned Size); // ".byte", ".long", ...
+ bool parseDirectiveRealValue(const fltSemantics &); // ".single", ...
+ bool parseDirectiveFill(); // ".fill"
+ bool parseDirectiveZero(); // ".zero"
// ".set", ".equ", ".equiv"
- bool ParseDirectiveSet(StringRef IDVal, bool allow_redef);
- bool ParseDirectiveOrg(); // ".org"
+ bool parseDirectiveSet(StringRef IDVal, bool allow_redef);
+ bool parseDirectiveOrg(); // ".org"
// ".align{,32}", ".p2align{,w,l}"
- bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize);
+ bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize);
// ".file", ".line", ".loc", ".stabs"
- bool ParseDirectiveFile(SMLoc DirectiveLoc);
- bool ParseDirectiveLine();
- bool ParseDirectiveLoc();
- bool ParseDirectiveStabs();
+ bool parseDirectiveFile(SMLoc DirectiveLoc);
+ bool parseDirectiveLine();
+ bool parseDirectiveLoc();
+ bool parseDirectiveStabs();
// .cfi directives
- bool ParseDirectiveCFIRegister(SMLoc DirectiveLoc);
- bool ParseDirectiveCFISections();
- bool ParseDirectiveCFIStartProc();
- bool ParseDirectiveCFIEndProc();
- bool ParseDirectiveCFIDefCfaOffset();
- bool ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
- bool ParseDirectiveCFIAdjustCfaOffset();
- bool ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
- bool ParseDirectiveCFIOffset(SMLoc DirectiveLoc);
- bool ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
- bool ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
- bool ParseDirectiveCFIRememberState();
- bool ParseDirectiveCFIRestoreState();
- bool ParseDirectiveCFISameValue(SMLoc DirectiveLoc);
- bool ParseDirectiveCFIRestore(SMLoc DirectiveLoc);
- bool ParseDirectiveCFIEscape();
- bool ParseDirectiveCFISignalFrame();
- bool ParseDirectiveCFIUndefined(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIWindowSave();
+ bool parseDirectiveCFISections();
+ bool parseDirectiveCFIStartProc();
+ bool parseDirectiveCFIEndProc();
+ bool parseDirectiveCFIDefCfaOffset();
+ bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIAdjustCfaOffset();
+ bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
+ bool parseDirectiveCFIRememberState();
+ bool parseDirectiveCFIRestoreState();
+ bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIEscape();
+ bool parseDirectiveCFISignalFrame();
+ bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);
// macro directives
- bool ParseDirectivePurgeMacro(SMLoc DirectiveLoc);
- bool ParseDirectiveEndMacro(StringRef Directive);
- bool ParseDirectiveMacro(SMLoc DirectiveLoc);
- bool ParseDirectiveMacrosOnOff(StringRef Directive);
+ bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
+ bool parseDirectiveEndMacro(StringRef Directive);
+ bool parseDirectiveMacro(SMLoc DirectiveLoc);
+ bool parseDirectiveMacrosOnOff(StringRef Directive);
// ".bundle_align_mode"
- bool ParseDirectiveBundleAlignMode();
+ bool parseDirectiveBundleAlignMode();
// ".bundle_lock"
- bool ParseDirectiveBundleLock();
+ bool parseDirectiveBundleLock();
// ".bundle_unlock"
- bool ParseDirectiveBundleUnlock();
+ bool parseDirectiveBundleUnlock();
// ".space", ".skip"
- bool ParseDirectiveSpace(StringRef IDVal);
+ bool parseDirectiveSpace(StringRef IDVal);
// .sleb128 (Signed=true) and .uleb128 (Signed=false)
- bool ParseDirectiveLEB128(bool Signed);
+ bool parseDirectiveLEB128(bool Signed);
- /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which
+ /// \brief Parse a directive like ".globl" which
/// accepts a single symbol (which should be a label or an external).
- bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr);
+ bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
- bool ParseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
+ bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
- bool ParseDirectiveAbort(); // ".abort"
- bool ParseDirectiveInclude(); // ".include"
- bool ParseDirectiveIncbin(); // ".incbin"
+ bool parseDirectiveAbort(); // ".abort"
+ bool parseDirectiveInclude(); // ".include"
+ bool parseDirectiveIncbin(); // ".incbin"
- bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if"
+ bool parseDirectiveIf(SMLoc DirectiveLoc); // ".if"
// ".ifb" or ".ifnb", depending on ExpectBlank.
- bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+ bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
// ".ifc" or ".ifnc", depending on ExpectEqual.
- bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
+ bool parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
// ".ifdef" or ".ifndef", depending on expect_defined
- bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
- bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
- bool ParseDirectiveElse(SMLoc DirectiveLoc); // ".else"
- bool ParseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
+ bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
+ bool parseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
+ bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
+ bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
virtual bool parseEscapedString(std::string &Data);
- const MCExpr *ApplyModifierToExpr(const MCExpr *E,
+ const MCExpr *applyModifierToExpr(const MCExpr *E,
MCSymbolRefExpr::VariantKind Variant);
// Macro-like directives
- MCAsmMacro *ParseMacroLikeBody(SMLoc DirectiveLoc);
- void InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+ MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
+ void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
raw_svector_ostream &OS);
- bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
- bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
- bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
- bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
+ bool parseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
+ bool parseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
+ bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
+ bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
// "_emit" or "__emit"
- bool ParseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
+ bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
size_t Len);
// "align"
- bool ParseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
+ bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
void initializeDirectiveKindMap();
};
@@ -476,12 +477,12 @@ extern MCAsmParserExtension *createCOFFAsmParser();
enum { DEFAULT_ADDRSPACE = 0 };
-AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
- MCStreamer &_Out, const MCAsmInfo &_MAI)
- : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
- PlatformParser(0),
- CurBuffer(0), MacrosEnabledFlag(true), CppHashLineNumber(0),
- AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
+AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out,
+ const MCAsmInfo &_MAI)
+ : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
+ PlatformParser(0), CurBuffer(0), MacrosEnabledFlag(true),
+ CppHashLineNumber(0), AssemblerDialect(~0U), IsDarwin(false),
+ ParsingInlineAsm(false) {
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
SavedDiagContext = SrcMgr.getDiagContext();
@@ -512,37 +513,40 @@ AsmParser::~AsmParser() {
assert(ActiveMacros.empty() && "Unexpected active macro instantiation!");
// Destroy any macros.
- for (StringMap<MCAsmMacro*>::iterator it = MacroMap.begin(),
- ie = MacroMap.end(); it != ie; ++it)
+ for (StringMap<MCAsmMacro *>::iterator it = MacroMap.begin(),
+ ie = MacroMap.end();
+ it != ie; ++it)
delete it->getValue();
delete PlatformParser;
}
-void AsmParser::PrintMacroInstantiations() {
+void AsmParser::printMacroInstantiations() {
// Print the active macro instantiation stack.
- for (std::vector<MacroInstantiation*>::const_reverse_iterator
- it = ActiveMacros.rbegin(), ie = ActiveMacros.rend(); it != ie; ++it)
- PrintMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
+ for (std::vector<MacroInstantiation *>::const_reverse_iterator
+ it = ActiveMacros.rbegin(),
+ ie = ActiveMacros.rend();
+ it != ie; ++it)
+ printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
"while in macro instantiation");
}
bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
if (FatalAssemblerWarnings)
return Error(L, Msg, Ranges);
- PrintMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
- PrintMacroInstantiations();
+ printMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
+ printMacroInstantiations();
return false;
}
bool AsmParser::Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
HadError = true;
- PrintMessage(L, SourceMgr::DK_Error, Msg, Ranges);
- PrintMacroInstantiations();
+ printMessage(L, SourceMgr::DK_Error, Msg, Ranges);
+ printMacroInstantiations();
return true;
}
-bool AsmParser::EnterIncludeFile(const std::string &Filename) {
+bool AsmParser::enterIncludeFile(const std::string &Filename) {
std::string IncludedFile;
int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
if (NewBuf == -1)
@@ -558,7 +562,7 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) {
/// Process the specified .incbin file by searching for it in the include paths
/// then just emitting the byte contents of the file to the streamer. This
/// returns true on failure.
-bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
+bool AsmParser::processIncbinFile(const std::string &Filename) {
std::string IncludedFile;
int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
if (NewBuf == -1)
@@ -569,7 +573,7 @@ bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
return false;
}
-void AsmParser::JumpToLoc(SMLoc Loc, int InBuffer) {
+void AsmParser::jumpToLoc(SMLoc Loc, int InBuffer) {
if (InBuffer != -1) {
CurBuffer = InBuffer;
} else {
@@ -586,7 +590,7 @@ const AsmToken &AsmParser::Lex() {
// include stack.
SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
if (ParentIncludeLoc != SMLoc()) {
- JumpToLoc(ParentIncludeLoc);
+ jumpToLoc(ParentIncludeLoc);
tok = &Lexer.Lex();
}
}
@@ -623,7 +627,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// While we have input, parse each statement.
while (Lexer.isNot(AsmToken::Eof)) {
ParseStatementInfo Info;
- if (!ParseStatement(Info)) continue;
+ if (!parseStatement(Info))
+ continue;
// We had an error, validate that one was emitted and recover by skipping to
// the next line.
@@ -637,7 +642,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// Check to see there are no empty DwarfFile slots.
const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
- getContext().getMCDwarfFiles();
+ getContext().getMCDwarfFiles();
for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
if (!MCDwarfFiles[i])
TokError("unassigned file number: " + Twine(i) + " for .file directives");
@@ -650,7 +655,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) {
const MCContext::SymbolTable &Symbols = getContext().getSymbols();
for (MCContext::SymbolTable::const_iterator i = Symbols.begin(),
- e = Symbols.end();
+ e = Symbols.end();
i != e; ++i) {
MCSymbol *Sym = i->getValue();
// Variable symbols may not be marked as defined, so check those
@@ -660,13 +665,12 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// FIXME: We would really like to refer back to where the symbol was
// first referenced for a source location. We need to add something
// to track that. Currently, we just point to the end of the file.
- PrintMessage(getLexer().getLoc(), SourceMgr::DK_Error,
- "assembler local symbol '" + Sym->getName() +
- "' not defined");
+ printMessage(
+ getLexer().getLoc(), SourceMgr::DK_Error,
+ "assembler local symbol '" + Sym->getName() + "' not defined");
}
}
-
// Finalize the output stream if there are no errors and if the client wants
// us to.
if (!HadError && !NoFinalize)
@@ -682,10 +686,9 @@ void AsmParser::checkForValidSection() {
}
}
-/// eatToEndOfStatement - Throw away the rest of the line for testing purposes.
+/// \brief Throw away the rest of the line for testing purposes.
void AsmParser::eatToEndOfStatement() {
- while (Lexer.isNot(AsmToken::EndOfStatement) &&
- Lexer.isNot(AsmToken::Eof))
+ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
Lex();
// Eat EOL.
@@ -696,33 +699,32 @@ void AsmParser::eatToEndOfStatement() {
StringRef AsmParser::parseStringToEndOfStatement() {
const char *Start = getTok().getLoc().getPointer();
- while (Lexer.isNot(AsmToken::EndOfStatement) &&
- Lexer.isNot(AsmToken::Eof))
+ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
Lex();
const char *End = getTok().getLoc().getPointer();
return StringRef(Start, End - Start);
}
-StringRef AsmParser::ParseStringToComma() {
+StringRef AsmParser::parseStringToComma() {
const char *Start = getTok().getLoc().getPointer();
while (Lexer.isNot(AsmToken::EndOfStatement) &&
- Lexer.isNot(AsmToken::Comma) &&
- Lexer.isNot(AsmToken::Eof))
+ Lexer.isNot(AsmToken::Comma) && Lexer.isNot(AsmToken::Eof))
Lex();
const char *End = getTok().getLoc().getPointer();
return StringRef(Start, End - Start);
}
-/// ParseParenExpr - Parse a paren expression and return it.
+/// \brief Parse a paren expression and return it.
/// NOTE: This assumes the leading '(' has already been consumed.
///
/// parenexpr ::= expr)
///
-bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
- if (parseExpression(Res)) return true;
+bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+ if (parseExpression(Res))
+ return true;
if (Lexer.isNot(AsmToken::RParen))
return TokError("expected ')' in parentheses expression");
EndLoc = Lexer.getTok().getEndLoc();
@@ -730,13 +732,14 @@ bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
return false;
}
-/// ParseBracketExpr - Parse a bracket expression and return it.
+/// \brief Parse a bracket expression and return it.
/// NOTE: This assumes the leading '[' has already been consumed.
///
/// bracketexpr ::= expr]
///
-bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
- if (parseExpression(Res)) return true;
+bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+ if (parseExpression(Res))
+ return true;
if (Lexer.isNot(AsmToken::RBrac))
return TokError("expected ']' in brackets expression");
EndLoc = Lexer.getTok().getEndLoc();
@@ -744,13 +747,13 @@ bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
return false;
}
-/// ParsePrimaryExpr - Parse a primary expression and return it.
+/// \brief Parse a primary expression and return it.
/// primaryexpr ::= (parenexpr
/// primaryexpr ::= symbol
/// primaryexpr ::= number
/// primaryexpr ::= '.'
/// primaryexpr ::= ~,+,- primaryexpr
-bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
SMLoc FirstTokenLoc = getLexer().getLoc();
AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
switch (FirstTokenKind) {
@@ -761,36 +764,54 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
return true;
case AsmToken::Exclaim:
Lex(); // Eat the operator.
- if (ParsePrimaryExpr(Res, EndLoc))
+ if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::CreateLNot(Res, getContext());
return false;
case AsmToken::Dollar:
+ case AsmToken::At:
case AsmToken::String:
case AsmToken::Identifier: {
StringRef Identifier;
if (parseIdentifier(Identifier)) {
- if (FirstTokenKind == AsmToken::Dollar)
- return Error(FirstTokenLoc, "invalid token in expression");
- return true;
+ if (FirstTokenKind == AsmToken::Dollar) {
+ if (Lexer.getMAI().getDollarIsPC()) {
+ // This is a '$' reference, which references the current PC. Emit a
+ // temporary label to the streamer and refer to it.
+ MCSymbol *Sym = Ctx.CreateTempSymbol();
+ Out.EmitLabel(Sym);
+ Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+ getContext());
+ EndLoc = FirstTokenLoc;
+ return false;
+ } else
+ return Error(FirstTokenLoc, "invalid token in expression");
+ return true;
+ }
}
EndLoc = SMLoc::getFromPointer(Identifier.end());
// This is a symbol reference.
+ StringRef SymbolName = Identifier;
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
std::pair<StringRef, StringRef> Split = Identifier.split('@');
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first);
// Lookup the symbol variant if used.
- MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
if (Split.first.size() != Identifier.size()) {
Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
- if (Variant == MCSymbolRefExpr::VK_Invalid) {
+ if (Variant != MCSymbolRefExpr::VK_Invalid) {
+ SymbolName = Split.first;
+ } else if (MAI.doesAllowAtInName()) {
+ Variant = MCSymbolRefExpr::VK_None;
+ } else {
Variant = MCSymbolRefExpr::VK_None;
return TokError("invalid variant '" + Split.second + "'");
}
}
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName);
+
// If this is an absolute variable reference, substitute it now to preserve
// semantics in the face of reassignment.
if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) {
@@ -823,11 +844,11 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Variant = MCSymbolRefExpr::VK_None;
return TokError("invalid variant '" + Split.second + "'");
}
- IDVal = Split.first;
+ IDVal = Split.first;
}
- if (IDVal == "f" || IDVal == "b"){
- MCSymbol *Sym = Ctx.GetDirectionalLocalSymbol(IntVal,
- IDVal == "f" ? 1 : 0);
+ if (IDVal == "f" || IDVal == "b") {
+ MCSymbol *Sym =
+ Ctx.GetDirectionalLocalSymbol(IntVal, IDVal == "f" ? 1 : 0);
Res = MCSymbolRefExpr::Create(Sym, Variant, getContext());
if (IDVal == "b" && Sym->isUndefined())
return Error(Loc, "invalid reference to undefined symbol");
@@ -857,27 +878,27 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
}
case AsmToken::LParen:
Lex(); // Eat the '('.
- return ParseParenExpr(Res, EndLoc);
+ return parseParenExpr(Res, EndLoc);
case AsmToken::LBrac:
if (!PlatformParser->HasBracketExpressions())
return TokError("brackets expression not supported on this target");
Lex(); // Eat the '['.
- return ParseBracketExpr(Res, EndLoc);
+ return parseBracketExpr(Res, EndLoc);
case AsmToken::Minus:
Lex(); // Eat the operator.
- if (ParsePrimaryExpr(Res, EndLoc))
+ if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::CreateMinus(Res, getContext());
return false;
case AsmToken::Plus:
Lex(); // Eat the operator.
- if (ParsePrimaryExpr(Res, EndLoc))
+ if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::CreatePlus(Res, getContext());
return false;
case AsmToken::Tilde:
Lex(); // Eat the operator.
- if (ParsePrimaryExpr(Res, EndLoc))
+ if (parsePrimaryExpr(Res, EndLoc))
return true;
Res = MCUnaryExpr::CreateNot(Res, getContext());
return false;
@@ -889,13 +910,13 @@ bool AsmParser::parseExpression(const MCExpr *&Res) {
return parseExpression(Res, EndLoc);
}
-bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
- return ParsePrimaryExpr(Res, EndLoc);
-}
-
const MCExpr *
-AsmParser::ApplyModifierToExpr(const MCExpr *E,
+AsmParser::applyModifierToExpr(const MCExpr *E,
MCSymbolRefExpr::VariantKind Variant) {
+ // Ask the target implementation about this expression first.
+ const MCExpr *NewE = getTargetParser().applyModifierToExpr(E, Variant, Ctx);
+ if (NewE)
+ return NewE;
// Recurse over the given expression, rebuilding it to apply the given variant
// if there is exactly one symbol.
switch (E->getKind()) {
@@ -907,8 +928,8 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
if (SRE->getKind() != MCSymbolRefExpr::VK_None) {
- TokError("invalid variant on expression '" +
- getTok().getIdentifier() + "' (already modified)");
+ TokError("invalid variant on expression '" + getTok().getIdentifier() +
+ "' (already modified)");
return E;
}
@@ -917,7 +938,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
case MCExpr::Unary: {
const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
- const MCExpr *Sub = ApplyModifierToExpr(UE->getSubExpr(), Variant);
+ const MCExpr *Sub = applyModifierToExpr(UE->getSubExpr(), Variant);
if (!Sub)
return 0;
return MCUnaryExpr::Create(UE->getOpcode(), Sub, getContext());
@@ -925,14 +946,16 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
case MCExpr::Binary: {
const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
- const MCExpr *LHS = ApplyModifierToExpr(BE->getLHS(), Variant);
- const MCExpr *RHS = ApplyModifierToExpr(BE->getRHS(), Variant);
+ const MCExpr *LHS = applyModifierToExpr(BE->getLHS(), Variant);
+ const MCExpr *RHS = applyModifierToExpr(BE->getRHS(), Variant);
if (!LHS && !RHS)
return 0;
- if (!LHS) LHS = BE->getLHS();
- if (!RHS) RHS = BE->getRHS();
+ if (!LHS)
+ LHS = BE->getLHS();
+ if (!RHS)
+ RHS = BE->getRHS();
return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
}
@@ -941,7 +964,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
llvm_unreachable("Invalid expression kind!");
}
-/// parseExpression - Parse an expression and return it.
+/// \brief Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
/// expr ::= expr |,^,&,! expr
@@ -954,7 +977,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
// Parse the expression.
Res = 0;
- if (ParsePrimaryExpr(Res, EndLoc) || ParseBinOpRHS(1, Res, EndLoc))
+ if (parsePrimaryExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc))
return true;
// As a special case, we support 'a op b @ modifier' by rewriting the
@@ -967,11 +990,11 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
return TokError("unexpected symbol modifier following '@'");
MCSymbolRefExpr::VariantKind Variant =
- MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
+ MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
if (Variant == MCSymbolRefExpr::VK_Invalid)
return TokError("invalid variant '" + getTok().getIdentifier() + "'");
- const MCExpr *ModifiedRes = ApplyModifierToExpr(Res, Variant);
+ const MCExpr *ModifiedRes = applyModifierToExpr(Res, Variant);
if (!ModifiedRes) {
return TokError("invalid modifier '" + getTok().getIdentifier() +
"' (no symbols present)");
@@ -991,8 +1014,7 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
Res = 0;
- return ParseParenExpr(Res, EndLoc) ||
- ParseBinOpRHS(1, Res, EndLoc);
+ return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc);
}
bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
@@ -1012,9 +1034,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
MCBinaryExpr::Opcode &Kind) {
switch (K) {
default:
- return 0; // not a binop.
+ return 0; // not a binop.
- // Lowest Precedence: &&, ||
+ // Lowest Precedence: &&, ||
case AsmToken::AmpAmp:
Kind = MCBinaryExpr::LAnd;
return 1;
@@ -1022,10 +1044,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
Kind = MCBinaryExpr::LOr;
return 1;
-
- // Low Precedence: |, &, ^
- //
- // FIXME: gas seems to support '!' as an infix operator?
+ // Low Precedence: |, &, ^
+ //
+ // FIXME: gas seems to support '!' as an infix operator?
case AsmToken::Pipe:
Kind = MCBinaryExpr::Or;
return 2;
@@ -1036,7 +1057,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
Kind = MCBinaryExpr::And;
return 2;
- // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
+ // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
case AsmToken::EqualEqual:
Kind = MCBinaryExpr::EQ;
return 3;
@@ -1057,7 +1078,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
Kind = MCBinaryExpr::GTE;
return 3;
- // Intermediate Precedence: <<, >>
+ // Intermediate Precedence: <<, >>
case AsmToken::LessLess:
Kind = MCBinaryExpr::Shl;
return 4;
@@ -1065,7 +1086,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
Kind = MCBinaryExpr::Shr;
return 4;
- // High Intermediate Precedence: +, -
+ // High Intermediate Precedence: +, -
case AsmToken::Plus:
Kind = MCBinaryExpr::Add;
return 5;
@@ -1073,7 +1094,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
Kind = MCBinaryExpr::Sub;
return 5;
- // Highest Precedence: *, /, %
+ // Highest Precedence: *, /, %
case AsmToken::Star:
Kind = MCBinaryExpr::Mul;
return 6;
@@ -1086,10 +1107,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
}
}
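
The precedence levels returned above (1 for &&/|| up through 6 for *, /, %) drive the precedence-climbing loop in parseBinOpRHS below. As a rough illustration of why that loop groups "1 + 2 * 3" as "1 + (2 * 3)", here is a minimal self-contained sketch over single-digit integers (the Toy type and its members are made up for the example and unrelated to MCExpr):

#include <cassert>
#include <cctype>

// Tiny precedence-climbing evaluator over single-digit numbers and '+', '-', '*'.
struct Toy {
  const char *P;
  static int prec(char Op) {
    return Op == '*' ? 6 : (Op == '+' || Op == '-') ? 5 : 0;
  }
  long primary() {
    assert(isdigit((unsigned char)*P) && "expected a digit");
    return *P++ - '0';
  }
  long binOpRHS(int MinPrec, long LHS) {
    while (true) {
      int TokPrec = prec(*P);
      if (TokPrec < MinPrec)          // next operator binds too weakly; hand LHS back
        return LHS;
      char Op = *P++;
      long RHS = primary();
      if (TokPrec < prec(*P))         // the following operator binds tighter:
        RHS = binOpRHS(TokPrec + 1, RHS); // fold it into RHS first
      LHS = Op == '*' ? LHS * RHS : Op == '+' ? LHS + RHS : LHS - RHS;
    }
  }
  long expr() { long LHS = primary(); return binOpRHS(1, LHS); }
};
// Toy E = { "1+2*3" }; E.expr() == 7, because '*' (6) outranks '+' (5).
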
-
-/// ParseBinOpRHS - Parse all binary operators with precedence >= 'Precedence'.
+/// \brief Parse all binary operators with precedence >= 'Precedence'.
/// Res contains the LHS of the expression on input.
-bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
+bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
SMLoc &EndLoc) {
while (1) {
MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
@@ -1104,15 +1124,15 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
// Eat the next primary expression.
const MCExpr *RHS;
- if (ParsePrimaryExpr(RHS, EndLoc)) return true;
+ if (parsePrimaryExpr(RHS, EndLoc))
+ return true;
// If BinOp binds less tightly with RHS than the operator after RHS, let
// the pending operator take RHS as its LHS.
MCBinaryExpr::Opcode Dummy;
unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
- if (TokPrec < NextTokPrec) {
- if (ParseBinOpRHS(TokPrec+1, RHS, EndLoc)) return true;
- }
+ if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
+ return true;
// Merge LHS and RHS according to operator.
Res = MCBinaryExpr::Create(Kind, Res, RHS, getContext());
@@ -1123,7 +1143,7 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
/// ::= EndOfStatement
/// ::= Label* Directive ...Operands... EndOfStatement
/// ::= Label* Identifier OperandList* EndOfStatement
-bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
+bool AsmParser::parseStatement(ParseStatementInfo &Info) {
if (Lexer.is(AsmToken::EndOfStatement)) {
Out.AddBlankLine();
Lex();
@@ -1137,7 +1157,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
int64_t LocalLabelVal = -1;
// A full line comment is a '#' as the first token.
if (Lexer.is(AsmToken::Hash))
- return ParseCppHashLineFilenameComment(IDLoc);
+ return parseCppHashLineFilenameComment(IDLoc);
// Allow an integer followed by a ':' as a directional local label.
if (Lexer.is(AsmToken::Integer)) {
@@ -1168,34 +1188,34 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
// have to do this so that .endif isn't skipped in a ".if 0" block for
// example.
StringMap<DirectiveKind>::const_iterator DirKindIt =
- DirectiveKindMap.find(IDVal);
- DirectiveKind DirKind =
- (DirKindIt == DirectiveKindMap.end()) ? DK_NO_DIRECTIVE :
- DirKindIt->getValue();
+ DirectiveKindMap.find(IDVal);
+ DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
+ ? DK_NO_DIRECTIVE
+ : DirKindIt->getValue();
switch (DirKind) {
- default:
- break;
- case DK_IF:
- return ParseDirectiveIf(IDLoc);
- case DK_IFB:
- return ParseDirectiveIfb(IDLoc, true);
- case DK_IFNB:
- return ParseDirectiveIfb(IDLoc, false);
- case DK_IFC:
- return ParseDirectiveIfc(IDLoc, true);
- case DK_IFNC:
- return ParseDirectiveIfc(IDLoc, false);
- case DK_IFDEF:
- return ParseDirectiveIfdef(IDLoc, true);
- case DK_IFNDEF:
- case DK_IFNOTDEF:
- return ParseDirectiveIfdef(IDLoc, false);
- case DK_ELSEIF:
- return ParseDirectiveElseIf(IDLoc);
- case DK_ELSE:
- return ParseDirectiveElse(IDLoc);
- case DK_ENDIF:
- return ParseDirectiveEndIf(IDLoc);
+ default:
+ break;
+ case DK_IF:
+ return parseDirectiveIf(IDLoc);
+ case DK_IFB:
+ return parseDirectiveIfb(IDLoc, true);
+ case DK_IFNB:
+ return parseDirectiveIfb(IDLoc, false);
+ case DK_IFC:
+ return parseDirectiveIfc(IDLoc, true);
+ case DK_IFNC:
+ return parseDirectiveIfc(IDLoc, false);
+ case DK_IFDEF:
+ return parseDirectiveIfdef(IDLoc, true);
+ case DK_IFNDEF:
+ case DK_IFNOTDEF:
+ return parseDirectiveIfdef(IDLoc, false);
+ case DK_ELSEIF:
+ return parseDirectiveElseIf(IDLoc);
+ case DK_ELSE:
+ return parseDirectiveElse(IDLoc);
+ case DK_ENDIF:
+ return parseDirectiveEndIf(IDLoc);
}
// Ignore the statement if in the middle of inactive conditional
@@ -1242,6 +1262,8 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
IDLoc);
+ getTargetParser().onLabelParsed(Sym);
+
// Consume any end of statement token, if present, to avoid spurious
    // AddBlankLine() calls.
if (Lexer.is(AsmToken::EndOfStatement)) {
@@ -1257,24 +1279,24 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
// identifier '=' ... -> assignment statement
Lex();
- return ParseAssignment(IDVal, true);
+ return parseAssignment(IDVal, true);
default: // Normal instruction or directive.
break;
}
// If macros are enabled, check to see if this is a macro instantiation.
- if (MacrosEnabled())
- if (const MCAsmMacro *M = LookupMacro(IDVal)) {
- return HandleMacroEntry(M, IDLoc);
+ if (areMacrosEnabled())
+ if (const MCAsmMacro *M = lookupMacro(IDVal)) {
+ return handleMacroEntry(M, IDLoc);
}
// Otherwise, we have a normal instruction or directive.
-
+
// Directives start with "."
if (IDVal[0] == '.' && IDVal != ".") {
// There are several entities interested in parsing directives:
- //
+ //
// 1. The target-specific assembly parser. Some directives are target
// specific or may potentially behave differently on certain targets.
// 2. Asm parser extensions. For example, platform-specific parsers
@@ -1291,185 +1313,185 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
    // Next, check the extension directive map to see if any extension has
// registered itself to parse this directive.
- std::pair<MCAsmParserExtension*, DirectiveHandler> Handler =
- ExtensionDirectiveMap.lookup(IDVal);
+ std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
+ ExtensionDirectiveMap.lookup(IDVal);
if (Handler.first)
return (*Handler.second)(Handler.first, IDVal, IDLoc);
// Finally, if no one else is interested in this directive, it must be
// generic and familiar to this class.
switch (DirKind) {
- default:
- break;
- case DK_SET:
- case DK_EQU:
- return ParseDirectiveSet(IDVal, true);
- case DK_EQUIV:
- return ParseDirectiveSet(IDVal, false);
- case DK_ASCII:
- return ParseDirectiveAscii(IDVal, false);
- case DK_ASCIZ:
- case DK_STRING:
- return ParseDirectiveAscii(IDVal, true);
- case DK_BYTE:
- return ParseDirectiveValue(1);
- case DK_SHORT:
- case DK_VALUE:
- case DK_2BYTE:
- return ParseDirectiveValue(2);
- case DK_LONG:
- case DK_INT:
- case DK_4BYTE:
- return ParseDirectiveValue(4);
- case DK_QUAD:
- case DK_8BYTE:
- return ParseDirectiveValue(8);
- case DK_SINGLE:
- case DK_FLOAT:
- return ParseDirectiveRealValue(APFloat::IEEEsingle);
- case DK_DOUBLE:
- return ParseDirectiveRealValue(APFloat::IEEEdouble);
- case DK_ALIGN: {
- bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
- return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1);
- }
- case DK_ALIGN32: {
- bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
- return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4);
- }
- case DK_BALIGN:
- return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
- case DK_BALIGNW:
- return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
- case DK_BALIGNL:
- return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
- case DK_P2ALIGN:
- return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
- case DK_P2ALIGNW:
- return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
- case DK_P2ALIGNL:
- return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
- case DK_ORG:
- return ParseDirectiveOrg();
- case DK_FILL:
- return ParseDirectiveFill();
- case DK_ZERO:
- return ParseDirectiveZero();
- case DK_EXTERN:
- eatToEndOfStatement(); // .extern is the default, ignore it.
- return false;
- case DK_GLOBL:
- case DK_GLOBAL:
- return ParseDirectiveSymbolAttribute(MCSA_Global);
- case DK_INDIRECT_SYMBOL:
- return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol);
- case DK_LAZY_REFERENCE:
- return ParseDirectiveSymbolAttribute(MCSA_LazyReference);
- case DK_NO_DEAD_STRIP:
- return ParseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
- case DK_SYMBOL_RESOLVER:
- return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver);
- case DK_PRIVATE_EXTERN:
- return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern);
- case DK_REFERENCE:
- return ParseDirectiveSymbolAttribute(MCSA_Reference);
- case DK_WEAK_DEFINITION:
- return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition);
- case DK_WEAK_REFERENCE:
- return ParseDirectiveSymbolAttribute(MCSA_WeakReference);
- case DK_WEAK_DEF_CAN_BE_HIDDEN:
- return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
- case DK_COMM:
- case DK_COMMON:
- return ParseDirectiveComm(/*IsLocal=*/false);
- case DK_LCOMM:
- return ParseDirectiveComm(/*IsLocal=*/true);
- case DK_ABORT:
- return ParseDirectiveAbort();
- case DK_INCLUDE:
- return ParseDirectiveInclude();
- case DK_INCBIN:
- return ParseDirectiveIncbin();
- case DK_CODE16:
- case DK_CODE16GCC:
- return TokError(Twine(IDVal) + " not supported yet");
- case DK_REPT:
- return ParseDirectiveRept(IDLoc);
- case DK_IRP:
- return ParseDirectiveIrp(IDLoc);
- case DK_IRPC:
- return ParseDirectiveIrpc(IDLoc);
- case DK_ENDR:
- return ParseDirectiveEndr(IDLoc);
- case DK_BUNDLE_ALIGN_MODE:
- return ParseDirectiveBundleAlignMode();
- case DK_BUNDLE_LOCK:
- return ParseDirectiveBundleLock();
- case DK_BUNDLE_UNLOCK:
- return ParseDirectiveBundleUnlock();
- case DK_SLEB128:
- return ParseDirectiveLEB128(true);
- case DK_ULEB128:
- return ParseDirectiveLEB128(false);
- case DK_SPACE:
- case DK_SKIP:
- return ParseDirectiveSpace(IDVal);
- case DK_FILE:
- return ParseDirectiveFile(IDLoc);
- case DK_LINE:
- return ParseDirectiveLine();
- case DK_LOC:
- return ParseDirectiveLoc();
- case DK_STABS:
- return ParseDirectiveStabs();
- case DK_CFI_SECTIONS:
- return ParseDirectiveCFISections();
- case DK_CFI_STARTPROC:
- return ParseDirectiveCFIStartProc();
- case DK_CFI_ENDPROC:
- return ParseDirectiveCFIEndProc();
- case DK_CFI_DEF_CFA:
- return ParseDirectiveCFIDefCfa(IDLoc);
- case DK_CFI_DEF_CFA_OFFSET:
- return ParseDirectiveCFIDefCfaOffset();
- case DK_CFI_ADJUST_CFA_OFFSET:
- return ParseDirectiveCFIAdjustCfaOffset();
- case DK_CFI_DEF_CFA_REGISTER:
- return ParseDirectiveCFIDefCfaRegister(IDLoc);
- case DK_CFI_OFFSET:
- return ParseDirectiveCFIOffset(IDLoc);
- case DK_CFI_REL_OFFSET:
- return ParseDirectiveCFIRelOffset(IDLoc);
- case DK_CFI_PERSONALITY:
- return ParseDirectiveCFIPersonalityOrLsda(true);
- case DK_CFI_LSDA:
- return ParseDirectiveCFIPersonalityOrLsda(false);
- case DK_CFI_REMEMBER_STATE:
- return ParseDirectiveCFIRememberState();
- case DK_CFI_RESTORE_STATE:
- return ParseDirectiveCFIRestoreState();
- case DK_CFI_SAME_VALUE:
- return ParseDirectiveCFISameValue(IDLoc);
- case DK_CFI_RESTORE:
- return ParseDirectiveCFIRestore(IDLoc);
- case DK_CFI_ESCAPE:
- return ParseDirectiveCFIEscape();
- case DK_CFI_SIGNAL_FRAME:
- return ParseDirectiveCFISignalFrame();
- case DK_CFI_UNDEFINED:
- return ParseDirectiveCFIUndefined(IDLoc);
- case DK_CFI_REGISTER:
- return ParseDirectiveCFIRegister(IDLoc);
- case DK_MACROS_ON:
- case DK_MACROS_OFF:
- return ParseDirectiveMacrosOnOff(IDVal);
- case DK_MACRO:
- return ParseDirectiveMacro(IDLoc);
- case DK_ENDM:
- case DK_ENDMACRO:
- return ParseDirectiveEndMacro(IDVal);
- case DK_PURGEM:
- return ParseDirectivePurgeMacro(IDLoc);
+ default:
+ break;
+ case DK_SET:
+ case DK_EQU:
+ return parseDirectiveSet(IDVal, true);
+ case DK_EQUIV:
+ return parseDirectiveSet(IDVal, false);
+ case DK_ASCII:
+ return parseDirectiveAscii(IDVal, false);
+ case DK_ASCIZ:
+ case DK_STRING:
+ return parseDirectiveAscii(IDVal, true);
+ case DK_BYTE:
+ return parseDirectiveValue(1);
+ case DK_SHORT:
+ case DK_VALUE:
+ case DK_2BYTE:
+ return parseDirectiveValue(2);
+ case DK_LONG:
+ case DK_INT:
+ case DK_4BYTE:
+ return parseDirectiveValue(4);
+ case DK_QUAD:
+ case DK_8BYTE:
+ return parseDirectiveValue(8);
+ case DK_SINGLE:
+ case DK_FLOAT:
+ return parseDirectiveRealValue(APFloat::IEEEsingle);
+ case DK_DOUBLE:
+ return parseDirectiveRealValue(APFloat::IEEEdouble);
+ case DK_ALIGN: {
+ bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
+ return parseDirectiveAlign(IsPow2, /*ExprSize=*/1);
+ }
+ case DK_ALIGN32: {
+ bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
+ return parseDirectiveAlign(IsPow2, /*ExprSize=*/4);
+ }
+ case DK_BALIGN:
+ return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
+ case DK_BALIGNW:
+ return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
+ case DK_BALIGNL:
+ return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
+ case DK_P2ALIGN:
+ return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
+ case DK_P2ALIGNW:
+ return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
+ case DK_P2ALIGNL:
+ return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
+ case DK_ORG:
+ return parseDirectiveOrg();
+ case DK_FILL:
+ return parseDirectiveFill();
+ case DK_ZERO:
+ return parseDirectiveZero();
+ case DK_EXTERN:
+ eatToEndOfStatement(); // .extern is the default, ignore it.
+ return false;
+ case DK_GLOBL:
+ case DK_GLOBAL:
+ return parseDirectiveSymbolAttribute(MCSA_Global);
+ case DK_LAZY_REFERENCE:
+ return parseDirectiveSymbolAttribute(MCSA_LazyReference);
+ case DK_NO_DEAD_STRIP:
+ return parseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
+ case DK_SYMBOL_RESOLVER:
+ return parseDirectiveSymbolAttribute(MCSA_SymbolResolver);
+ case DK_PRIVATE_EXTERN:
+ return parseDirectiveSymbolAttribute(MCSA_PrivateExtern);
+ case DK_REFERENCE:
+ return parseDirectiveSymbolAttribute(MCSA_Reference);
+ case DK_WEAK_DEFINITION:
+ return parseDirectiveSymbolAttribute(MCSA_WeakDefinition);
+ case DK_WEAK_REFERENCE:
+ return parseDirectiveSymbolAttribute(MCSA_WeakReference);
+ case DK_WEAK_DEF_CAN_BE_HIDDEN:
+ return parseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
+ case DK_COMM:
+ case DK_COMMON:
+ return parseDirectiveComm(/*IsLocal=*/false);
+ case DK_LCOMM:
+ return parseDirectiveComm(/*IsLocal=*/true);
+ case DK_ABORT:
+ return parseDirectiveAbort();
+ case DK_INCLUDE:
+ return parseDirectiveInclude();
+ case DK_INCBIN:
+ return parseDirectiveIncbin();
+ case DK_CODE16:
+ case DK_CODE16GCC:
+ return TokError(Twine(IDVal) + " not supported yet");
+ case DK_REPT:
+ return parseDirectiveRept(IDLoc);
+ case DK_IRP:
+ return parseDirectiveIrp(IDLoc);
+ case DK_IRPC:
+ return parseDirectiveIrpc(IDLoc);
+ case DK_ENDR:
+ return parseDirectiveEndr(IDLoc);
+ case DK_BUNDLE_ALIGN_MODE:
+ return parseDirectiveBundleAlignMode();
+ case DK_BUNDLE_LOCK:
+ return parseDirectiveBundleLock();
+ case DK_BUNDLE_UNLOCK:
+ return parseDirectiveBundleUnlock();
+ case DK_SLEB128:
+ return parseDirectiveLEB128(true);
+ case DK_ULEB128:
+ return parseDirectiveLEB128(false);
+ case DK_SPACE:
+ case DK_SKIP:
+ return parseDirectiveSpace(IDVal);
+ case DK_FILE:
+ return parseDirectiveFile(IDLoc);
+ case DK_LINE:
+ return parseDirectiveLine();
+ case DK_LOC:
+ return parseDirectiveLoc();
+ case DK_STABS:
+ return parseDirectiveStabs();
+ case DK_CFI_SECTIONS:
+ return parseDirectiveCFISections();
+ case DK_CFI_STARTPROC:
+ return parseDirectiveCFIStartProc();
+ case DK_CFI_ENDPROC:
+ return parseDirectiveCFIEndProc();
+ case DK_CFI_DEF_CFA:
+ return parseDirectiveCFIDefCfa(IDLoc);
+ case DK_CFI_DEF_CFA_OFFSET:
+ return parseDirectiveCFIDefCfaOffset();
+ case DK_CFI_ADJUST_CFA_OFFSET:
+ return parseDirectiveCFIAdjustCfaOffset();
+ case DK_CFI_DEF_CFA_REGISTER:
+ return parseDirectiveCFIDefCfaRegister(IDLoc);
+ case DK_CFI_OFFSET:
+ return parseDirectiveCFIOffset(IDLoc);
+ case DK_CFI_REL_OFFSET:
+ return parseDirectiveCFIRelOffset(IDLoc);
+ case DK_CFI_PERSONALITY:
+ return parseDirectiveCFIPersonalityOrLsda(true);
+ case DK_CFI_LSDA:
+ return parseDirectiveCFIPersonalityOrLsda(false);
+ case DK_CFI_REMEMBER_STATE:
+ return parseDirectiveCFIRememberState();
+ case DK_CFI_RESTORE_STATE:
+ return parseDirectiveCFIRestoreState();
+ case DK_CFI_SAME_VALUE:
+ return parseDirectiveCFISameValue(IDLoc);
+ case DK_CFI_RESTORE:
+ return parseDirectiveCFIRestore(IDLoc);
+ case DK_CFI_ESCAPE:
+ return parseDirectiveCFIEscape();
+ case DK_CFI_SIGNAL_FRAME:
+ return parseDirectiveCFISignalFrame();
+ case DK_CFI_UNDEFINED:
+ return parseDirectiveCFIUndefined(IDLoc);
+ case DK_CFI_REGISTER:
+ return parseDirectiveCFIRegister(IDLoc);
+ case DK_CFI_WINDOW_SAVE:
+ return parseDirectiveCFIWindowSave();
+ case DK_MACROS_ON:
+ case DK_MACROS_OFF:
+ return parseDirectiveMacrosOnOff(IDVal);
+ case DK_MACRO:
+ return parseDirectiveMacro(IDLoc);
+ case DK_ENDM:
+ case DK_ENDMACRO:
+ return parseDirectiveEndMacro(IDVal);
+ case DK_PURGEM:
+ return parseDirectivePurgeMacro(IDLoc);
}
return Error(IDLoc, "unknown directive");
@@ -1478,19 +1500,19 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
// __asm _emit or __asm __emit
if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
IDVal == "_EMIT" || IDVal == "__EMIT"))
- return ParseDirectiveMSEmit(IDLoc, Info, IDVal.size());
+ return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());
// __asm align
if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
- return ParseDirectiveMSAlign(IDLoc, Info);
+ return parseDirectiveMSAlign(IDLoc, Info);
checkForValidSection();
// Canonicalize the opcode to lower case.
std::string OpcodeStr = IDVal.lower();
ParseInstructionInfo IInfo(Info.AsmRewrites);
- bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr,
- IDLoc, Info.ParsedOperands);
+ bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, IDLoc,
+ Info.ParsedOperands);
Info.ParseError = HadError;
// Dump the parsed representation, if requested.
@@ -1505,7 +1527,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
}
OS << "]";
- PrintMessage(IDLoc, SourceMgr::DK_Note, OS.str());
+ printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
}
// If we are generating dwarf for assembly source files and the current
@@ -1513,49 +1535,49 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
// the instruction.
if (!HadError && getContext().getGenDwarfForAssembly() &&
getContext().getGenDwarfSection() ==
- getStreamer().getCurrentSection().first) {
+ getStreamer().getCurrentSection().first) {
unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
// If we previously parsed a cpp hash file line comment then make sure the
    // current Dwarf File is for the CppHashFilename; if not, emit the
// Dwarf File table for it and adjust the line number for the .loc.
- const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
- getContext().getMCDwarfFiles();
+ const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
+ getContext().getMCDwarfFiles();
if (CppHashFilename.size() != 0) {
if (MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() !=
CppHashFilename)
getStreamer().EmitDwarfFileDirective(
- getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename);
-
- // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
- // cache with the different Loc from the call above we save the last
- // info we queried here with SrcMgr.FindLineNumber().
- unsigned CppHashLocLineNo;
- if (LastQueryIDLoc == CppHashLoc && LastQueryBuffer == CppHashBuf)
- CppHashLocLineNo = LastQueryLine;
- else {
- CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CppHashBuf);
- LastQueryLine = CppHashLocLineNo;
- LastQueryIDLoc = CppHashLoc;
- LastQueryBuffer = CppHashBuf;
- }
- Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
+ getContext().nextGenDwarfFileNumber(), StringRef(),
+ CppHashFilename);
+
+ // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
+ // cache with the different Loc from the call above we save the last
+ // info we queried here with SrcMgr.FindLineNumber().
+ unsigned CppHashLocLineNo;
+ if (LastQueryIDLoc == CppHashLoc && LastQueryBuffer == CppHashBuf)
+ CppHashLocLineNo = LastQueryLine;
+ else {
+ CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CppHashBuf);
+ LastQueryLine = CppHashLocLineNo;
+ LastQueryIDLoc = CppHashLoc;
+ LastQueryBuffer = CppHashBuf;
+ }
+ Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
}
- getStreamer().EmitDwarfLocDirective(getContext().getGenDwarfFileNumber(),
- Line, 0, DWARF2_LINE_DEFAULT_IS_STMT ?
- DWARF2_FLAG_IS_STMT : 0, 0, 0,
- StringRef());
+ getStreamer().EmitDwarfLocDirective(
+ getContext().getGenDwarfFileNumber(), Line, 0,
+ DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
+ StringRef());
}
// If parsing succeeded, match the instruction.
if (!HadError) {
unsigned ErrorInfo;
- HadError = getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
- Info.ParsedOperands,
- Out, ErrorInfo,
- ParsingInlineAsm);
+ HadError = getTargetParser().MatchAndEmitInstruction(
+ IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
+ ParsingInlineAsm);
}
// Don't skip the rest of the line, the instruction parser is responsible for
@@ -1563,25 +1585,25 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
return false;
}
-/// EatToEndOfLine uses the Lexer to eat the characters to the end of the line
+/// eatToEndOfLine uses the Lexer to eat the characters to the end of the line
/// since they may not be able to be tokenized to get to the end of line token.
-void AsmParser::EatToEndOfLine() {
+void AsmParser::eatToEndOfLine() {
if (!Lexer.is(AsmToken::EndOfStatement))
Lexer.LexUntilEndOfLine();
- // Eat EOL.
- Lex();
+ // Eat EOL.
+ Lex();
}
-/// ParseCppHashLineFilenameComment as this:
+/// parseCppHashLineFilenameComment as this:
/// ::= # number "filename"
/// or just as a full line comment if it doesn't have a number and a string.
-bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
+bool AsmParser::parseCppHashLineFilenameComment(const SMLoc &L) {
Lex(); // Eat the hash token.
if (getLexer().isNot(AsmToken::Integer)) {
    // Consume the line, since in this case it is not a well-formed line
    // directive; treat it as if it were simply a full-line comment.
- EatToEndOfLine();
+ eatToEndOfLine();
return false;
}
@@ -1589,13 +1611,13 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
Lex();
if (getLexer().isNot(AsmToken::String)) {
- EatToEndOfLine();
+ eatToEndOfLine();
return false;
}
StringRef Filename = getTok().getString();
// Get rid of the enclosing quotes.
- Filename = Filename.substr(1, Filename.size()-2);
+ Filename = Filename.substr(1, Filename.size() - 2);
// Save the SMLoc, Filename and LineNumber for later use by diagnostics.
CppHashLoc = L;
@@ -1604,14 +1626,14 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
CppHashBuf = CurBuffer;
  // Ignore any trailing characters; they're just a comment.
- EatToEndOfLine();
+ eatToEndOfLine();
return false;
}
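
For reference, the comments this routine accepts are the line markers a C preprocessor leaves behind, of the form '# <number> "<filename>"'; anything else is swallowed as an ordinary full-line comment. A rough standalone sketch of that shape (a hypothetical helper over raw text, not the parser's token-based code):

#include <cctype>
#include <cstdlib>
#include <string>

// Returns true and fills LineNo/File for lines like:  # 42 "foo.c"  [junk...]
bool looksLikeLineMarker(const std::string &Line, unsigned &LineNo,
                         std::string &File) {
  size_t i = Line.find_first_not_of(" \t", 1);         // skip '#' and blanks
  if (i == std::string::npos || !isdigit((unsigned char)Line[i]))
    return false;
  LineNo = std::strtoul(Line.c_str() + i, nullptr, 10);
  size_t q1 = Line.find('"', i);
  size_t q2 = (q1 == std::string::npos) ? std::string::npos
                                        : Line.find('"', q1 + 1);
  if (q2 == std::string::npos)
    return false;
  File = Line.substr(q1 + 1, q2 - q1 - 1);
  return true;
}
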
-/// DiagHandler - will use the last parsed cpp hash line filename comment
+/// \brief will use the last parsed cpp hash line filename comment
/// for the Filename and LineNo if any in the diagnostic.
void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
- const AsmParser *Parser = static_cast<const AsmParser*>(Context);
+ const AsmParser *Parser = static_cast<const AsmParser *>(Context);
raw_ostream &OS = errs();
const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
@@ -1619,19 +1641,18 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
- // Like SourceMgr::PrintMessage() we need to print the include stack if any
+ // Like SourceMgr::printMessage() we need to print the include stack if any
// before printing the message.
int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) {
- SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
- DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
+ SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
+ DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
}
// If we have not parsed a cpp hash line filename comment or the source
// manager changed or buffer changed (like in a nested include) then just
// print the normal diagnostic using its Filename and LineNo.
- if (!Parser->CppHashLineNumber ||
- &DiagSrcMgr != &Parser->SrcMgr ||
+ if (!Parser->CppHashLineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
DiagBuf != CppHashBuf) {
if (Parser->SavedDiagHandler)
Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
@@ -1643,17 +1664,16 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
// Use the CppHashFilename and calculate a line number based on the
// CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for
// the diagnostic.
- const std::string Filename = Parser->CppHashFilename;
+ const std::string &Filename = Parser->CppHashFilename;
int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
int CppHashLocLineNo =
Parser->SrcMgr.FindLineNumber(Parser->CppHashLoc, CppHashBuf);
- int LineNo = Parser->CppHashLineNumber - 1 +
- (DiagLocLineNo - CppHashLocLineNo);
+ int LineNo =
+ Parser->CppHashLineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);
- SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(),
- Filename, LineNo, Diag.getColumnNo(),
- Diag.getKind(), Diag.getMessage(),
+ SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
+ Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
Diag.getLineContents(), Diag.getRanges());
if (Parser->SavedDiagHandler)
@@ -1673,8 +1693,7 @@ static bool isIdentifierChar(char c) {
bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
const MCAsmMacroParameters &Parameters,
- const MCAsmMacroArguments &A,
- const SMLoc &L) {
+ const MCAsmMacroArguments &A, const SMLoc &L) {
unsigned NParameters = Parameters.size();
if (NParameters != 0 && NParameters != A.size())
return Error(L, "Wrong number of arguments");
@@ -1710,27 +1729,28 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
break;
if (!NParameters) {
- switch (Body[Pos+1]) {
- // $$ => $
+ switch (Body[Pos + 1]) {
+ // $$ => $
case '$':
OS << '$';
break;
- // $n => number of arguments
+ // $n => number of arguments
case 'n':
OS << A.size();
break;
- // $[0-9] => argument
+ // $[0-9] => argument
default: {
// Missing arguments are ignored.
- unsigned Index = Body[Pos+1] - '0';
+ unsigned Index = Body[Pos + 1] - '0';
if (Index >= A.size())
break;
// Otherwise substitute with the token values, with spaces eliminated.
for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
- ie = A[Index].end(); it != ie; ++it)
+ ie = A[Index].end();
+ it != ie; ++it)
OS << it->getString();
break;
}
@@ -1741,23 +1761,24 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
while (isIdentifierChar(Body[I]) && I + 1 != End)
++I;
- const char *Begin = Body.data() + Pos +1;
- StringRef Argument(Begin, I - (Pos +1));
+ const char *Begin = Body.data() + Pos + 1;
+ StringRef Argument(Begin, I - (Pos + 1));
unsigned Index = 0;
for (; Index < NParameters; ++Index)
if (Parameters[Index].first == Argument)
break;
if (Index == NParameters) {
- if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
- Pos += 3;
- else {
- OS << '\\' << Argument;
- Pos = I;
- }
+ if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+ Pos += 3;
+ else {
+ OS << '\\' << Argument;
+ Pos = I;
+ }
} else {
for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
- ie = A[Index].end(); it != ie; ++it)
+ ie = A[Index].end();
+ it != ie; ++it)
if (it->getKind() == AsmToken::String)
OS << it->getStringContents();
else
@@ -1773,48 +1794,43 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
return false;
}
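
For macros defined without named parameters, the body substitution above treats '$$' as a literal '$', '$n' as the argument count, and '$<digit>' as a positional argument, with missing arguments silently dropped. A simplified sketch of that positional pass, using plain strings instead of token lists (illustrative only, not the parser's implementation):

#include <string>
#include <vector>

std::string expandPositional(const std::string &Body,
                             const std::vector<std::string> &Args) {
  std::string Out;
  for (size_t Pos = 0; Pos < Body.size(); ++Pos) {
    if (Body[Pos] != '$' || Pos + 1 == Body.size()) {
      Out += Body[Pos];
      continue;
    }
    char C = Body[Pos + 1];
    if (C == '$') {                      // $$ => literal $
      Out += '$';
    } else if (C == 'n') {               // $n => number of arguments
      Out += std::to_string(Args.size());
    } else if (C >= '0' && C <= '9') {   // $[0-9] => argument, missing ignored
      unsigned Index = C - '0';
      if (Index < Args.size())
        Out += Args[Index];
    } else {                             // anything else: keep the '$'
      Out += '$';
      continue;
    }
    ++Pos;                               // consumed the character after '$'
  }
  return Out;
}

For example, expanding the body ".long $0, $n" with the single argument "42" yields ".long 42, 1".
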
-MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL,
- int EB, SMLoc EL,
- MemoryBuffer *I)
- : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
- ExitLoc(EL)
-{
-}
+MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB,
+ SMLoc EL, MemoryBuffer *I)
+ : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
+ ExitLoc(EL) {}
-static bool IsOperator(AsmToken::TokenKind kind)
-{
- switch (kind)
- {
- default:
- return false;
- case AsmToken::Plus:
- case AsmToken::Minus:
- case AsmToken::Tilde:
- case AsmToken::Slash:
- case AsmToken::Star:
- case AsmToken::Dot:
- case AsmToken::Equal:
- case AsmToken::EqualEqual:
- case AsmToken::Pipe:
- case AsmToken::PipePipe:
- case AsmToken::Caret:
- case AsmToken::Amp:
- case AsmToken::AmpAmp:
- case AsmToken::Exclaim:
- case AsmToken::ExclaimEqual:
- case AsmToken::Percent:
- case AsmToken::Less:
- case AsmToken::LessEqual:
- case AsmToken::LessLess:
- case AsmToken::LessGreater:
- case AsmToken::Greater:
- case AsmToken::GreaterEqual:
- case AsmToken::GreaterGreater:
- return true;
+static bool isOperator(AsmToken::TokenKind kind) {
+ switch (kind) {
+ default:
+ return false;
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Tilde:
+ case AsmToken::Slash:
+ case AsmToken::Star:
+ case AsmToken::Dot:
+ case AsmToken::Equal:
+ case AsmToken::EqualEqual:
+ case AsmToken::Pipe:
+ case AsmToken::PipePipe:
+ case AsmToken::Caret:
+ case AsmToken::Amp:
+ case AsmToken::AmpAmp:
+ case AsmToken::Exclaim:
+ case AsmToken::ExclaimEqual:
+ case AsmToken::Percent:
+ case AsmToken::Less:
+ case AsmToken::LessEqual:
+ case AsmToken::LessLess:
+ case AsmToken::LessGreater:
+ case AsmToken::Greater:
+ case AsmToken::GreaterEqual:
+ case AsmToken::GreaterGreater:
+ return true;
}
}
-bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
+bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA,
AsmToken::TokenKind &ArgumentDelimiter) {
unsigned ParenLevel = 0;
unsigned AddTokens = 0;
@@ -1848,7 +1864,7 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
// one into this argument
if (ArgumentDelimiter == AsmToken::Space ||
ArgumentDelimiter == AsmToken::Eof) {
- if (IsOperator(Lexer.getKind())) {
+ if (isOperator(Lexer.getKind())) {
// Check to see whether the token is used as an operator,
// or part of an identifier
const char *NextChar = getTok().getEndLoc().getPointer();
@@ -1858,14 +1874,14 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
if (!AddTokens && ParenLevel == 0) {
if (ArgumentDelimiter == AsmToken::Eof &&
- !IsOperator(Lexer.getKind()))
+ !isOperator(Lexer.getKind()))
ArgumentDelimiter = AsmToken::Space;
break;
}
}
}
- // HandleMacroEntry relies on not advancing the lexer here
+ // handleMacroEntry relies on not advancing the lexer here
// to be able to fill in the remaining default parameter values
if (Lexer.is(AsmToken::EndOfStatement))
break;
@@ -1890,10 +1906,11 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
}
// Parse the macro instantiation arguments.
-bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A) {
+bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
+ MCAsmMacroArguments &A) {
const unsigned NParameters = M ? M->Parameters.size() : 0;
// Argument delimiter is initially unknown. It will be set by
- // ParseMacroArgument()
+ // parseMacroArgument()
AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
// Parse two kinds of macro invocations:
@@ -1903,7 +1920,7 @@ bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A)
++Parameter) {
MCAsmMacroArgument MA;
- if (ParseMacroArgument(MA, ArgumentDelimiter))
+ if (parseMacroArgument(MA, ArgumentDelimiter))
return true;
if (!MA.empty() || !NParameters)
@@ -1934,31 +1951,31 @@ bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A)
return TokError("Too many arguments");
}
-const MCAsmMacro* AsmParser::LookupMacro(StringRef Name) {
- StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
+ StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
return (I == MacroMap.end()) ? NULL : I->getValue();
}
-void AsmParser::DefineMacro(StringRef Name, const MCAsmMacro& Macro) {
+void AsmParser::defineMacro(StringRef Name, const MCAsmMacro &Macro) {
MacroMap[Name] = new MCAsmMacro(Macro);
}
-void AsmParser::UndefineMacro(StringRef Name) {
- StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+void AsmParser::undefineMacro(StringRef Name) {
+ StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
if (I != MacroMap.end()) {
delete I->getValue();
MacroMap.erase(I);
}
}
-bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
+bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
// Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
// this, although we should protect against infinite loops.
if (ActiveMacros.size() == 20)
return TokError("macros cannot be nested more than 20 levels deep");
MCAsmMacroArguments A;
- if (ParseMacroArguments(M, A))
+ if (parseMacroArguments(M, A))
return true;
// Remove any trailing empty arguments. Do this after-the-fact as we have
@@ -1981,14 +1998,12 @@ bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
OS << ".endmacro\n";
MemoryBuffer *Instantiation =
- MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+ MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
// Create the macro instantiation object and add to the current macro
// instantiation stack.
- MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
- CurBuffer,
- getTok().getLoc(),
- Instantiation);
+ MacroInstantiation *MI = new MacroInstantiation(
+ M, NameLoc, CurBuffer, getTok().getLoc(), Instantiation);
ActiveMacros.push_back(MI);
// Jump to the macro instantiation and prime the lexer.
@@ -1999,9 +2014,9 @@ bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
return false;
}
-void AsmParser::HandleMacroExit() {
+void AsmParser::handleMacroExit() {
// Jump to the EndOfStatement we should return to, and consume it.
- JumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
+ jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
Lex();
// Pop the instantiation entry.
@@ -2009,29 +2024,30 @@ void AsmParser::HandleMacroExit() {
ActiveMacros.pop_back();
}
-static bool IsUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
+static bool isUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
switch (Value->getKind()) {
case MCExpr::Binary: {
- const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Value);
- return IsUsedIn(Sym, BE->getLHS()) || IsUsedIn(Sym, BE->getRHS());
+ const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Value);
+ return isUsedIn(Sym, BE->getLHS()) || isUsedIn(Sym, BE->getRHS());
}
case MCExpr::Target:
case MCExpr::Constant:
return false;
case MCExpr::SymbolRef: {
- const MCSymbol &S = static_cast<const MCSymbolRefExpr*>(Value)->getSymbol();
+ const MCSymbol &S =
+ static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
if (S.isVariable())
- return IsUsedIn(Sym, S.getVariableValue());
+ return isUsedIn(Sym, S.getVariableValue());
return &S == Sym;
}
case MCExpr::Unary:
- return IsUsedIn(Sym, static_cast<const MCUnaryExpr*>(Value)->getSubExpr());
+ return isUsedIn(Sym, static_cast<const MCUnaryExpr *>(Value)->getSubExpr());
}
llvm_unreachable("Unknown expr kind!");
}
-bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
+bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
bool NoDeadStrip) {
// FIXME: Use better location, we should use proper tokens.
SMLoc EqualLoc = Lexer.getLoc();
@@ -2064,7 +2080,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
//
// FIXME: Diagnostics. Note the location of the definition as a label.
// FIXME: Diagnose assignment to protected identifier (e.g., register name).
- if (IsUsedIn(Sym, Value))
+ if (isUsedIn(Sym, Value))
return Error(EqualLoc, "Recursive use of '" + Name + "'");
else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable())
; // Allow redefinitions of undefined symbols only used in directives.
@@ -2076,7 +2092,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
return Error(EqualLoc, "invalid assignment to '" + Name + "'");
else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
return Error(EqualLoc, "invalid reassignment of non-absolute variable '" +
- Name + "'");
+ Name + "'");
// Don't count these checks as uses.
Sym->setUsed(false);
@@ -2090,7 +2106,6 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
if (NoDeadStrip)
Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
-
return false;
}
@@ -2099,31 +2114,30 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
/// ::= string
bool AsmParser::parseIdentifier(StringRef &Res) {
// The assembler has relaxed rules for accepting identifiers, in particular we
- // allow things like '.globl $foo', which would normally be separate
- // tokens. At this level, we have already lexed so we cannot (currently)
+ // allow things like '.globl $foo' and '.def @feat.00', which would normally be
+ // separate tokens. At this level, we have already lexed so we cannot (currently)
// handle this as a context dependent token, instead we detect adjacent tokens
// and return the combined identifier.
- if (Lexer.is(AsmToken::Dollar)) {
- SMLoc DollarLoc = getLexer().getLoc();
+ if (Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At)) {
+ SMLoc PrefixLoc = getLexer().getLoc();
- // Consume the dollar sign, and check for a following identifier.
+ // Consume the prefix character, and check for a following identifier.
Lex();
if (Lexer.isNot(AsmToken::Identifier))
return true;
- // We have a '$' followed by an identifier, make sure they are adjacent.
- if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+ // We have a '$' or '@' followed by an identifier, make sure they are adjacent.
+ if (PrefixLoc.getPointer() + 1 != getTok().getLoc().getPointer())
return true;
// Construct the joined identifier and consume the token.
- Res = StringRef(DollarLoc.getPointer(),
- getTok().getIdentifier().size() + 1);
+ Res =
+ StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
Lex();
return false;
}
- if (Lexer.isNot(AsmToken::Identifier) &&
- Lexer.isNot(AsmToken::String))
+ if (Lexer.isNot(AsmToken::Identifier) && Lexer.isNot(AsmToken::String))
return true;
Res = getTok().getIdentifier();
@@ -2133,11 +2147,11 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
return false;
}
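
The prefix handling above fuses a leading '$' or '@' with an immediately following identifier (so '.def @feat.00' parses as one name) but refuses when the two tokens are not adjacent. A tiny sketch of that adjacency rule over raw pointers (a hypothetical helper, not the parser's code):

#include <cstddef>
#include <string>

// PrefixPtr points at the '$' or '@'; IdentPtr/IdentLen describe the next
// identifier token. Fuse them only when nothing separates the two tokens.
bool fusePrefixedIdentifier(const char *PrefixPtr, const char *IdentPtr,
                            size_t IdentLen, std::string &Res) {
  if (PrefixPtr + 1 != IdentPtr)
    return false;                       // e.g. ".globl $ foo" stays split
  Res.assign(PrefixPtr, IdentLen + 1);  // "$foo", "@feat.00"
  return true;
}
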
-/// ParseDirectiveSet:
+/// parseDirectiveSet:
/// ::= .equ identifier ',' expression
/// ::= .equiv identifier ',' expression
/// ::= .set identifier ',' expression
-bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
+bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
StringRef Name;
if (parseIdentifier(Name))
@@ -2147,7 +2161,7 @@ bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
return TokError("unexpected token in '" + Twine(IDVal) + "'");
Lex();
- return ParseAssignment(Name, allow_redef, true);
+ return parseAssignment(Name, allow_redef, true);
}
bool AsmParser::parseEscapedString(std::string &Data) {
@@ -2168,15 +2182,15 @@ bool AsmParser::parseEscapedString(std::string &Data) {
return TokError("unexpected backslash at end of string");
// Recognize octal sequences.
- if ((unsigned) (Str[i] - '0') <= 7) {
+ if ((unsigned)(Str[i] - '0') <= 7) {
// Consume up to three octal characters.
unsigned Value = Str[i] - '0';
- if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+ if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
++i;
Value = Value * 8 + (Str[i] - '0');
- if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+ if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
++i;
Value = Value * 8 + (Str[i] - '0');
}
@@ -2185,7 +2199,7 @@ bool AsmParser::parseEscapedString(std::string &Data) {
if (Value > 255)
return TokError("invalid octal escape sequence (out of range)");
- Data += (unsigned char) Value;
+ Data += (unsigned char)Value;
continue;
}
@@ -2208,9 +2222,9 @@ bool AsmParser::parseEscapedString(std::string &Data) {
return false;
}
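
The escape handling above consumes at most three octal digits after a backslash and rejects anything above 255. A small sketch of just that piece (a hypothetical helper; the real routine also handles the usual single-character escapes and quoted characters):

#include <cstddef>
#include <string>

// Str[i] is the first octal digit after the backslash; i is advanced past any
// extra digits consumed. Returns -1 for values outside the byte range.
int decodeOctalEscape(const std::string &Str, size_t &i) {
  unsigned Value = Str[i] - '0';
  for (int Digits = 1; Digits < 3 && i + 1 < Str.size() &&
                       (unsigned)(Str[i + 1] - '0') <= 7;
       ++Digits) {
    ++i;
    Value = Value * 8 + (Str[i] - '0');
  }
  return Value > 255 ? -1 : (int)Value;
}

So "\101" decodes to 65 ('A'), while "\400" is rejected, matching the out-of-range error above.
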
-/// ParseDirectiveAscii:
+/// parseDirectiveAscii:
/// ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
-bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
+bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
checkForValidSection();
@@ -2241,9 +2255,9 @@ bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
return false;
}
-/// ParseDirectiveValue
+/// parseDirectiveValue
/// ::= (.byte | .short | ... ) [ expression (, expression)* ]
-bool AsmParser::ParseDirectiveValue(unsigned Size) {
+bool AsmParser::parseDirectiveValue(unsigned Size) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
checkForValidSection();
@@ -2277,9 +2291,9 @@ bool AsmParser::ParseDirectiveValue(unsigned Size) {
return false;
}
-/// ParseDirectiveRealValue
+/// parseDirectiveRealValue
/// ::= (.single | .double) [ expression (, expression)* ]
-bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
+bool AsmParser::parseDirectiveRealValue(const fltSemantics &Semantics) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
checkForValidSection();
@@ -2309,7 +2323,7 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
else
return TokError("invalid floating point literal");
} else if (Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) ==
- APFloat::opInvalidOp)
+ APFloat::opInvalidOp)
return TokError("invalid floating point literal");
if (IsNeg)
Value.changeSign();
@@ -2335,9 +2349,9 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
return false;
}
-/// ParseDirectiveZero
+/// parseDirectiveZero
/// ::= .zero expression
-bool AsmParser::ParseDirectiveZero() {
+bool AsmParser::parseDirectiveZero() {
checkForValidSection();
int64_t NumBytes;
@@ -2361,35 +2375,40 @@ bool AsmParser::ParseDirectiveZero() {
return false;
}
-/// ParseDirectiveFill
-/// ::= .fill expression , expression , expression
-bool AsmParser::ParseDirectiveFill() {
+/// parseDirectiveFill
+/// ::= .fill expression [ , expression [ , expression ] ]
+bool AsmParser::parseDirectiveFill() {
checkForValidSection();
int64_t NumValues;
if (parseAbsoluteExpression(NumValues))
return true;
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.fill' directive");
- Lex();
+ int64_t FillSize = 1;
+ int64_t FillExpr = 0;
- int64_t FillSize;
- if (parseAbsoluteExpression(FillSize))
- return true;
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in '.fill' directive");
+ Lex();
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("unexpected token in '.fill' directive");
- Lex();
+ if (parseAbsoluteExpression(FillSize))
+ return true;
- int64_t FillExpr;
- if (parseAbsoluteExpression(FillExpr))
- return true;
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in '.fill' directive");
+ Lex();
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.fill' directive");
+ if (parseAbsoluteExpression(FillExpr))
+ return true;
- Lex();
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.fill' directive");
+
+ Lex();
+ }
+ }
if (FillSize != 1 && FillSize != 2 && FillSize != 4 && FillSize != 8)
return TokError("invalid '.fill' size, expected 1, 2, 4, or 8");
@@ -2400,9 +2419,9 @@ bool AsmParser::ParseDirectiveFill() {
return false;
}
-/// ParseDirectiveOrg
+/// parseDirectiveOrg
/// ::= .org expression [ , expression ]
-bool AsmParser::ParseDirectiveOrg() {
+bool AsmParser::parseDirectiveOrg() {
checkForValidSection();
const MCExpr *Offset;
@@ -2435,9 +2454,9 @@ bool AsmParser::ParseDirectiveOrg() {
return false;
}
-/// ParseDirectiveAlign
+/// parseDirectiveAlign
/// ::= {.align, ...} expression [ , expression [ , expression ]]
-bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
+bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
checkForValidSection();
SMLoc AlignmentLoc = getLexer().getLoc();
@@ -2501,13 +2520,13 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
if (MaxBytesLoc.isValid()) {
if (MaxBytesToFill < 1) {
Error(MaxBytesLoc, "alignment directive can never be satisfied in this "
- "many bytes, ignoring maximum bytes expression");
+ "many bytes, ignoring maximum bytes expression");
MaxBytesToFill = 0;
}
if (MaxBytesToFill >= Alignment) {
Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
- "has no effect");
+ "has no effect");
MaxBytesToFill = 0;
}
}
@@ -2527,10 +2546,10 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
return false;
}
-/// ParseDirectiveFile
+/// parseDirectiveFile
/// ::= .file [number] filename
/// ::= .file number directory filename
-bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
// FIXME: I'm not sure what this is.
int64_t FileNumber = -1;
SMLoc FileNumberLoc = getLexer().getLoc();
@@ -2546,17 +2565,21 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
return TokError("unexpected token in '.file' directive");
// Usually the directory and filename together, otherwise just the directory.
- StringRef Path = getTok().getString();
- Path = Path.substr(1, Path.size()-2);
+ // Allow the strings to have escaped octal character sequence.
+ std::string Path = getTok().getString();
+ if (parseEscapedString(Path))
+ return true;
Lex();
StringRef Directory;
StringRef Filename;
+ std::string FilenameData;
if (getLexer().is(AsmToken::String)) {
if (FileNumber == -1)
return TokError("explicit path specified, but no file number");
- Filename = getTok().getString();
- Filename = Filename.substr(1, Filename.size()-2);
+ if (parseEscapedString(FilenameData))
+ return true;
+ Filename = FilenameData;
Directory = Path;
Lex();
} else {
@@ -2570,8 +2593,9 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
getStreamer().EmitFileDirective(Filename);
else {
if (getContext().getGenDwarfForAssembly() == true)
- Error(DirectiveLoc, "input can't have .file dwarf directives when -g is "
- "used to generate dwarf debug info for assembly code");
+ Error(DirectiveLoc,
+ "input can't have .file dwarf directives when -g is "
+ "used to generate dwarf debug info for assembly code");
if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename))
Error(FileNumberLoc, "file number already allocated");
@@ -2580,15 +2604,15 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
return false;
}
-/// ParseDirectiveLine
+/// parseDirectiveLine
/// ::= .line [number]
-bool AsmParser::ParseDirectiveLine() {
+bool AsmParser::parseDirectiveLine() {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if (getLexer().isNot(AsmToken::Integer))
return TokError("unexpected token in '.line' directive");
int64_t LineNumber = getTok().getIntVal();
- (void) LineNumber;
+ (void)LineNumber;
Lex();
// FIXME: Do something with the .line.
@@ -2600,14 +2624,14 @@ bool AsmParser::ParseDirectiveLine() {
return false;
}
-/// ParseDirectiveLoc
+/// parseDirectiveLoc
/// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
/// [epilogue_begin] [is_stmt VALUE] [isa VALUE]
 /// The first number is a file number, which must have been previously assigned
 /// with a .file directive; the second number is the line number, and optionally
 /// the third number is a column position (zero if not specified). The remaining
/// optional items are .loc sub-directives.
-bool AsmParser::ParseDirectiveLoc() {
+bool AsmParser::parseDirectiveLoc() {
if (getLexer().isNot(AsmToken::Integer))
return TokError("unexpected token in '.loc' directive");
int64_t FileNumber = getTok().getIntVal();
@@ -2620,8 +2644,8 @@ bool AsmParser::ParseDirectiveLoc() {
int64_t LineNumber = 0;
if (getLexer().is(AsmToken::Integer)) {
LineNumber = getTok().getIntVal();
- if (LineNumber < 1)
- return TokError("line number less than one in '.loc' directive");
+ if (LineNumber < 0)
+ return TokError("line number less than zero in '.loc' directive");
Lex();
}
@@ -2701,15 +2725,15 @@ bool AsmParser::ParseDirectiveLoc() {
return false;
}
-/// ParseDirectiveStabs
+/// parseDirectiveStabs
/// ::= .stabs string, number, number, number
-bool AsmParser::ParseDirectiveStabs() {
+bool AsmParser::parseDirectiveStabs() {
return TokError("unsupported directive '.stabs'");
}
-/// ParseDirectiveCFISections
+/// parseDirectiveCFISections
/// ::= .cfi_sections section [, section]
-bool AsmParser::ParseDirectiveCFISections() {
+bool AsmParser::parseDirectiveCFISections() {
StringRef Name;
bool EH = false;
bool Debug = false;
@@ -2738,22 +2762,22 @@ bool AsmParser::ParseDirectiveCFISections() {
return false;
}
-/// ParseDirectiveCFIStartProc
+/// parseDirectiveCFIStartProc
/// ::= .cfi_startproc
-bool AsmParser::ParseDirectiveCFIStartProc() {
+bool AsmParser::parseDirectiveCFIStartProc() {
getStreamer().EmitCFIStartProc();
return false;
}
-/// ParseDirectiveCFIEndProc
+/// parseDirectiveCFIEndProc
/// ::= .cfi_endproc
-bool AsmParser::ParseDirectiveCFIEndProc() {
+bool AsmParser::parseDirectiveCFIEndProc() {
getStreamer().EmitCFIEndProc();
return false;
}
-/// ParseRegisterOrRegisterNumber - parse register name or number.
-bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
+/// \brief parse register name or number.
+bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
SMLoc DirectiveLoc) {
unsigned RegNo;
@@ -2767,11 +2791,11 @@ bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
return false;
}
-/// ParseDirectiveCFIDefCfa
+/// parseDirectiveCFIDefCfa
/// ::= .cfi_def_cfa register, offset
-bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
int64_t Register = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
if (getLexer().isNot(AsmToken::Comma))
@@ -2786,9 +2810,9 @@ bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
return false;
}
-/// ParseDirectiveCFIDefCfaOffset
+/// parseDirectiveCFIDefCfaOffset
/// ::= .cfi_def_cfa_offset offset
-bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
+bool AsmParser::parseDirectiveCFIDefCfaOffset() {
int64_t Offset = 0;
if (parseAbsoluteExpression(Offset))
return true;
@@ -2797,11 +2821,11 @@ bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
return false;
}
-/// ParseDirectiveCFIRegister
+/// parseDirectiveCFIRegister
/// ::= .cfi_register register, register
-bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
int64_t Register1 = 0;
- if (ParseRegisterOrRegisterNumber(Register1, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc))
return true;
if (getLexer().isNot(AsmToken::Comma))
@@ -2809,16 +2833,23 @@ bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
Lex();
int64_t Register2 = 0;
- if (ParseRegisterOrRegisterNumber(Register2, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
return true;
getStreamer().EmitCFIRegister(Register1, Register2);
return false;
}
-/// ParseDirectiveCFIAdjustCfaOffset
+/// parseDirectiveCFIWindowSave
+/// ::= .cfi_window_save
+bool AsmParser::parseDirectiveCFIWindowSave() {
+ getStreamer().EmitCFIWindowSave();
+ return false;
+}
+
+/// parseDirectiveCFIAdjustCfaOffset
/// ::= .cfi_adjust_cfa_offset adjustment
-bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
+bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
int64_t Adjustment = 0;
if (parseAbsoluteExpression(Adjustment))
return true;
@@ -2827,24 +2858,24 @@ bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
return false;
}
-/// ParseDirectiveCFIDefCfaRegister
+/// parseDirectiveCFIDefCfaRegister
/// ::= .cfi_def_cfa_register register
-bool AsmParser::ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
int64_t Register = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
getStreamer().EmitCFIDefCfaRegister(Register);
return false;
}
-/// ParseDirectiveCFIOffset
+/// parseDirectiveCFIOffset
/// ::= .cfi_offset register, offset
-bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
int64_t Register = 0;
int64_t Offset = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
if (getLexer().isNot(AsmToken::Comma))
@@ -2858,12 +2889,12 @@ bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
return false;
}
-/// ParseDirectiveCFIRelOffset
+/// parseDirectiveCFIRelOffset
/// ::= .cfi_rel_offset register, offset
-bool AsmParser::ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
int64_t Register = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
if (getLexer().isNot(AsmToken::Comma))
@@ -2900,11 +2931,11 @@ static bool isValidEncoding(int64_t Encoding) {
return true;
}
-/// ParseDirectiveCFIPersonalityOrLsda
+/// parseDirectiveCFIPersonalityOrLsda
/// IsPersonality true for cfi_personality, false for cfi_lsda
/// ::= .cfi_personality encoding, [symbol_name]
/// ::= .cfi_lsda encoding, [symbol_name]
-bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
+bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
int64_t Encoding = 0;
if (parseAbsoluteExpression(Encoding))
return true;
@@ -2931,46 +2962,46 @@ bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
return false;
}
-/// ParseDirectiveCFIRememberState
+/// parseDirectiveCFIRememberState
/// ::= .cfi_remember_state
-bool AsmParser::ParseDirectiveCFIRememberState() {
+bool AsmParser::parseDirectiveCFIRememberState() {
getStreamer().EmitCFIRememberState();
return false;
}
-/// ParseDirectiveCFIRestoreState
+/// parseDirectiveCFIRestoreState
 /// ::= .cfi_restore_state
-bool AsmParser::ParseDirectiveCFIRestoreState() {
+bool AsmParser::parseDirectiveCFIRestoreState() {
getStreamer().EmitCFIRestoreState();
return false;
}
-/// ParseDirectiveCFISameValue
+/// parseDirectiveCFISameValue
/// ::= .cfi_same_value register
-bool AsmParser::ParseDirectiveCFISameValue(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
int64_t Register = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
getStreamer().EmitCFISameValue(Register);
return false;
}
-/// ParseDirectiveCFIRestore
+/// parseDirectiveCFIRestore
/// ::= .cfi_restore register
-bool AsmParser::ParseDirectiveCFIRestore(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
int64_t Register = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
getStreamer().EmitCFIRestore(Register);
return false;
}
-/// ParseDirectiveCFIEscape
+/// parseDirectiveCFIEscape
/// ::= .cfi_escape expression[,...]
-bool AsmParser::ParseDirectiveCFIEscape() {
+bool AsmParser::parseDirectiveCFIEscape() {
std::string Values;
int64_t CurrValue;
if (parseAbsoluteExpression(CurrValue))
@@ -2991,9 +3022,9 @@ bool AsmParser::ParseDirectiveCFIEscape() {
return false;
}
-/// ParseDirectiveCFISignalFrame
+/// parseDirectiveCFISignalFrame
/// ::= .cfi_signal_frame
-bool AsmParser::ParseDirectiveCFISignalFrame() {
+bool AsmParser::parseDirectiveCFISignalFrame() {
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token in '.cfi_signal_frame'");
@@ -3002,40 +3033,40 @@ bool AsmParser::ParseDirectiveCFISignalFrame() {
return false;
}
-/// ParseDirectiveCFIUndefined
+/// parseDirectiveCFIUndefined
/// ::= .cfi_undefined register
-bool AsmParser::ParseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
int64_t Register = 0;
- if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
getStreamer().EmitCFIUndefined(Register);
return false;
}
-/// ParseDirectiveMacrosOnOff
+/// parseDirectiveMacrosOnOff
/// ::= .macros_on
/// ::= .macros_off
-bool AsmParser::ParseDirectiveMacrosOnOff(StringRef Directive) {
+bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
"unexpected token in '" + Directive + "' directive");
- SetMacrosEnabled(Directive == ".macros_on");
+ setMacrosEnabled(Directive == ".macros_on");
return false;
}
-/// ParseDirectiveMacro
+/// parseDirectiveMacro
/// ::= .macro name [parameters]
-bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
StringRef Name;
if (parseIdentifier(Name))
return TokError("expected identifier in '.macro' directive");
MCAsmMacroParameters Parameters;
// Argument delimiter is initially unknown. It will be set by
- // ParseMacroArgument()
+ // parseMacroArgument()
AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
@@ -3045,7 +3076,7 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
if (getLexer().is(AsmToken::Equal)) {
Lex();
- if (ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+ if (parseMacroArgument(Parameter.second, ArgumentDelimiter))
return true;
}
@@ -3085,19 +3116,19 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
eatToEndOfStatement();
}
- if (LookupMacro(Name)) {
+ if (lookupMacro(Name)) {
return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
}
const char *BodyStart = StartToken.getLoc().getPointer();
const char *BodyEnd = EndToken.getLoc().getPointer();
StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
- CheckForBadMacro(DirectiveLoc, Name, Body, Parameters);
- DefineMacro(Name, MCAsmMacro(Name, Body, Parameters));
+ checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
+ defineMacro(Name, MCAsmMacro(Name, Body, Parameters));
return false;
}
-/// CheckForBadMacro
+/// checkForBadMacro
///
/// With the support added for named parameters there may be code out there that
/// is transitioning from positional parameters. In versions of gas that did
@@ -3111,7 +3142,7 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
/// intended or change the macro to use the named parameters. It is possible
/// this warning will trigger when the none of the named parameters are used
/// and the strings like $1 are infact to simply to be passed trough unchanged.
-void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
+void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
StringRef Body,
MCAsmMacroParameters Parameters) {
// If this macro is not defined with named parameters the warning we are
@@ -3149,21 +3180,21 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
break;
if (Body[Pos] == '$') {
- switch (Body[Pos+1]) {
- // $$ => $
+ switch (Body[Pos + 1]) {
+ // $$ => $
case '$':
break;
- // $n => number of arguments
+ // $n => number of arguments
case 'n':
PositionalParametersFound = true;
break;
- // $[0-9] => argument
+ // $[0-9] => argument
default: {
PositionalParametersFound = true;
break;
- }
+ }
}
Pos += 2;
} else {
@@ -3171,19 +3202,19 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
while (isIdentifierChar(Body[I]) && I + 1 != End)
++I;
- const char *Begin = Body.data() + Pos +1;
- StringRef Argument(Begin, I - (Pos +1));
+ const char *Begin = Body.data() + Pos + 1;
+ StringRef Argument(Begin, I - (Pos + 1));
unsigned Index = 0;
for (; Index < NParameters; ++Index)
if (Parameters[Index].first == Argument)
break;
if (Index == NParameters) {
- if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
- Pos += 3;
- else {
- Pos = I;
- }
+ if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+ Pos += 3;
+ else {
+ Pos = I;
+ }
} else {
NamedParametersFound = true;
Pos += 1 + Argument.size();
@@ -3199,29 +3230,29 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
"found in body which will have no effect");
}
-/// ParseDirectiveEndMacro
+/// parseDirectiveEndMacro
/// ::= .endm
/// ::= .endmacro
-bool AsmParser::ParseDirectiveEndMacro(StringRef Directive) {
+bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '" + Directive + "' directive");
// If we are inside a macro instantiation, terminate the current
// instantiation.
- if (InsideMacroInstantiation()) {
- HandleMacroExit();
+ if (isInsideMacroInstantiation()) {
+ handleMacroExit();
return false;
}
// Otherwise, this .endmacro is a stray entry in the file; well formed
// .endmacro directives are handled during the macro definition parsing.
return TokError("unexpected '" + Directive + "' in file, "
- "no current macro definition");
+ "no current macro definition");
}
-/// ParseDirectivePurgeMacro
+/// parseDirectivePurgeMacro
/// ::= .purgem
-bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
StringRef Name;
if (parseIdentifier(Name))
return TokError("expected identifier in '.purgem' directive");
@@ -3229,16 +3260,16 @@ bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.purgem' directive");
- if (!LookupMacro(Name))
+ if (!lookupMacro(Name))
return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
- UndefineMacro(Name);
+ undefineMacro(Name);
return false;
}
-/// ParseDirectiveBundleAlignMode
+/// parseDirectiveBundleAlignMode
/// ::= {.bundle_align_mode} expression
-bool AsmParser::ParseDirectiveBundleAlignMode() {
+bool AsmParser::parseDirectiveBundleAlignMode() {
checkForValidSection();
// Expect a single argument: an expression that evaluates to a constant
@@ -3262,9 +3293,9 @@ bool AsmParser::ParseDirectiveBundleAlignMode() {
return false;
}
-/// ParseDirectiveBundleLock
+/// parseDirectiveBundleLock
/// ::= {.bundle_lock} [align_to_end]
-bool AsmParser::ParseDirectiveBundleLock() {
+bool AsmParser::parseDirectiveBundleLock() {
checkForValidSection();
bool AlignToEnd = false;
@@ -3272,7 +3303,7 @@ bool AsmParser::ParseDirectiveBundleLock() {
StringRef Option;
SMLoc Loc = getTok().getLoc();
const char *kInvalidOptionError =
- "invalid option for '.bundle_lock' directive";
+ "invalid option for '.bundle_lock' directive";
if (parseIdentifier(Option))
return Error(Loc, kInvalidOptionError);
@@ -3291,9 +3322,9 @@ bool AsmParser::ParseDirectiveBundleLock() {
return false;
}
-/// ParseDirectiveBundleLock
+/// parseDirectiveBundleUnlock
 /// ::= {.bundle_unlock}
-bool AsmParser::ParseDirectiveBundleUnlock() {
+bool AsmParser::parseDirectiveBundleUnlock() {
checkForValidSection();
if (getLexer().isNot(AsmToken::EndOfStatement))
@@ -3304,9 +3335,9 @@ bool AsmParser::ParseDirectiveBundleUnlock() {
return false;
}
-/// ParseDirectiveSpace
+/// parseDirectiveSpace
/// ::= (.skip | .space) expression [ , expression ]
-bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
+bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
checkForValidSection();
int64_t NumBytes;
@@ -3329,8 +3360,8 @@ bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
Lex();
if (NumBytes <= 0)
- return TokError("invalid number of bytes in '" +
- Twine(IDVal) + "' directive");
+ return TokError("invalid number of bytes in '" + Twine(IDVal) +
+ "' directive");
// FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
getStreamer().EmitFill(NumBytes, FillExpr);
@@ -3338,9 +3369,9 @@ bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
return false;
}
-/// ParseDirectiveLEB128
+/// parseDirectiveLEB128
/// ::= (.sleb128 | .uleb128) expression
-bool AsmParser::ParseDirectiveLEB128(bool Signed) {
+bool AsmParser::parseDirectiveLEB128(bool Signed) {
checkForValidSection();
const MCExpr *Value;
@@ -3358,9 +3389,9 @@ bool AsmParser::ParseDirectiveLEB128(bool Signed) {
return false;
}
-/// ParseDirectiveSymbolAttribute
+/// parseDirectiveSymbolAttribute
/// ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
-bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
+bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
StringRef Name;
@@ -3375,7 +3406,8 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
if (Sym->isTemporary())
return Error(Loc, "non-local symbol required in directive");
- getStreamer().EmitSymbolAttribute(Sym, Attr);
+ if (!getStreamer().EmitSymbolAttribute(Sym, Attr))
+ return Error(Loc, "unable to emit symbol attribute");
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -3390,9 +3422,9 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
return false;
}
-/// ParseDirectiveComm
+/// parseDirectiveComm
/// ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
-bool AsmParser::ParseDirectiveComm(bool IsLocal) {
+bool AsmParser::parseDirectiveComm(bool IsLocal) {
checkForValidSection();
SMLoc IDLoc = getLexer().getLoc();
@@ -3442,14 +3474,14 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
// but a size of .lcomm creates a bss symbol of size zero.
if (Size < 0)
return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
- "be less than zero");
+ "be less than zero");
// NOTE: The alignment in the directive is a power of 2 value, the assembler
// may internally end up wanting an alignment in bytes.
// FIXME: Diagnose overflow.
if (Pow2Alignment < 0)
return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
- "alignment, can't be less than zero");
+ "alignment, can't be less than zero");
if (!Sym->isUndefined())
return Error(IDLoc, "invalid symbol redefinition");
@@ -3464,9 +3496,9 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
return false;
}
-/// ParseDirectiveAbort
+/// parseDirectiveAbort
/// ::= .abort [... message ...]
-bool AsmParser::ParseDirectiveAbort() {
+bool AsmParser::parseDirectiveAbort() {
// FIXME: Use loc from directive.
SMLoc Loc = getLexer().getLoc();
@@ -3485,25 +3517,25 @@ bool AsmParser::ParseDirectiveAbort() {
return false;
}
-/// ParseDirectiveInclude
+/// parseDirectiveInclude
/// ::= .include "filename"
-bool AsmParser::ParseDirectiveInclude() {
+bool AsmParser::parseDirectiveInclude() {
if (getLexer().isNot(AsmToken::String))
return TokError("expected string in '.include' directive");
- std::string Filename = getTok().getString();
+ // Allow the strings to have escaped octal character sequence.
+ std::string Filename;
+ if (parseEscapedString(Filename))
+ return true;
SMLoc IncludeLoc = getLexer().getLoc();
Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.include' directive");
- // Strip the quotes.
- Filename = Filename.substr(1, Filename.size()-2);
-
// Attempt to switch the lexer to the included file before consuming the end
// of statement to avoid losing it when we switch.
- if (EnterIncludeFile(Filename)) {
+ if (enterIncludeFile(Filename)) {
Error(IncludeLoc, "Could not find include file '" + Filename + "'");
return true;
}
@@ -3511,24 +3543,24 @@ bool AsmParser::ParseDirectiveInclude() {
return false;
}
-/// ParseDirectiveIncbin
+/// parseDirectiveIncbin
/// ::= .incbin "filename"
-bool AsmParser::ParseDirectiveIncbin() {
+bool AsmParser::parseDirectiveIncbin() {
if (getLexer().isNot(AsmToken::String))
return TokError("expected string in '.incbin' directive");
- std::string Filename = getTok().getString();
+ // Allow the strings to have escaped octal character sequence.
+ std::string Filename;
+ if (parseEscapedString(Filename))
+ return true;
SMLoc IncbinLoc = getLexer().getLoc();
Lex();
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.incbin' directive");
- // Strip the quotes.
- Filename = Filename.substr(1, Filename.size()-2);
-
// Attempt to process the included file.
- if (ProcessIncbinFile(Filename)) {
+ if (processIncbinFile(Filename)) {
Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
return true;
}
@@ -3536,9 +3568,9 @@ bool AsmParser::ParseDirectiveIncbin() {
return false;
}
-/// ParseDirectiveIf
+/// parseDirectiveIf
/// ::= .if expression
-bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) {
TheCondStack.push_back(TheCondState);
TheCondState.TheCond = AsmCond::IfCond;
if (TheCondState.Ignore) {
@@ -3560,9 +3592,9 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
return false;
}
-/// ParseDirectiveIfb
+/// parseDirectiveIfb
/// ::= .ifb string
-bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
TheCondStack.push_back(TheCondState);
TheCondState.TheCond = AsmCond::IfCond;
@@ -3583,16 +3615,16 @@ bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
return false;
}
-/// ParseDirectiveIfc
+/// parseDirectiveIfc
/// ::= .ifc string1, string2
-bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
+bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
TheCondStack.push_back(TheCondState);
TheCondState.TheCond = AsmCond::IfCond;
if (TheCondState.Ignore) {
eatToEndOfStatement();
} else {
- StringRef Str1 = ParseStringToComma();
+ StringRef Str1 = parseStringToComma();
if (getLexer().isNot(AsmToken::Comma))
return TokError("unexpected token in '.ifc' directive");
@@ -3613,9 +3645,9 @@ bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
return false;
}
-/// ParseDirectiveIfdef
+/// parseDirectiveIfdef
/// ::= .ifdef symbol
-bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
+bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
StringRef Name;
TheCondStack.push_back(TheCondState);
TheCondState.TheCond = AsmCond::IfCond;
@@ -3640,9 +3672,9 @@ bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
return false;
}
-/// ParseDirectiveElseIf
+/// parseDirectiveElseIf
/// ::= .elseif expression
-bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
if (TheCondState.TheCond != AsmCond::IfCond &&
TheCondState.TheCond != AsmCond::ElseIfCond)
Error(DirectiveLoc, "Encountered a .elseif that doesn't follow a .if or "
@@ -3671,9 +3703,9 @@ bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
return false;
}
-/// ParseDirectiveElse
+/// parseDirectiveElse
/// ::= .else
-bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.else' directive");
@@ -3695,16 +3727,15 @@ bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
return false;
}
-/// ParseDirectiveEndIf
+/// parseDirectiveEndIf
/// ::= .endif
-bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.endif' directive");
Lex();
- if ((TheCondState.TheCond == AsmCond::NoCond) ||
- TheCondStack.empty())
+ if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or "
".else");
if (!TheCondStack.empty()) {
@@ -3748,7 +3779,6 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".extern"] = DK_EXTERN;
DirectiveKindMap[".globl"] = DK_GLOBL;
DirectiveKindMap[".global"] = DK_GLOBAL;
- DirectiveKindMap[".indirect_symbol"] = DK_INDIRECT_SYMBOL;
DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
@@ -3810,6 +3840,7 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
+ DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
DirectiveKindMap[".macro"] = DK_MACRO;
@@ -3818,8 +3849,7 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".purgem"] = DK_PURGEM;
}
-
-MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
+MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
AsmToken EndToken, StartToken = getTok();
unsigned NestLevel = 0;
@@ -3836,8 +3866,7 @@ MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
}
// Otherwise, check whether we have reached the .endr.
- if (Lexer.is(AsmToken::Identifier) &&
- getTok().getIdentifier() == ".endr") {
+ if (Lexer.is(AsmToken::Identifier) && getTok().getIdentifier() == ".endr") {
if (NestLevel == 0) {
EndToken = getTok();
Lex();
@@ -3865,19 +3894,17 @@ MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
return &MacroLikeBodies.back();
}
-void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
raw_svector_ostream &OS) {
OS << ".endr\n";
MemoryBuffer *Instantiation =
- MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+ MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
// Create the macro instantiation object and add to the current macro
// instantiation stack.
- MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc,
- CurBuffer,
- getTok().getLoc(),
- Instantiation);
+ MacroInstantiation *MI = new MacroInstantiation(
+ M, DirectiveLoc, CurBuffer, getTok().getLoc(), Instantiation);
ActiveMacros.push_back(MI);
// Jump to the macro instantiation and prime the lexer.
@@ -3886,7 +3913,7 @@ void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
Lex();
}
-bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc) {
int64_t Count;
if (parseAbsoluteExpression(Count))
return TokError("unexpected token in '.rept' directive");
@@ -3901,7 +3928,7 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
Lex();
// Lex the rept definition.
- MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+ MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
if (!M)
return true;
@@ -3915,14 +3942,14 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc()))
return true;
}
- InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+ instantiateMacroLikeBody(M, DirectiveLoc, OS);
return false;
}
-/// ParseDirectiveIrp
+/// parseDirectiveIrp
/// ::= .irp symbol,values
-bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
MCAsmMacroParameters Parameters;
MCAsmMacroParameter Parameter;
@@ -3937,14 +3964,14 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
Lex();
MCAsmMacroArguments A;
- if (ParseMacroArguments(0, A))
+ if (parseMacroArguments(0, A))
return true;
// Eat the end of statement.
Lex();
// Lex the irp definition.
- MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+ MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
if (!M)
return true;
@@ -3961,14 +3988,14 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
return true;
}
- InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+ instantiateMacroLikeBody(M, DirectiveLoc, OS);
return false;
}
-/// ParseDirectiveIrpc
+/// parseDirectiveIrpc
/// ::= .irpc symbol,values
-bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
MCAsmMacroParameters Parameters;
MCAsmMacroParameter Parameter;
@@ -3983,7 +4010,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
Lex();
MCAsmMacroArguments A;
- if (ParseMacroArguments(0, A))
+ if (parseMacroArguments(0, A))
return true;
if (A.size() != 1 || A.front().size() != 1)
@@ -3993,7 +4020,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
Lex();
// Lex the irpc definition.
- MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+ MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
if (!M)
return true;
@@ -4006,7 +4033,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
std::size_t I, End = Values.size();
for (I = 0; I < End; ++I) {
MCAsmMacroArgument Arg;
- Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1)));
+ Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I + 1)));
MCAsmMacroArguments Args;
Args.push_back(Arg);
@@ -4015,24 +4042,24 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
return true;
}
- InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+ instantiateMacroLikeBody(M, DirectiveLoc, OS);
return false;
}
-bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
if (ActiveMacros.empty())
return TokError("unmatched '.endr' directive");
// The only .repl that should get here are the ones created by
- // InstantiateMacroLikeBody.
+ // instantiateMacroLikeBody.
assert(getLexer().is(AsmToken::EndOfStatement));
- HandleMacroExit();
+ handleMacroExit();
return false;
}
-bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
+bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
size_t Len) {
const MCExpr *Value;
SMLoc ExprLoc = getLexer().getLoc();
@@ -4049,7 +4076,7 @@ bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
return false;
}
-bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
+bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
const MCExpr *Value;
SMLoc ExprLoc = getLexer().getLoc();
if (parseExpression(Value))
@@ -4061,16 +4088,15 @@ bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
if (!isPowerOf2_64(IntValue))
return Error(ExprLoc, "literal value not a power of two greater than zero");
- Info.AsmRewrites->push_back(AsmRewrite(AOK_Align, IDLoc, 5,
- Log2_64(IntValue)));
+ Info.AsmRewrites->push_back(
+ AsmRewrite(AOK_Align, IDLoc, 5, Log2_64(IntValue)));
return false;
}
// We are comparing pointers, but the pointers are relative to a single string.
// Thus, this should always be deterministic.
-static int RewritesSort(const void *A, const void *B) {
- const AsmRewrite *AsmRewriteA = static_cast<const AsmRewrite *>(A);
- const AsmRewrite *AsmRewriteB = static_cast<const AsmRewrite *>(B);
+static int rewritesSort(const AsmRewrite *AsmRewriteA,
+ const AsmRewrite *AsmRewriteB) {
if (AsmRewriteA->Loc.getPointer() < AsmRewriteB->Loc.getPointer())
return -1;
if (AsmRewriteB->Loc.getPointer() < AsmRewriteA->Loc.getPointer())
@@ -4080,25 +4106,22 @@ static int RewritesSort(const void *A, const void *B) {
// rewrite to the same location. Make sure the SizeDirective rewrite is
// performed first, then the Imm/ImmPrefix and finally the Input/Output. This
// ensures the sort algorithm is stable.
- if (AsmRewritePrecedence [AsmRewriteA->Kind] >
- AsmRewritePrecedence [AsmRewriteB->Kind])
+ if (AsmRewritePrecedence[AsmRewriteA->Kind] >
+ AsmRewritePrecedence[AsmRewriteB->Kind])
return -1;
- if (AsmRewritePrecedence [AsmRewriteA->Kind] <
- AsmRewritePrecedence [AsmRewriteB->Kind])
+ if (AsmRewritePrecedence[AsmRewriteA->Kind] <
+ AsmRewritePrecedence[AsmRewriteB->Kind])
return 1;
- llvm_unreachable ("Unstable rewrite sort.");
+ llvm_unreachable("Unstable rewrite sort.");
}
-bool
-AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
- unsigned &NumOutputs, unsigned &NumInputs,
- SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
- SmallVectorImpl<std::string> &Constraints,
- SmallVectorImpl<std::string> &Clobbers,
- const MCInstrInfo *MII,
- const MCInstPrinter *IP,
- MCAsmParserSemaCallback &SI) {
+bool AsmParser::parseMSInlineAsm(
+ void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
+ unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+ SmallVectorImpl<std::string> &Constraints,
+ SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
+ const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
SmallVector<void *, 4> InputDecls;
SmallVector<void *, 4> OutputDecls;
SmallVector<bool, 4> InputDeclsAddressOf;
@@ -4117,7 +4140,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
unsigned OutputIdx = 0;
while (getLexer().isNot(AsmToken::Eof)) {
ParseStatementInfo Info(&AsmStrRewrites);
- if (ParseStatement(Info))
+ if (parseStatement(Info))
return true;
if (Info.ParseError)
@@ -4205,7 +4228,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
raw_string_ostream OS(AsmStringIR);
const char *AsmStart = SrcMgr.getMemoryBuffer(0)->getBufferStart();
const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd();
- array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), RewritesSort);
+ array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
for (SmallVectorImpl<AsmRewrite>::iterator I = AsmStrRewrites.begin(),
E = AsmStrRewrites.end();
I != E; ++I) {
@@ -4230,7 +4253,8 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
unsigned AdditionalSkip = 0;
// Rewrite expressions in $N notation.
switch (Kind) {
- default: break;
+ default:
+ break;
case AOK_Imm:
OS << "$$" << (*I).Val;
break;
@@ -4285,8 +4309,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
}
/// \brief Create an MCAsmParser instance.
-MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
- MCContext &C, MCStreamer &Out,
- const MCAsmInfo &MAI) {
+MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
+ MCStreamer &Out, const MCAsmInfo &MAI) {
return new AsmParser(SM, C, Out, MAI);
}
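
The rewritesSort change above keeps the comparator's two ordering rules: compare by source location first (every rewrite points into the same string, so the pointer comparison is deterministic), then break ties with a precedence table so rewrites at the same location sort stably. A minimal standalone sketch of that comparator pattern, using stand-in Rewrite/Kind types rather than LLVM's AsmRewrite:

```cpp
// Sketch only: stand-in types, not LLVM's AsmRewrite/AsmRewritePrecedence.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

enum Kind { SizeDirective, ImmPrefix, Input };   // hypothetical rewrite kinds
static const int Precedence[] = {2, 1, 0};       // higher value sorts first

struct Rewrite { const char *Loc; Kind K; };

static int rewriteCompare(const Rewrite *A, const Rewrite *B) {
  if (A->Loc < B->Loc) return -1;   // pointers into one buffer: well defined
  if (B->Loc < A->Loc) return 1;
  if (Precedence[A->K] > Precedence[B->K]) return -1;
  if (Precedence[A->K] < Precedence[B->K]) return 1;
  assert(false && "Unstable rewrite sort.");
  return 0;
}

int main() {
  const char *Buf = "mov eax, dword ptr [x]";
  std::vector<Rewrite> Rs = {{Buf + 9, Input}, {Buf + 9, SizeDirective}};
  std::sort(Rs.begin(), Rs.end(), [](const Rewrite &A, const Rewrite &B) {
    return rewriteCompare(&A, &B) < 0;
  });
  std::printf("first kind after sort: %d\n", Rs[0].K); // SizeDirective (0)
  return 0;
}
```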
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index df1794c..d8343a3 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -35,6 +35,10 @@ class COFFAsmParser : public MCAsmParserExtension {
unsigned Characteristics,
SectionKind Kind);
+ bool ParseSectionSwitch(StringRef Section, unsigned Characteristics,
+ SectionKind Kind, StringRef COMDATSymName,
+ COFF::COMDATType Type, const MCSectionCOFF *Assoc);
+
bool ParseSectionName(StringRef &SectionName);
bool ParseSectionFlags(StringRef FlagsString, unsigned* Flags);
@@ -111,6 +115,8 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseDirectiveType(StringRef, SMLoc);
bool ParseDirectiveEndef(StringRef, SMLoc);
bool ParseDirectiveSecRel32(StringRef, SMLoc);
+ bool parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
+ const MCSectionCOFF *&Assoc);
bool ParseDirectiveLinkOnce(StringRef, SMLoc);
// Win64 EH directives.
@@ -284,12 +290,22 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
unsigned Characteristics,
SectionKind Kind) {
+ return ParseSectionSwitch(Section, Characteristics, Kind, "",
+ COFF::IMAGE_COMDAT_SELECT_ANY, 0);
+}
+
+bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
+ unsigned Characteristics,
+ SectionKind Kind,
+ StringRef COMDATSymName,
+ COFF::COMDATType Type,
+ const MCSectionCOFF *Assoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in section switching directive");
Lex();
getStreamer().SwitchSection(getContext().getCOFFSection(
- Section, Characteristics, Kind));
+ Section, Characteristics, Kind, COMDATSymName, Type, Assoc));
return false;
}
@@ -303,7 +319,7 @@ bool COFFAsmParser::ParseSectionName(StringRef &SectionName) {
return false;
}
-// .section name [, "flags"]
+// .section name [, "flags"] [, identifier [ identifier ], identifier]
//
// Supported flags:
// a: Ignored.
@@ -340,11 +356,30 @@ bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
return true;
}
+ COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
+ const MCSectionCOFF *Assoc = 0;
+ StringRef COMDATSymName;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+
+ Flags |= COFF::IMAGE_SCN_LNK_COMDAT;
+
+ if (parseCOMDATTypeAndAssoc(Type, Assoc))
+ return true;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected comma in directive");
+ Lex();
+
+ if (getParser().parseIdentifier(COMDATSymName))
+ return TokError("expected identifier in directive");
+ }
+
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
SectionKind Kind = computeSectionKind(Flags);
- ParseSectionSwitch(SectionName, Flags, Kind);
+ ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type, Assoc);
return false;
}
@@ -409,37 +444,29 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
return false;
}
-/// ParseDirectiveLinkOnce
-/// ::= .linkonce [ identifier [ identifier ] ]
-bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
- COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
-
- if (getLexer().is(AsmToken::Identifier)) {
- StringRef TypeId = getTok().getIdentifier();
+/// ::= [ identifier [ identifier ] ]
+bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
+ const MCSectionCOFF *&Assoc) {
+ StringRef TypeId = getTok().getIdentifier();
- Type = StringSwitch<COFF::COMDATType>(TypeId)
- .Case("one_only", COFF::IMAGE_COMDAT_SELECT_NODUPLICATES)
- .Case("discard", COFF::IMAGE_COMDAT_SELECT_ANY)
- .Case("same_size", COFF::IMAGE_COMDAT_SELECT_SAME_SIZE)
- .Case("same_contents", COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH)
- .Case("associative", COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
- .Case("largest", COFF::IMAGE_COMDAT_SELECT_LARGEST)
- .Case("newest", COFF::IMAGE_COMDAT_SELECT_NEWEST)
- .Default((COFF::COMDATType)0);
+ Type = StringSwitch<COFF::COMDATType>(TypeId)
+ .Case("one_only", COFF::IMAGE_COMDAT_SELECT_NODUPLICATES)
+ .Case("discard", COFF::IMAGE_COMDAT_SELECT_ANY)
+ .Case("same_size", COFF::IMAGE_COMDAT_SELECT_SAME_SIZE)
+ .Case("same_contents", COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH)
+ .Case("associative", COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+ .Case("largest", COFF::IMAGE_COMDAT_SELECT_LARGEST)
+ .Case("newest", COFF::IMAGE_COMDAT_SELECT_NEWEST)
+ .Default((COFF::COMDATType)0);
- if (Type == 0)
- return TokError(Twine("unrecognized COMDAT type '" + TypeId + "'"));
+ if (Type == 0)
+ return TokError(Twine("unrecognized COMDAT type '" + TypeId + "'"));
- Lex();
- }
-
- const MCSectionCOFF *Current = static_cast<const MCSectionCOFF*>(
- getStreamer().getCurrentSection().first);
+ Lex();
- const MCSectionCOFF *Assoc = 0;
if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
- StringRef AssocName;
SMLoc Loc = getTok().getLoc();
+ StringRef AssocName;
if (ParseSectionName(AssocName))
return TokError("expected associated section name");
@@ -447,14 +474,33 @@ bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
getContext().getCOFFSection(AssocName));
if (!Assoc)
return Error(Loc, "cannot associate unknown section '" + AssocName + "'");
- if (Assoc == Current)
- return Error(Loc, "cannot associate a section with itself");
if (!(Assoc->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT))
return Error(Loc, "associated section must be a COMDAT section");
if (Assoc->getSelection() == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
return Error(Loc, "associated section cannot be itself associative");
}
+ return false;
+}
+
+/// ParseDirectiveLinkOnce
+/// ::= .linkonce [ identifier [ identifier ] ]
+bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
+ COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
+ const MCSectionCOFF *Assoc = 0;
+ if (getLexer().is(AsmToken::Identifier))
+ if (parseCOMDATTypeAndAssoc(Type, Assoc))
+ return true;
+
+ const MCSectionCOFF *Current = static_cast<const MCSectionCOFF*>(
+ getStreamer().getCurrentSection().first);
+
+
+ if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+ if (Assoc == Current)
+ return Error(Loc, "cannot associate a section with itself");
+ }
+
if (Current->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
return Error(Loc, Twine("section '") + Current->getSectionName() +
"' is already linkonce");
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 0aeeaf6..4c9bafa 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -45,6 +45,8 @@ public:
this->MCAsmParserExtension::Initialize(Parser);
addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc");
+ addDirectiveHandler<&DarwinAsmParser::ParseDirectiveIndirectSymbol>(
+ ".indirect_symbol");
addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym");
addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>(
".subsections_via_symbols");
@@ -69,6 +71,7 @@ public:
".end_data_region");
// Special section directives.
+ addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveBss>(".bss");
addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const");
addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(
".const_data");
@@ -163,6 +166,7 @@ public:
}
bool ParseDirectiveDesc(StringRef, SMLoc);
+ bool ParseDirectiveIndirectSymbol(StringRef, SMLoc);
bool ParseDirectiveDumpOrLoad(StringRef, SMLoc);
bool ParseDirectiveLsym(StringRef, SMLoc);
bool ParseDirectiveLinkerOption(StringRef, SMLoc);
@@ -179,6 +183,10 @@ public:
bool ParseDirectiveDataRegionEnd(StringRef, SMLoc);
// Named Section Directive
+ bool ParseSectionDirectiveBss(StringRef, SMLoc) {
+ return ParseSectionSwitch("__DATA", "__bss");
+ }
+
bool ParseSectionDirectiveConst(StringRef, SMLoc) {
return ParseSectionSwitch("__TEXT", "__const");
}
@@ -415,6 +423,39 @@ bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) {
return false;
}
+/// ParseDirectiveIndirectSymbol
+/// ::= .indirect_symbol identifier
+bool DarwinAsmParser::ParseDirectiveIndirectSymbol(StringRef, SMLoc Loc) {
+ const MCSectionMachO *Current = static_cast<const MCSectionMachO*>(
+ getStreamer().getCurrentSection().first);
+ unsigned SectionType = Current->getType();
+ if (SectionType != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS &&
+ SectionType != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+ SectionType != MCSectionMachO::S_SYMBOL_STUBS)
+ return Error(Loc, "indirect symbol not in a symbol pointer or stub "
+ "section");
+
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return TokError("expected identifier in .indirect_symbol directive");
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+ // Assembler local symbols don't make any sense here. Complain loudly.
+ if (Sym->isTemporary())
+ return TokError("non-local symbol required in directive");
+
+ if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_IndirectSymbol))
+ return TokError("unable to emit indirect symbol attribute for: " + Name);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '.indirect_symbol' directive");
+
+ Lex();
+
+ return false;
+}
+
/// ParseDirectiveDumpOrLoad
/// ::= ( .dump | .load ) "filename"
bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive,
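
The new `.indirect_symbol` handler only accepts the directive inside symbol-pointer or stub sections, and the same rule is enforced again by MachObjectWriter further down. A tiny sketch of that membership check, with stand-in constants for the Mach-O section types:

```cpp
#include <cstdio>

enum SectionType {
  Regular,
  NonLazySymbolPointers,
  LazySymbolPointers,
  SymbolStubs
}; // illustrative stand-ins for MCSectionMachO::S_* values

static bool allowsIndirectSymbols(SectionType T) {
  return T == NonLazySymbolPointers || T == LazySymbolPointers ||
         T == SymbolStubs;
}

int main() {
  std::printf("%d %d\n", allowsIndirectSymbols(SymbolStubs),
              allowsIndirectSymbols(Regular)); // 1 0
  return 0;
}
```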
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 3134fc3..8807975 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ELF.h"
using namespace llvm;
@@ -30,14 +31,11 @@ class ELFAsmParser : public MCAsmParserExtension {
getParser().addDirectiveHandler(Directive, Handler);
}
- bool ParseSectionSwitch(StringRef Section, unsigned Type,
- unsigned Flags, SectionKind Kind);
- bool SeenIdent;
+ bool ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags,
+ SectionKind Kind);
public:
- ELFAsmParser() : SeenIdent(false) {
- BracketExpressionsSupported = true;
- }
+ ELFAsmParser() { BracketExpressionsSupported = true; }
virtual void Initialize(MCAsmParser &Parser) {
// Call the base implementation.
@@ -241,7 +239,6 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
}
for (;;) {
- StringRef Tmp;
unsigned CurSize;
SMLoc PrevLoc = getLexer().getLoc();
@@ -279,14 +276,17 @@ static SectionKind computeSectionKind(unsigned Flags) {
return SectionKind::getDataRel();
}
-static int parseSectionFlags(StringRef flagsStr) {
- int flags = 0;
+static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
+ unsigned flags = 0;
for (unsigned i = 0; i < flagsStr.size(); i++) {
switch (flagsStr[i]) {
case 'a':
flags |= ELF::SHF_ALLOC;
break;
+ case 'e':
+ flags |= ELF::SHF_EXCLUDE;
+ break;
case 'x':
flags |= ELF::SHF_EXECINSTR;
break;
@@ -311,8 +311,11 @@ static int parseSectionFlags(StringRef flagsStr) {
case 'G':
flags |= ELF::SHF_GROUP;
break;
+ case '?':
+ *UseLastGroup = true;
+ break;
default:
- return -1;
+ return -1U;
}
}
@@ -352,6 +355,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
StringRef GroupName;
unsigned Flags = 0;
const MCExpr *Subsection = 0;
+ bool UseLastGroup = false;
// Set the defaults first.
if (SectionName == ".fini" || SectionName == ".init" ||
@@ -377,13 +381,16 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
StringRef FlagsStr = getTok().getStringContents();
Lex();
- int extraFlags = parseSectionFlags(FlagsStr);
- if (extraFlags < 0)
+ unsigned extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup);
+ if (extraFlags == -1U)
return TokError("unknown flag");
Flags |= extraFlags;
bool Mergeable = Flags & ELF::SHF_MERGE;
bool Group = Flags & ELF::SHF_GROUP;
+ if (Group && UseLastGroup)
+ return TokError("Section cannot specifiy a group name while also acting "
+ "as a member of the last group");
if (getLexer().isNot(AsmToken::Comma)) {
if (Mergeable)
@@ -392,10 +399,13 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
return TokError("Group section must specify the type");
} else {
Lex();
- if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
- return TokError("expected '@' or '%' before type");
+ if (getLexer().is(AsmToken::At) || getLexer().is(AsmToken::Percent) ||
+ getLexer().is(AsmToken::String)) {
+ if (!getLexer().is(AsmToken::String))
+ Lex();
+ } else
+ return TokError("expected '@<type>', '%<type>' or \"<type>\"");
- Lex();
if (getParser().parseIdentifier(TypeName))
return TokError("expected identifier in directive");
@@ -461,6 +471,16 @@ EndStmt:
return TokError("unknown section type");
}
+ if (UseLastGroup) {
+ MCSectionSubPair CurrentSection = getStreamer().getCurrentSection();
+ if (const MCSectionELF *Section =
+ cast_or_null<MCSectionELF>(CurrentSection.first))
+ if (const MCSymbol *Group = Section->getGroup()) {
+ GroupName = Group->getName();
+ Flags |= ELF::SHF_GROUP;
+ }
+ }
+
SectionKind Kind = computeSectionKind(Flags);
getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type,
Flags, Kind, Size,
@@ -479,7 +499,11 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
}
/// ParseDirectiveELFType
+/// ::= .type identifier , STT_<TYPE_IN_UPPER_CASE>
+/// ::= .type identifier , #attribute
/// ::= .type identifier , @attribute
+/// ::= .type identifier , %attribute
+/// ::= .type identifier , "attribute"
bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
StringRef Name;
if (getParser().parseIdentifier(Name))
@@ -492,26 +516,42 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
return TokError("unexpected token in '.type' directive");
Lex();
- if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
- return TokError("expected '@' or '%' before type");
- Lex();
-
StringRef Type;
SMLoc TypeLoc;
+ MCSymbolAttr Attr;
+ if (getLexer().is(AsmToken::Identifier)) {
+ TypeLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(Type))
+ return TokError("expected symbol type in directive");
+ Attr = StringSwitch<MCSymbolAttr>(Type)
+ .Case("STT_FUNC", MCSA_ELF_TypeFunction)
+ .Case("STT_OBJECT", MCSA_ELF_TypeObject)
+ .Case("STT_TLS", MCSA_ELF_TypeTLS)
+ .Case("STT_COMMON", MCSA_ELF_TypeCommon)
+ .Case("STT_NOTYPE", MCSA_ELF_TypeNoType)
+ .Case("STT_GNU_IFUNC", MCSA_ELF_TypeIndFunction)
+ .Default(MCSA_Invalid);
+ } else if (getLexer().is(AsmToken::Hash) || getLexer().is(AsmToken::At) ||
+ getLexer().is(AsmToken::Percent) ||
+ getLexer().is(AsmToken::String)) {
+ if (!getLexer().is(AsmToken::String))
+ Lex();
- TypeLoc = getLexer().getLoc();
- if (getParser().parseIdentifier(Type))
- return TokError("expected symbol type in directive");
-
- MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Type)
- .Case("function", MCSA_ELF_TypeFunction)
- .Case("object", MCSA_ELF_TypeObject)
- .Case("tls_object", MCSA_ELF_TypeTLS)
- .Case("common", MCSA_ELF_TypeCommon)
- .Case("notype", MCSA_ELF_TypeNoType)
- .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
- .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
- .Default(MCSA_Invalid);
+ TypeLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(Type))
+ return TokError("expected symbol type in directive");
+ Attr = StringSwitch<MCSymbolAttr>(Type)
+ .Case("function", MCSA_ELF_TypeFunction)
+ .Case("object", MCSA_ELF_TypeObject)
+ .Case("tls_object", MCSA_ELF_TypeTLS)
+ .Case("common", MCSA_ELF_TypeCommon)
+ .Case("notype", MCSA_ELF_TypeNoType)
+ .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+ .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
+ .Default(MCSA_Invalid);
+ } else
+ return TokError("expected STT_<TYPE_IN_UPPER_CASE>, '#<type>', '@<type>', "
+ "'%<type>' or \"<type>\"");
if (Attr == MCSA_Invalid)
return Error(TypeLoc, "unsupported attribute in '.type' directive");
@@ -536,22 +576,7 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) {
Lex();
- const MCSection *Comment =
- getContext().getELFSection(".comment", ELF::SHT_PROGBITS,
- ELF::SHF_MERGE |
- ELF::SHF_STRINGS,
- SectionKind::getReadOnly(),
- 1, "");
-
- getStreamer().PushSection();
- getStreamer().SwitchSection(Comment);
- if (!SeenIdent) {
- getStreamer().EmitIntValue(0, 1);
- SeenIdent = true;
- }
- getStreamer().EmitBytes(Data);
- getStreamer().EmitIntValue(0, 1);
- getStreamer().PopSection();
+ getStreamer().EmitIdent(Data);
return false;
}
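
parseSectionFlags now returns an unsigned value with -1U as the error sentinel, and it learns two new flag letters: 'e' for SHF_EXCLUDE and '?' for joining the most recent section group. A standalone sketch of the same parsing shape; the bit values are made up for the example, not the real ELF::SHF_* constants:

```cpp
#include <cstdio>
#include <string>

enum : unsigned {
  FlagAlloc = 1u, FlagExec = 2u, FlagWrite = 4u,
  FlagExclude = 8u, FlagGroup = 16u
};

static unsigned parseFlags(const std::string &Str, bool *UseLastGroup) {
  unsigned Flags = 0;
  for (char C : Str) {
    switch (C) {
    case 'a': Flags |= FlagAlloc;   break;
    case 'x': Flags |= FlagExec;    break;
    case 'w': Flags |= FlagWrite;   break;
    case 'e': Flags |= FlagExclude; break;
    case 'G': Flags |= FlagGroup;   break;
    case '?': *UseLastGroup = true; break;
    default:  return -1U;           // unknown flag letter
    }
  }
  return Flags;
}

int main() {
  bool UseLastGroup = false;
  unsigned F = parseFlags("ax?", &UseLastGroup);
  if (F == -1U) { std::puts("unknown flag"); return 1; }
  std::printf("flags=%u useLastGroup=%d\n", F, UseLastGroup); // flags=3, 1
  return 0;
}
```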
diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp
index c5c3bb7..f7bf002 100644
--- a/lib/MC/MCPureStreamer.cpp
+++ b/lib/MC/MCPureStreamer.cpp
@@ -29,7 +29,7 @@ private:
public:
MCPureStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
MCCodeEmitter *Emitter)
- : MCObjectStreamer(SK_PureStreamer, Context, TAB, OS, Emitter) {}
+ : MCObjectStreamer(Context, 0, TAB, OS, Emitter) {}
/// @name MCStreamer Interface
/// @{
@@ -51,8 +51,9 @@ public:
virtual void FinishImpl();
- virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+ virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
report_fatal_error("unsupported directive in pure streamer");
+ return false;
}
virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
report_fatal_error("unsupported directive in pure streamer");
@@ -93,16 +94,13 @@ public:
virtual void EmitFileDirective(StringRef Filename) {
report_fatal_error("unsupported directive in pure streamer");
}
+ virtual void EmitIdent(StringRef IdentString) {
+ report_fatal_error("unsupported directive in pure streamer");
+ }
virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
StringRef Filename, unsigned CUID = 0) {
report_fatal_error("unsupported directive in pure streamer");
}
-
- /// @}
-
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_PureStreamer;
- }
};
} // end anonymous namespace.
@@ -120,7 +118,7 @@ void MCPureStreamer::EmitLabel(MCSymbol *Symbol) {
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSection().first && "Cannot emit before setting section!");
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index bf1a984..09eb3e7 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -32,6 +32,29 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
return false;
}
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+ else if (*B != '\\') // Neither " or backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
raw_ostream &OS,
const MCExpr *Subsection) const {
@@ -44,27 +67,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
return;
}
- StringRef name = getSectionName();
- if (name.find_first_not_of("0123456789_."
- "abcdefghijklmnopqrstuvwxyz"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == name.npos) {
- OS << "\t.section\t" << name;
- } else {
- OS << "\t.section\t\"";
- for (const char *b = name.begin(), *e = name.end(); b < e; ++b) {
- if (*b == '"') // Unquoted "
- OS << "\\\"";
- else if (*b != '\\') // Neither " or backslash
- OS << *b;
- else if (b + 1 == e) // Trailing backslash
- OS << "\\\\";
- else {
- OS << b[0] << b[1]; // Quoted character
- ++b;
- }
- }
- OS << '"';
- }
+ OS << "\t.section\t";
+ printName(OS, getSectionName());
// Handle the weird solaris syntax if desired.
if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
@@ -75,6 +79,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << ",#execinstr";
if (Flags & ELF::SHF_WRITE)
OS << ",#write";
+ if (Flags & ELF::SHF_EXCLUDE)
+ OS << ",#exclude";
if (Flags & ELF::SHF_TLS)
OS << ",#tls";
OS << '\n';
@@ -84,6 +90,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << ",\"";
if (Flags & ELF::SHF_ALLOC)
OS << 'a';
+ if (Flags & ELF::SHF_EXCLUDE)
+ OS << 'e';
if (Flags & ELF::SHF_EXECINSTR)
OS << 'x';
if (Flags & ELF::SHF_GROUP)
@@ -131,8 +139,11 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << "," << EntrySize;
}
- if (Flags & ELF::SHF_GROUP)
- OS << "," << Group->getName() << ",comdat";
+ if (Flags & ELF::SHF_GROUP) {
+ OS << ",";
+ printName(OS, Group->getName());
+ OS << ",comdat";
+ }
OS << '\n';
if (Subsection)
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 6542f42..2e1d69b 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -10,6 +10,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -21,10 +22,17 @@
#include <cstdlib>
using namespace llvm;
-MCStreamer::MCStreamer(StreamerKind Kind, MCContext &Ctx)
- : Kind(Kind), Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false),
- CurrentW64UnwindInfo(0), LastSymbol(0), AutoInitSections(false) {
+// Pin the vtables to this file.
+MCTargetStreamer::~MCTargetStreamer() {}
+void ARMTargetStreamer::anchor() {}
+
+MCStreamer::MCStreamer(MCContext &Ctx, MCTargetStreamer *TargetStreamer)
+ : Context(Ctx), TargetStreamer(TargetStreamer), EmitEHFrame(true),
+ EmitDebugFrame(false), CurrentW64UnwindInfo(0), LastSymbol(0),
+ AutoInitSections(false) {
SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
+ if (TargetStreamer)
+ TargetStreamer->setStreamer(this);
}
MCStreamer::~MCStreamer() {
@@ -72,6 +80,13 @@ raw_ostream &MCStreamer::GetCommentOS() {
return nulls();
}
+void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) {
+ for (std::vector<MCDwarfFrameInfo>::iterator I = FrameInfos.begin(),
+ E = FrameInfos.end(); I != E; ++I)
+ I->CompactUnwindEncoding =
+ (MAB ? MAB->generateCompactUnwindEncoding(I->Instructions) : 0);
+}
+
void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta,
const MCSymbol *Label, int PointerSize) {
// emit the sequence to set the address
@@ -183,17 +198,28 @@ void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol) {
}
+void MCStreamer::AssignSection(MCSymbol *Symbol, const MCSection *Section) {
+ if (Section)
+ Symbol->setSection(*Section);
+ else
+ Symbol->setUndefined();
+
+ // As we emit symbols into a section, track the order so that they can
+ // be sorted upon later. Zero is reserved to mean 'unemitted'.
+ SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
+}
+
void MCStreamer::EmitLabel(MCSymbol *Symbol) {
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSection().first && "Cannot emit before setting section!");
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
LastSymbol = Symbol;
}
void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) {
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSection().first && "Cannot emit before setting section!");
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
LastSymbol = Symbol;
}
@@ -380,6 +406,14 @@ void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
CurFrame->Instructions.push_back(Instruction);
}
+void MCStreamer::EmitCFIWindowSave() {
+ MCSymbol *Label = EmitCFICommon();
+ MCCFIInstruction Instruction =
+ MCCFIInstruction::createWindowSave(Label);
+ MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+ CurFrame->Instructions.push_back(Instruction);
+}
+
void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
W64UnwindInfos.push_back(Frame);
CurrentW64UnwindInfo = W64UnwindInfos.back();
@@ -470,7 +504,9 @@ void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
report_fatal_error("Frame register and offset already specified!");
if (Offset & 0x0F)
report_fatal_error("Misaligned frame pointer offset!");
- MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, 0, Register, Offset);
+ MCSymbol *Label = getContext().CreateTempSymbol();
+ MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, Label, Register, Offset);
+ EmitLabel(Label);
CurFrame->LastFrameInst = CurFrame->Instructions.size();
CurFrame->Instructions.push_back(Inst);
}
@@ -534,54 +570,10 @@ void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
llvm_unreachable("This file format doesn't support this directive");
}
-void MCStreamer::EmitFnStart() {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitFnEnd() {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitCantUnwind() {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitHandlerData() {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitPersonality(const MCSymbol *Personality) {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitPad(int64_t Offset) {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool) {
- errs() << "Not implemented yet\n";
- abort();
-}
-
-void MCStreamer::EmitTCEntry(const MCSymbol &S) {
- llvm_unreachable("Unsupported method");
-}
-
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
-void MCStreamer::EmitRawText(StringRef String) {
+void MCStreamer::EmitRawTextImpl(StringRef String) {
errs() << "EmitRawText called on an MCStreamer that doesn't support it, "
" something must not be fully mc'ized\n";
abort();
@@ -589,19 +581,18 @@ void MCStreamer::EmitRawText(StringRef String) {
void MCStreamer::EmitRawText(const Twine &T) {
SmallString<128> Str;
- T.toVector(Str);
- EmitRawText(Str.str());
+ EmitRawTextImpl(T.toStringRef(Str));
}
-void MCStreamer::EmitFrames(bool usingCFI) {
+void MCStreamer::EmitFrames(MCAsmBackend *MAB, bool usingCFI) {
if (!getNumFrameInfos())
return;
if (EmitEHFrame)
- MCDwarfFrameEmitter::Emit(*this, usingCFI, true);
+ MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, true);
if (EmitDebugFrame)
- MCDwarfFrameEmitter::Emit(*this, usingCFI, false);
+ MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, false);
}
void MCStreamer::EmitW64Tables() {
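
AssignSection both binds the symbol to a section (or marks it undefined) and records emission order in SymbolOrdering, with zero reserved to mean "unemitted". A minimal sketch of that ordering trick with stand-in types; unlike the streamer code, it guards against re-assigning an ordinal explicitly:

```cpp
#include <cstdio>
#include <map>
#include <string>

struct Symbol { std::string Name; }; // stand-in for MCSymbol

static std::map<const Symbol *, unsigned> SymbolOrdering;

static void assignOrder(const Symbol *S) {
  // First assignment wins; 1 + size() yields a strictly increasing, nonzero
  // ordinal per newly seen symbol, so 0 can mean "never emitted".
  if (!SymbolOrdering.count(S))
    SymbolOrdering[S] = 1 + SymbolOrdering.size();
}

int main() {
  Symbol A{"a"}, B{"b"};
  assignOrder(&A);
  assignOrder(&B);
  assignOrder(&A); // no effect; keeps ordinal 1
  std::printf("a=%u b=%u\n", SymbolOrdering[&A], SymbolOrdering[&B]); // 1 2
  return 0;
}
```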
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index f18828d..8d8e290 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -27,6 +27,11 @@ MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
ProcFeatures, NumFeatures);
+ InitCPUSchedModel(CPU);
+}
+
+void
+MCSubtargetInfo::InitCPUSchedModel(StringRef CPU) {
if (!CPU.empty())
CPUSchedModel = getSchedModelForCPU(CPU);
else
@@ -91,10 +96,8 @@ MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
#endif
// Find entry
- SubtargetInfoKV KV;
- KV.Key = CPU.data();
const SubtargetInfoKV *Found =
- std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, KV);
+ std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU);
if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
errs() << "'" << CPU
<< "' is not a recognized processor for this target"
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index b973c57..2416525 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -68,12 +68,23 @@ void MCSymbol::print(raw_ostream &OS) const {
// The name for this MCSymbol is required to be a valid target name. However,
// some targets support quoting names with funny characters. If the name
// contains a funny character, then print it quoted.
- if (!NameNeedsQuoting(getName())) {
- OS << getName();
+ StringRef Name = getName();
+ if (!NameNeedsQuoting(Name)) {
+ OS << Name;
return;
}
- OS << '"' << getName() << '"';
+ OS << '"';
+ for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+ char C = Name[I];
+ if (C == '\n')
+ OS << "\\n";
+ else if (C == '"')
+ OS << "\\\"";
+ else
+ OS << C;
+ }
+ OS << '"';
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
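
MCSymbol::print now escapes embedded quotes and newlines when a name needs quoting, instead of dropping the raw name between quotes. A small standalone sketch of that escaping:

```cpp
#include <cstdio>
#include <string>

static std::string quoteName(const std::string &Name) {
  std::string Out = "\"";
  for (char C : Name) {
    if (C == '\n')     Out += "\\n";   // escape embedded newline
    else if (C == '"') Out += "\\\"";  // escape embedded quote
    else               Out += C;
  }
  Out += '"';
  return Out;
}

int main() {
  std::printf("%s\n", quoteName("odd\"name").c_str()); // "odd\"name"
  return 0;
}
```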
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index c5b637c..b8b07d3 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -64,7 +64,7 @@ static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
MCWin64EHInstruction &inst) {
- uint8_t b1, b2;
+ uint8_t b2;
uint16_t w;
b2 = (inst.getOperation() & 0x0F);
switch (inst.getOperation()) {
@@ -93,8 +93,7 @@ static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
streamer.EmitIntValue(b2, 1);
break;
case Win64EH::UOP_SetFPReg:
- b1 = inst.getOffset() & 0xF0;
- streamer.EmitIntValue(b1, 1);
+ EmitAbsDifference(streamer, inst.getLabel(), begin);
streamer.EmitIntValue(b2, 1);
break;
case Win64EH::UOP_SaveNonVol:
@@ -129,14 +128,29 @@ static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
}
}
+static void EmitSymbolRefWithOfs(MCStreamer &streamer,
+ const MCSymbol *Base,
+ const MCSymbol *Other) {
+ MCContext &Context = streamer.getContext();
+ const MCSymbolRefExpr *BaseRef = MCSymbolRefExpr::Create(Base, Context);
+ const MCSymbolRefExpr *OtherRef = MCSymbolRefExpr::Create(Other, Context);
+ const MCExpr *Ofs = MCBinaryExpr::CreateSub(OtherRef, BaseRef, Context);
+ const MCSymbolRefExpr *BaseRefRel = MCSymbolRefExpr::Create(Base,
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ Context);
+ streamer.EmitValue(MCBinaryExpr::CreateAdd(BaseRefRel, Ofs, Context), 4);
+}
+
static void EmitRuntimeFunction(MCStreamer &streamer,
const MCWin64EHUnwindInfo *info) {
MCContext &context = streamer.getContext();
streamer.EmitValueToAlignment(4);
- streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4);
- streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4);
- streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4);
+ EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
+ EmitSymbolRefWithOfs(streamer, info->Function, info->End);
+ streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol,
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ context), 4);
}
static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
@@ -145,11 +159,11 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
MCContext &context = streamer.getContext();
streamer.EmitValueToAlignment(4);
- // Upper 3 bits are the version number (currently 1).
- uint8_t flags = 0x01;
info->Symbol = context.CreateTempSymbol();
streamer.EmitLabel(info->Symbol);
+ // Upper 3 bits are the version number (currently 1).
+ uint8_t flags = 0x01;
if (info->ChainedParent)
flags |= Win64EH::UNW_ChainInfo << 3;
else {
@@ -185,20 +199,26 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
EmitUnwindCode(streamer, info->Begin, inst);
}
+ // For alignment purposes, the instruction array will always have an even
+ // number of entries, with the final entry potentially unused (in which case
+ // the array will be one longer than indicated by the count of unwind codes
+ // field).
+ if (numCodes & 1) {
+ streamer.EmitIntValue(0, 2);
+ }
+
if (flags & (Win64EH::UNW_ChainInfo << 3))
EmitRuntimeFunction(streamer, info->ChainedParent);
else if (flags &
((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
- streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context),
- 4);
- else if (numCodes < 2) {
+ streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler,
+ MCSymbolRefExpr::VK_COFF_IMGREL32,
+ context), 4);
+ else if (numCodes == 0) {
// The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
// a chained unwind info, if there is no handler, and if there are fewer
// than 2 slots used in the unwind code array, we have to pad to 8 bytes.
- if (numCodes == 1)
- streamer.EmitIntValue(0, 2);
- else
- streamer.EmitIntValue(0, 4);
+ streamer.EmitIntValue(0, 4);
}
}
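
The comments above describe two padding rules for the Win64 unwind info: the unwind-code slot array is kept at an even length, and a record with no codes, no chained info, and no handler still gets 4 extra bytes so the UNWIND_INFO struct reaches its 8-byte minimum. A tiny sketch of just that arithmetic:

```cpp
#include <cstdio>

static unsigned unwindPaddingBytes(unsigned NumCodes, bool HasHandlerOrChain) {
  unsigned Pad = 0;
  if (NumCodes & 1)
    Pad += 2;                     // keep the 2-byte slot array even-sized
  if (NumCodes == 0 && !HasHandlerOrChain)
    Pad += 4;                     // pad the whole record up to 8 bytes
  return Pad;
}

int main() {
  std::printf("%u %u %u\n",
              unwindPaddingBytes(0, false),   // 4
              unwindPaddingBytes(1, false),   // 2
              unwindPaddingBytes(2, true));   // 0
  return 0;
}
```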
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index a5ba3c3..8234aff 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -20,12 +20,11 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
#include <vector>
using namespace llvm;
-using namespace llvm::object;
void MachObjectWriter::reset() {
Relocations.clear();
@@ -128,7 +127,7 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
uint32_t Flags = 0;
if (SubsectionsViaSymbols)
- Flags |= macho::HF_SubsectionsViaSymbols;
+ Flags |= MachO::MH_SUBSECTIONS_VIA_SYMBOLS;
// struct mach_header (28 bytes) or
// struct mach_header_64 (32 bytes)
@@ -136,12 +135,12 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
uint64_t Start = OS.tell();
(void) Start;
- Write32(is64Bit() ? macho::HM_Object64 : macho::HM_Object32);
+ Write32(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC);
Write32(TargetObjectWriter->getCPUType());
Write32(TargetObjectWriter->getCPUSubtype());
- Write32(macho::HFT_Object);
+ Write32(MachO::MH_OBJECT);
Write32(NumLoadCommands);
Write32(LoadCommandsSize);
Write32(Flags);
@@ -149,7 +148,7 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
Write32(0); // reserved
assert(OS.tell() - Start ==
- (is64Bit() ? macho::Header64Size : macho::Header32Size));
+ (is64Bit()?sizeof(MachO::mach_header_64): sizeof(MachO::mach_header)));
}
/// WriteSegmentLoadCommand - Write a segment load command.
@@ -167,12 +166,12 @@ void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
(void) Start;
unsigned SegmentLoadCommandSize =
- is64Bit() ? macho::SegmentLoadCommand64Size:
- macho::SegmentLoadCommand32Size;
- Write32(is64Bit() ? macho::LCT_Segment64 : macho::LCT_Segment);
+ is64Bit() ? sizeof(MachO::segment_command_64):
+ sizeof(MachO::segment_command);
+ Write32(is64Bit() ? MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT);
Write32(SegmentLoadCommandSize +
- NumSections * (is64Bit() ? macho::Section64Size :
- macho::Section32Size));
+ NumSections * (is64Bit() ? sizeof(MachO::section_64) :
+ sizeof(MachO::section)));
WriteBytes("", 16);
if (is64Bit()) {
@@ -186,8 +185,10 @@ void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
Write32(SectionDataStartOffset); // file offset
Write32(SectionDataSize); // file size
}
- Write32(0x7); // maxprot
- Write32(0x7); // initprot
+ // maxprot
+ Write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE);
+ // initprot
+ Write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE);
Write32(NumSections);
Write32(0); // flags
@@ -240,8 +241,8 @@ void MachObjectWriter::WriteSection(const MCAssembler &Asm,
if (is64Bit())
Write32(0); // reserved3
- assert(OS.tell() - Start == (is64Bit() ? macho::Section64Size :
- macho::Section32Size));
+ assert(OS.tell() - Start == (is64Bit() ? sizeof(MachO::section_64) :
+ sizeof(MachO::section)));
}
void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
@@ -253,14 +254,14 @@ void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
uint64_t Start = OS.tell();
(void) Start;
- Write32(macho::LCT_Symtab);
- Write32(macho::SymtabLoadCommandSize);
+ Write32(MachO::LC_SYMTAB);
+ Write32(sizeof(MachO::symtab_command));
Write32(SymbolOffset);
Write32(NumSymbols);
Write32(StringTableOffset);
Write32(StringTableSize);
- assert(OS.tell() - Start == macho::SymtabLoadCommandSize);
+ assert(OS.tell() - Start == sizeof(MachO::symtab_command));
}
void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
@@ -276,8 +277,8 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
uint64_t Start = OS.tell();
(void) Start;
- Write32(macho::LCT_Dysymtab);
- Write32(macho::DysymtabLoadCommandSize);
+ Write32(MachO::LC_DYSYMTAB);
+ Write32(sizeof(MachO::dysymtab_command));
Write32(FirstLocalSymbol);
Write32(NumLocalSymbols);
Write32(FirstExternalSymbol);
@@ -297,7 +298,7 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
Write32(0); // locreloff
Write32(0); // nlocrel
- assert(OS.tell() - Start == macho::DysymtabLoadCommandSize);
+ assert(OS.tell() - Start == sizeof(MachO::dysymtab_command));
}
void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
@@ -312,20 +313,20 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
//
// FIXME: Are the prebound or indirect fields possible here?
if (Symbol.isUndefined())
- Type = macho::STT_Undefined;
+ Type = MachO::N_UNDF;
else if (Symbol.isAbsolute())
- Type = macho::STT_Absolute;
+ Type = MachO::N_ABS;
else
- Type = macho::STT_Section;
+ Type = MachO::N_SECT;
// FIXME: Set STAB bits.
if (Data.isPrivateExtern())
- Type |= macho::STF_PrivateExtern;
+ Type |= MachO::N_PEXT;
// Set external bit.
if (Data.isExternal() || Symbol.isUndefined())
- Type |= macho::STF_External;
+ Type |= MachO::N_EXT;
// Compute the symbol address.
if (Symbol.isDefined()) {
@@ -341,7 +342,8 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
assert((1U << Log2Size) == Align && "Invalid 'common' alignment!");
if (Log2Size > 15)
report_fatal_error("invalid 'common' alignment '" +
- Twine(Align) + "'");
+ Twine(Align) + "' for '" + Symbol.getName() + "'",
+ false);
// FIXME: Keep this mask with the SymbolFlags enumeration.
Flags = (Flags & 0xF0FF) | (Log2Size << 8);
}
@@ -369,17 +371,17 @@ void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type,
(void) Start;
Write32(Type);
- Write32(macho::LinkeditLoadCommandSize);
+ Write32(sizeof(MachO::linkedit_data_command));
Write32(DataOffset);
Write32(DataSize);
- assert(OS.tell() - Start == macho::LinkeditLoadCommandSize);
+ assert(OS.tell() - Start == sizeof(MachO::linkedit_data_command));
}
static unsigned ComputeLinkerOptionsLoadCommandSize(
const std::vector<std::string> &Options, bool is64Bit)
{
- unsigned Size = sizeof(macho::LinkerOptionsLoadCommand);
+ unsigned Size = sizeof(MachO::linker_options_command);
for (unsigned i = 0, e = Options.size(); i != e; ++i)
Size += Options[i].size() + 1;
return RoundUpToAlignment(Size, is64Bit ? 8 : 4);
@@ -392,10 +394,10 @@ void MachObjectWriter::WriteLinkerOptionsLoadCommand(
uint64_t Start = OS.tell();
(void) Start;
- Write32(macho::LCT_LinkerOptions);
+ Write32(MachO::LC_LINKER_OPTIONS);
Write32(Size);
Write32(Options.size());
- uint64_t BytesWritten = sizeof(macho::LinkerOptionsLoadCommand);
+ uint64_t BytesWritten = sizeof(MachO::linker_options_command);
for (unsigned i = 0, e = Options.size(); i != e; ++i) {
// Write each string, including the null byte.
const std::string &Option = Options[i];
@@ -428,6 +430,22 @@ void MachObjectWriter::BindIndirectSymbols(MCAssembler &Asm) {
//
// FIXME: Revisit this when the dust settles.
+ // Report errors for use of .indirect_symbol not in a symbol pointer section
+ // or stub section.
+ for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
+ ie = Asm.indirect_symbol_end(); it != ie; ++it) {
+ const MCSectionMachO &Section =
+ cast<MCSectionMachO>(it->SectionData->getSection());
+
+ if (Section.getType() != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS &&
+ Section.getType() != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+ Section.getType() != MCSectionMachO::S_SYMBOL_STUBS) {
+ MCSymbol &Symbol = *it->Symbol;
+ report_fatal_error("indirect symbol '" + Symbol.getName() +
+ "' not in a symbol pointer or stub section");
+ }
+ }
+
// Bind non lazy symbol pointers first.
unsigned IndirectIndex = 0;
for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
@@ -723,14 +741,14 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
// section headers) and the symbol table.
unsigned NumLoadCommands = 1;
uint64_t LoadCommandsSize = is64Bit() ?
- macho::SegmentLoadCommand64Size + NumSections * macho::Section64Size :
- macho::SegmentLoadCommand32Size + NumSections * macho::Section32Size;
+ sizeof(MachO::segment_command_64) + NumSections * sizeof(MachO::section_64):
+ sizeof(MachO::segment_command) + NumSections * sizeof(MachO::section);
// Add the data-in-code load command size, if used.
unsigned NumDataRegions = Asm.getDataRegions().size();
if (NumDataRegions) {
++NumLoadCommands;
- LoadCommandsSize += macho::LinkeditLoadCommandSize;
+ LoadCommandsSize += sizeof(MachO::linkedit_data_command);
}
// Add the symbol table load command sizes, if used.
@@ -738,8 +756,8 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
UndefinedSymbolData.size();
if (NumSymbols) {
NumLoadCommands += 2;
- LoadCommandsSize += (macho::SymtabLoadCommandSize +
- macho::DysymtabLoadCommandSize);
+ LoadCommandsSize += (sizeof(MachO::symtab_command) +
+ sizeof(MachO::dysymtab_command));
}
// Add the linker option load commands sizes.
@@ -753,8 +771,8 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
// Compute the total size of the section data, as well as its file size and vm
// size.
- uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size :
- macho::Header32Size) + LoadCommandsSize;
+ uint64_t SectionDataStart = (is64Bit() ? sizeof(MachO::mach_header_64) :
+ sizeof(MachO::mach_header)) + LoadCommandsSize;
uint64_t SectionDataSize = 0;
uint64_t SectionDataFileSize = 0;
uint64_t VMSize = 0;
@@ -791,11 +809,11 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
for (MCAssembler::const_iterator it = Asm.begin(),
ie = Asm.end(); it != ie; ++it) {
- std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+ std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
unsigned NumRelocs = Relocs.size();
uint64_t SectionStart = SectionDataStart + getSectionAddress(it);
WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs);
- RelocTableEnd += NumRelocs * macho::RelocationInfoSize;
+ RelocTableEnd += NumRelocs * sizeof(MachO::any_relocation_info);
}
// Write the data-in-code load command, if used.
@@ -803,7 +821,7 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
if (NumDataRegions) {
uint64_t DataRegionsOffset = RelocTableEnd;
uint64_t DataRegionsSize = NumDataRegions * 8;
- WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset,
+ WriteLinkeditLoadCommand(MachO::LC_DATA_IN_CODE, DataRegionsOffset,
DataRegionsSize);
}
@@ -830,8 +848,9 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
// The string table is written after symbol table.
uint64_t StringTableOffset =
- SymbolTableOffset + NumSymTabSymbols * (is64Bit() ? macho::Nlist64Size :
- macho::Nlist32Size);
+ SymbolTableOffset + NumSymTabSymbols * (is64Bit() ?
+ sizeof(MachO::nlist_64) :
+ sizeof(MachO::nlist));
WriteSymtabLoadCommand(SymbolTableOffset, NumSymTabSymbols,
StringTableOffset, StringTable.size());
@@ -864,10 +883,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
ie = Asm.end(); it != ie; ++it) {
// Write the section relocation entries, in reverse order to match 'as'
// (approximately, the exact algorithm is more complicated than this).
- std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+ std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
- Write32(Relocs[e - i - 1].Word0);
- Write32(Relocs[e - i - 1].Word1);
+ Write32(Relocs[e - i - 1].r_word0);
+ Write32(Relocs[e - i - 1].r_word1);
}
}
@@ -906,9 +925,9 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
// If this symbol is defined and internal, mark it as such.
if (it->Symbol->isDefined() &&
!Asm.getSymbolData(*it->Symbol).isExternal()) {
- uint32_t Flags = macho::ISF_Local;
+ uint32_t Flags = MachO::INDIRECT_SYMBOL_LOCAL;
if (it->Symbol->isAbsolute())
- Flags |= macho::ISF_Absolute;
+ Flags |= MachO::INDIRECT_SYMBOL_ABS;
Write32(Flags);
continue;
}
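
The MachObjectWriter changes above replace hand-maintained size constants (macho::Header64Size, macho::SegmentLoadCommand64Size, and so on) with sizeof() over the MachO struct definitions. A standalone sketch of that sizing approach; the struct layouts here are illustrative stand-ins, not the headers LLVM actually uses:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in layouts with the same field widths as the 64-bit Mach-O structs.
struct segment_command_64 {
  uint32_t cmd, cmdsize; char segname[16];
  uint64_t vmaddr, vmsize, fileoff, filesize;
  uint32_t maxprot, initprot, nsects, flags;
};
struct section_64 {
  char sectname[16], segname[16];
  uint64_t addr, size;
  uint32_t offset, align, reloff, nreloc, flags,
           reserved1, reserved2, reserved3;
};

int main() {
  unsigned NumSections = 3;
  uint64_t LoadCommandsSize =
      sizeof(segment_command_64) + NumSections * sizeof(section_64);
  std::printf("load commands: %llu bytes\n",
              (unsigned long long)LoadCommandsSize); // 72 + 3*80 = 312
  return 0;
}
```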
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 7625abd..2fb91f2 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -121,13 +121,10 @@ void SubtargetFeatures::AddFeature(const StringRef String,
/// Find KV in array using binary search.
static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A,
size_t L) {
- // Make the lower bound element we're looking for
- SubtargetFeatureKV KV;
- KV.Key = S.data();
// Determine the end of the array
const SubtargetFeatureKV *Hi = A + L;
// Binary search the array
- const SubtargetFeatureKV *F = std::lower_bound(A, Hi, KV);
+ const SubtargetFeatureKV *F = std::lower_bound(A, Hi, S);
// If not found then return NULL
if (F == Hi || StringRef(F->Key) != S) return NULL;
// Return the found array item
@@ -353,8 +350,7 @@ void SubtargetFeatures::dump() const {
}
#endif
-/// getDefaultSubtargetFeatures - Return a string listing the features
-/// associated with the target triple.
+/// Adds the default features for the specified target triple.
///
/// FIXME: This is an inelegant way of specifying the features of a
/// subtarget. It would be better if we could encode this information
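
Find and getSchedModelForCPU now call std::lower_bound with the string key directly instead of building a temporary KV record, which relies on the table's element type being comparable against the key. A minimal sketch of that heterogeneous binary search in standard C++:

```cpp
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <string>

struct KV { const char *Key; int Value; }; // stand-in for SubtargetFeatureKV

// Comparison used by std::lower_bound(A, A + N, Key).
static bool operator<(const KV &Entry, const std::string &Key) {
  return std::strcmp(Entry.Key, Key.c_str()) < 0;
}

static const KV *find(const std::string &Key, const KV *A, size_t N) {
  const KV *End = A + N;
  const KV *F = std::lower_bound(A, End, Key); // table must be sorted by Key
  if (F == End || Key != F->Key)
    return nullptr;
  return F;
}

int main() {
  static const KV Table[] = {{"cortex-a9", 1}, {"generic", 2}, {"swift", 3}};
  const KV *Hit = find("generic", Table, 3);
  std::printf("%s\n", Hit ? "found" : "missing"); // found
  return 0;
}
```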
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 4e26934..d9ca86d 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -138,7 +138,7 @@ public:
symbol_map SymbolMap;
WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW, raw_ostream &OS);
- ~WinCOFFObjectWriter();
+ virtual ~WinCOFFObjectWriter();
COFFSymbol *createSymbol(StringRef Name);
COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol * Symbol);
@@ -148,13 +148,12 @@ public:
object_t *createCOFFEntity(StringRef Name, list_t &List);
void DefineSection(MCSectionData const &SectionData);
- void DefineSymbol(MCSymbolData const &SymbolData,
- MCAssembler &Assembler);
+ void DefineSymbol(MCSymbolData const &SymbolData, MCAssembler &Assembler,
+ const MCAsmLayout &Layout);
void MakeSymbolReal(COFFSymbol &S, size_t Index);
void MakeSectionReal(COFFSection &S, size_t Number);
- bool ExportSection(COFFSection const *S);
bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm);
bool IsPhysicalSection(COFFSection *S);
@@ -190,17 +189,6 @@ static inline void write_uint32_le(void *Data, uint32_t const &Value) {
Ptr[3] = (Value & 0xFF000000) >> 24;
}
-static inline void write_uint16_le(void *Data, uint16_t const &Value) {
- uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
- Ptr[0] = (Value & 0x00FF) >> 0;
- Ptr[1] = (Value & 0xFF00) >> 8;
-}
-
-static inline void write_uint8_le(void *Data, uint8_t const &Value) {
- uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
- Ptr[0] = (Value & 0xFF) >> 0;
-}
-
//------------------------------------------------------------------------------
// Symbol class implementation
@@ -411,7 +399,8 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) {
/// This function takes a section data object from the assembler
/// and creates the associated COFF symbol staging object.
void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
- MCAssembler &Assembler) {
+ MCAssembler &Assembler,
+ const MCAsmLayout &Layout) {
MCSymbol const &Symbol = SymbolData.getSymbol();
COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
SymbolMap[&Symbol] = coff_symbol;
@@ -452,6 +441,12 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
const MCSymbolData &ResSymData =
Assembler.getSymbolData(Symbol.AliasedSymbol());
+ if (Symbol.isVariable()) {
+ int64_t Addr;
+ if (Symbol.getVariableValue()->EvaluateAsAbsolute(Addr, Layout))
+ coff_symbol->Data.Value = Addr;
+ }
+
coff_symbol->Data.Type = (ResSymData.getFlags() & 0x0000FFFF) >> 0;
coff_symbol->Data.StorageClass = (ResSymData.getFlags() & 0x00FF0000) >> 16;
@@ -463,7 +458,9 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
}
- if (ResSymData.Fragment != NULL)
+ if (Symbol.isAbsolute() || Symbol.AliasedSymbol().isVariable())
+ coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+ else if (ResSymData.Fragment != NULL)
coff_symbol->Section =
SectionMap[&ResSymData.Fragment->getParent()->getSection()];
@@ -508,10 +505,6 @@ void WinCOFFObjectWriter::MakeSymbolReal(COFFSymbol &S, size_t Index) {
S.Index = Index;
}
-bool WinCOFFObjectWriter::ExportSection(COFFSection const *S) {
- return !S->MCData->getFragmentList().empty();
-}
-
bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData,
MCAssembler &Asm) {
// This doesn't seem to be right. Strings referred to from the .data section
@@ -625,9 +618,10 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
DefineSection(*i);
for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(),
- e = Asm.symbol_end(); i != e; i++) {
+ e = Asm.symbol_end();
+ i != e; i++) {
if (ExportSymbol(*i, Asm)) {
- DefineSymbol(*i, Asm);
+ DefineSymbol(*i, Asm, Layout);
}
}
}
@@ -921,6 +915,9 @@ MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) :
Machine(Machine_) {
}
+// Pin the vtable to this file.
+void MCWinCOFFObjectTargetWriter::anchor() {}
+
//------------------------------------------------------------------------------
// WinCOFFObjectWriter factory function
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 04bfeb4..5b5aad7 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -55,7 +55,7 @@ public:
virtual void EmitDebugLabel(MCSymbol *Symbol);
virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
virtual void EmitThumbFunc(MCSymbol *Func);
- virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+ virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
virtual void BeginCOFFSymbolDef(MCSymbol const *Symbol);
virtual void EmitCOFFSymbolStorageClass(int StorageClass);
@@ -72,13 +72,10 @@ public:
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment);
virtual void EmitFileDirective(StringRef Filename);
+ virtual void EmitIdent(StringRef IdentString);
virtual void EmitWin64EHHandlerData();
virtual void FinishImpl();
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_WinCOFFStreamer;
- }
-
private:
virtual void EmitInstToData(const MCInst &Inst) {
MCDataFragment *DF = getOrCreateDataFragment();
@@ -134,8 +131,7 @@ private:
WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
MCCodeEmitter &CE, raw_ostream &OS)
- : MCObjectStreamer(SK_WinCOFFStreamer, Context, MAB, OS, &CE),
- CurSymbol(NULL) {}
+ : MCObjectStreamer(Context, 0, MAB, OS, &CE), CurSymbol(NULL) {}
void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment, bool External) {
@@ -155,7 +151,8 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
int Selection = COFF::IMAGE_COMDAT_SELECT_LARGEST;
const MCSection *Section = MCStreamer::getContext().getCOFFSection(
- SectionName, Characteristics, SectionKind::getBSS(), Selection);
+ SectionName, Characteristics, SectionKind::getBSS(), Symbol->getName(),
+ Selection);
MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section);
@@ -164,7 +161,7 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
SymbolData.setExternal(External);
- Symbol->setSection(*Section);
+ AssignSection(Symbol, Section);
if (ByteAlignment != 1)
new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectionData);
@@ -201,7 +198,7 @@ void WinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) {
llvm_unreachable("not implemented");
}
-void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) {
assert(Symbol && "Symbol must be non-null!");
assert((Symbol->isInSection()
@@ -221,8 +218,10 @@ void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
break;
default:
- llvm_unreachable("unsupported attribute");
+ return false;
}
+
+ return true;
}
void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -309,6 +308,11 @@ void WinCOFFStreamer::EmitFileDirective(StringRef Filename) {
// info will be a much large effort.
}
+// TODO: Implement this if you want to emit .comment section in COFF obj files.
+void WinCOFFStreamer::EmitIdent(StringRef IdentString) {
+ llvm_unreachable("unsupported directive");
+}
+
void WinCOFFStreamer::EmitWin64EHHandlerData() {
MCStreamer::EmitWin64EHHandlerData();
@@ -318,6 +322,7 @@ void WinCOFFStreamer::EmitWin64EHHandlerData() {
}
void WinCOFFStreamer::FinishImpl() {
+ EmitFrames(NULL, true);
EmitW64Tables();
MCObjectStreamer::FinishImpl();
}
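Aside: EmitSymbolAttribute changing from void to bool above lets a streamer
report symbol attributes it cannot encode instead of hitting llvm_unreachable,
so callers can recover or emit a proper diagnostic. A minimal standalone sketch
of the same convention (the enum, function, and messages below are made up for
illustration and are not the MC API):

#include <cstdio>

enum Attr { AttrGlobal, AttrWeak, AttrThumbFunc };

// Returns false for attributes this (imaginary) object format cannot encode.
static bool emitAttribute(Attr A) {
  switch (A) {
  case AttrGlobal:
  case AttrWeak:
    return true;   // supported: a real streamer would record the attribute here
  default:
    return false;  // unsupported: let the caller decide how to report it
  }
}

int main() {
  if (!emitAttribute(AttrThumbFunc))
    std::fprintf(stderr, "error: symbol attribute not supported\n");
  return 0;
}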
diff --git a/lib/Makefile b/lib/Makefile
index 0a4435e..2ed0636 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,9 +10,8 @@ LEVEL = ..
include $(LEVEL)/Makefile.config
-PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen \
- Target ExecutionEngine Linker MC Object Option DebugInfo \
- IRReader
+PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen Target \
+ ExecutionEngine Linker LTO MC Object Option DebugInfo \
+ IRReader
include $(LEVEL)/Makefile.common
-
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index fd9d3b4..de57b4c 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -91,6 +91,7 @@ error_code object::createBinary(MemoryBuffer *Source,
return object_error::success;
}
case sys::fs::file_magic::coff_object:
+ case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable: {
OwningPtr<Binary> ret(
ObjectFile::createCOFFObjectFile(scopedSource.take()));
@@ -100,7 +101,8 @@ error_code object::createBinary(MemoryBuffer *Source,
return object_error::success;
}
case sys::fs::file_magic::unknown:
- case sys::fs::file_magic::bitcode: {
+ case sys::fs::file_magic::bitcode:
+ case sys::fs::file_magic::windows_resource: {
// Unrecognized object file format.
return object_error::invalid_file_type;
}
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 2c2cc8e..1f07cbb 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMObject
Binary.cpp
COFFObjectFile.cpp
COFFYAML.cpp
+ ELF.cpp
ELFObjectFile.cpp
ELFYAML.cpp
Error.cpp
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index cb029f9..3434e70 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -16,6 +16,9 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
#include <ctype.h>
@@ -111,10 +114,8 @@ error_code COFFObjectFile::getSymbolFileOffset(DataRefImpl Symb,
const coff_section *Section = NULL;
if (error_code ec = getSection(symb->SectionNumber, Section))
return ec;
- char Type;
- if (error_code ec = getSymbolNMTypeChar(Symb, Type))
- return ec;
- if (Type == 'U' || Type == 'w')
+
+ if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
Result = UnknownAddressOrSize;
else if (Section)
Result = Section->PointerToRawData + symb->Value;
@@ -129,10 +130,8 @@ error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
const coff_section *Section = NULL;
if (error_code ec = getSection(symb->SectionNumber, Section))
return ec;
- char Type;
- if (error_code ec = getSymbolNMTypeChar(Symb, Type))
- return ec;
- if (Type == 'U' || Type == 'w')
+
+ if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
Result = UnknownAddressOrSize;
else if (Section)
Result = Section->VirtualAddress + symb->Value;
@@ -152,12 +151,16 @@ error_code COFFObjectFile::getSymbolType(DataRefImpl Symb,
if (symb->getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
Result = SymbolRef::ST_Function;
} else {
- char Type;
- if (error_code ec = getSymbolNMTypeChar(Symb, Type))
- return ec;
- if (Type == 'r' || Type == 'R') {
- Result = SymbolRef::ST_Data;
+ uint32_t Characteristics = 0;
+ if (symb->SectionNumber > 0) {
+ const coff_section *Section = NULL;
+ if (error_code ec = getSection(symb->SectionNumber, Section))
+ return ec;
+ Characteristics = Section->Characteristics;
}
+ if (Characteristics & COFF::IMAGE_SCN_MEM_READ &&
+ ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
+ Result = SymbolRef::ST_Data;
}
}
return object_error::success;
@@ -196,10 +199,8 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
const coff_section *Section = NULL;
if (error_code ec = getSection(symb->SectionNumber, Section))
return ec;
- char Type;
- if (error_code ec = getSymbolNMTypeChar(Symb, Type))
- return ec;
- if (Type == 'U' || Type == 'w')
+
+ if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
Result = UnknownAddressOrSize;
else if (Section)
Result = Section->SizeOfRawData - symb->Value;
@@ -208,74 +209,6 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
return object_error::success;
}
-error_code COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
- char &Result) const {
- const coff_symbol *symb = toSymb(Symb);
- StringRef name;
- if (error_code ec = getSymbolName(Symb, name))
- return ec;
- char ret = StringSwitch<char>(name)
- .StartsWith(".debug", 'N')
- .StartsWith(".sxdata", 'N')
- .Default('?');
-
- if (ret != '?') {
- Result = ret;
- return object_error::success;
- }
-
- uint32_t Characteristics = 0;
- if (symb->SectionNumber > 0) {
- const coff_section *Section = NULL;
- if (error_code ec = getSection(symb->SectionNumber, Section))
- return ec;
- Characteristics = Section->Characteristics;
- }
-
- switch (symb->SectionNumber) {
- case COFF::IMAGE_SYM_UNDEFINED:
- // Check storage classes.
- if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL) {
- Result = 'w';
- return object_error::success; // Don't do ::toupper.
- } else if (symb->Value != 0) // Check for common symbols.
- ret = 'c';
- else
- ret = 'u';
- break;
- case COFF::IMAGE_SYM_ABSOLUTE:
- ret = 'a';
- break;
- case COFF::IMAGE_SYM_DEBUG:
- ret = 'n';
- break;
- default:
- // Check section type.
- if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
- ret = 't';
- else if ( Characteristics & COFF::IMAGE_SCN_MEM_READ
- && ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
- ret = 'r';
- else if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
- ret = 'd';
- else if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
- ret = 'b';
- else if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
- ret = 'i';
-
- // Check for section symbol.
- else if ( symb->StorageClass == COFF::IMAGE_SYM_CLASS_STATIC
- && symb->Value == 0)
- ret = 's';
- }
-
- if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
- ret = ::toupper(static_cast<unsigned char>(ret));
-
- Result = ret;
- return object_error::success;
-}
-
error_code COFFObjectFile::getSymbolSection(DataRefImpl Symb,
section_iterator &Result) const {
const coff_symbol *symb = toSymb(Symb);
@@ -406,7 +339,7 @@ error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec,
return object_error::success;
}
-relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+relocation_iterator COFFObjectFile::section_rel_begin(DataRefImpl Sec) const {
const coff_section *sec = toSec(Sec);
DataRefImpl ret;
if (sec->NumberOfRelocations == 0)
@@ -417,7 +350,7 @@ relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
return relocation_iterator(RelocationRef(ret, this));
}
-relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Sec) const {
const coff_section *sec = toSec(Sec);
DataRefImpl ret;
if (sec->NumberOfRelocations == 0)
@@ -431,6 +364,94 @@ relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
return relocation_iterator(RelocationRef(ret, this));
}
+// Initialize the pointer to the symbol table.
+error_code COFFObjectFile::initSymbolTablePtr() {
+ if (error_code ec = getObject(
+ SymbolTable, Data, base() + COFFHeader->PointerToSymbolTable,
+ COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
+ return ec;
+
+  // Find the string table. The first four bytes of the string table contain
+  // the total size of the string table, including the size field itself. If
+  // the string table is empty, the value of the first four bytes is 4.
+ const uint8_t *StringTableAddr =
+ base() + COFFHeader->PointerToSymbolTable +
+ COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
+ const ulittle32_t *StringTableSizePtr;
+ if (error_code ec = getObject(StringTableSizePtr, Data, StringTableAddr))
+ return ec;
+ StringTableSize = *StringTableSizePtr;
+ if (error_code ec =
+ getObject(StringTable, Data, StringTableAddr, StringTableSize))
+ return ec;
+
+  // Check that the string table is null-terminated if it has any data in it.
+ if (StringTableSize < 4 ||
+ (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0))
+ return object_error::parse_failed;
+ return object_error::success;
+}
+
+// Returns the file offset for the given RVA.
+error_code COFFObjectFile::getRvaPtr(uint32_t Rva, uintptr_t &Res) const {
+ error_code ec;
+ for (section_iterator i = begin_sections(), e = end_sections(); i != e;
+ i.increment(ec)) {
+ if (ec)
+ return ec;
+ const coff_section *Section = getCOFFSection(i);
+ uint32_t SectionStart = Section->VirtualAddress;
+ uint32_t SectionEnd = Section->VirtualAddress + Section->VirtualSize;
+ if (SectionStart <= Rva && Rva < SectionEnd) {
+ uint32_t Offset = Rva - SectionStart;
+ Res = uintptr_t(base()) + Section->PointerToRawData + Offset;
+ return object_error::success;
+ }
+ }
+ return object_error::parse_failed;
+}
+
+// Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name
+// table entry.
+error_code COFFObjectFile::
+getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const {
+ uintptr_t IntPtr = 0;
+ if (error_code ec = getRvaPtr(Rva, IntPtr))
+ return ec;
+ const uint8_t *Ptr = reinterpret_cast<const uint8_t *>(IntPtr);
+ Hint = *reinterpret_cast<const ulittle16_t *>(Ptr);
+ Name = StringRef(reinterpret_cast<const char *>(Ptr + 2));
+ return object_error::success;
+}
+
+// Find the import table.
+error_code COFFObjectFile::initImportTablePtr() {
+ // First, we get the RVA of the import table. If the file lacks a pointer to
+ // the import table, do nothing.
+ const data_directory *DataEntry;
+ if (getDataDirectory(COFF::IMPORT_TABLE, DataEntry))
+ return object_error::success;
+
+  // Do nothing if the pointer to the import table is NULL.
+ if (DataEntry->RelativeVirtualAddress == 0)
+ return object_error::success;
+
+ uint32_t ImportTableRva = DataEntry->RelativeVirtualAddress;
+ NumberOfImportDirectory = DataEntry->Size /
+ sizeof(import_directory_table_entry);
+
+ // Find the section that contains the RVA. This is needed because the RVA is
+  // the import table's memory address, which is different from its file offset.
+ uintptr_t IntPtr = 0;
+ if (error_code ec = getRvaPtr(ImportTableRva, IntPtr))
+ return ec;
+ ImportDirectory = reinterpret_cast<
+ const import_directory_table_entry *>(IntPtr);
+
+ // It's an error if there's no section containing the Import Table RVA.
+ return object_error::parse_failed;
+}
+
COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
: ObjectFile(Binary::ID_COFF, Object)
, COFFHeader(0)
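Aside: getRvaPtr above maps a relative virtual address back to a location in
the file by finding the section whose virtual address range contains it, then
adding the offset within that section to the section's raw-data file position.
A minimal standalone sketch of the same arithmetic (the struct and function
names are hypothetical and only mirror the fields used in the patch):

#include <cstddef>
#include <cstdint>
#include <vector>

struct SectionInfo {
  uint32_t VirtualAddress;   // RVA at which the section is mapped
  uint32_t VirtualSize;      // size of the section once mapped
  uint32_t PointerToRawData; // file offset of the section's contents
};

// Returns true and sets FileOffset when some section's range
// [VirtualAddress, VirtualAddress + VirtualSize) contains Rva.
static bool rvaToFileOffset(const std::vector<SectionInfo> &Sections,
                            uint32_t Rva, uint32_t &FileOffset) {
  for (std::size_t I = 0; I != Sections.size(); ++I) {
    const SectionInfo &S = Sections[I];
    if (S.VirtualAddress <= Rva && Rva < S.VirtualAddress + S.VirtualSize) {
      FileOffset = S.PointerToRawData + (Rva - S.VirtualAddress);
      return true;
    }
  }
  return false; // no section contains the RVA -- the parse_failed case above
}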
@@ -439,7 +460,9 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
, SectionTable(0)
, SymbolTable(0)
, StringTable(0)
- , StringTableSize(0) {
+ , StringTableSize(0)
+ , ImportDirectory(0)
+ , NumberOfImportDirectory(0) {
// Check that we at least have enough room for a header.
if (!checkSize(Data, ec, sizeof(coff_file_header))) return;
@@ -486,49 +509,33 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
CurPtr += COFFHeader->SizeOfOptionalHeader;
}
- if ((ec = getObject(SectionTable, Data, base() + CurPtr,
- COFFHeader->NumberOfSections * sizeof(coff_section))))
- return;
-
- if (COFFHeader->PointerToSymbolTable != 0) {
- if ((ec = getObject(SymbolTable, Data,
- base() + COFFHeader->PointerToSymbolTable,
- COFFHeader->NumberOfSymbols * sizeof(coff_symbol))))
+ if (!COFFHeader->isImportLibrary())
+ if ((ec = getObject(SectionTable, Data, base() + CurPtr,
+ COFFHeader->NumberOfSections * sizeof(coff_section))))
return;
- // Find string table. The first four byte of the string table contains the
- // total size of the string table, including the size field itself. If the
- // string table is empty, the value of the first four byte would be 4.
- const uint8_t *StringTableAddr = base() + COFFHeader->PointerToSymbolTable
- + COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
- const ulittle32_t *StringTableSizePtr;
- if ((ec = getObject(StringTableSizePtr, Data, StringTableAddr)))
- return;
- StringTableSize = *StringTableSizePtr;
- if ((ec = getObject(StringTable, Data, StringTableAddr, StringTableSize)))
+ // Initialize the pointer to the symbol table.
+ if (COFFHeader->PointerToSymbolTable != 0)
+ if ((ec = initSymbolTablePtr()))
return;
- // Check that the string table is null terminated if has any in it.
- if (StringTableSize < 4
- || (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0)) {
- ec = object_error::parse_failed;
- return;
- }
- }
+ // Initialize the pointer to the beginning of the import table.
+ if ((ec = initImportTablePtr()))
+ return;
ec = object_error::success;
}
symbol_iterator COFFObjectFile::begin_symbols() const {
DataRefImpl ret;
- ret.p = reinterpret_cast<intptr_t>(SymbolTable);
+ ret.p = reinterpret_cast<uintptr_t>(SymbolTable);
return symbol_iterator(SymbolRef(ret, this));
}
symbol_iterator COFFObjectFile::end_symbols() const {
// The symbol table ends where the string table begins.
DataRefImpl ret;
- ret.p = reinterpret_cast<intptr_t>(StringTable);
+ ret.p = reinterpret_cast<uintptr_t>(StringTable);
return symbol_iterator(SymbolRef(ret, this));
}
@@ -557,16 +564,34 @@ StringRef COFFObjectFile::getLoadName() const {
return "";
}
+import_directory_iterator COFFObjectFile::import_directory_begin() const {
+ DataRefImpl Imp;
+ Imp.p = reinterpret_cast<uintptr_t>(ImportDirectory);
+ return import_directory_iterator(ImportDirectoryEntryRef(Imp, this));
+}
+
+import_directory_iterator COFFObjectFile::import_directory_end() const {
+ DataRefImpl Imp;
+ if (ImportDirectory) {
+ Imp.p = reinterpret_cast<uintptr_t>(
+ ImportDirectory + (NumberOfImportDirectory - 1));
+ } else {
+ Imp.p = 0;
+ }
+ return import_directory_iterator(ImportDirectoryEntryRef(Imp, this));
+}
section_iterator COFFObjectFile::begin_sections() const {
DataRefImpl ret;
- ret.p = reinterpret_cast<intptr_t>(SectionTable);
+ ret.p = reinterpret_cast<uintptr_t>(SectionTable);
return section_iterator(SectionRef(ret, this));
}
section_iterator COFFObjectFile::end_sections() const {
DataRefImpl ret;
- ret.p = reinterpret_cast<intptr_t>(SectionTable + COFFHeader->NumberOfSections);
+ int numSections = COFFHeader->isImportLibrary()
+ ? 0 : COFFHeader->NumberOfSections;
+ ret.p = reinterpret_cast<uintptr_t>(SectionTable + numSections);
return section_iterator(SectionRef(ret, this));
}
@@ -678,7 +703,7 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol,
ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
const coff_symbol *symbol) const {
const uint8_t *aux = NULL;
-
+
if ( symbol->NumberOfAuxSymbols > 0 ) {
// AUX data comes immediately after the symbol in COFF
aux = reinterpret_cast<const uint8_t *>(symbol + 1);
@@ -779,7 +804,6 @@ const coff_relocation *COFFObjectFile::getCOFFRelocation(
return toRel(It->getRawDataRefImpl());
}
-
#define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \
case COFF::enum: res = #enum; break;
@@ -860,6 +884,52 @@ error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
report_fatal_error("getLibraryPath not implemented in COFFObjectFile");
}
+bool ImportDirectoryEntryRef::
+operator==(const ImportDirectoryEntryRef &Other) const {
+ return ImportDirectoryPimpl == Other.ImportDirectoryPimpl;
+}
+
+static const import_directory_table_entry *toImportEntry(DataRefImpl Imp) {
+ return reinterpret_cast<const import_directory_table_entry *>(Imp.p);
+}
+
+error_code
+ImportDirectoryEntryRef::getNext(ImportDirectoryEntryRef &Result) const {
+ const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+ Dir += 1;
+ DataRefImpl Next;
+ Next.p = reinterpret_cast<uintptr_t>(Dir);
+ Result = ImportDirectoryEntryRef(Next, OwningObject);
+ return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::
+getImportTableEntry(const import_directory_table_entry *&Result) const {
+ Result = toImportEntry(ImportDirectoryPimpl);
+ return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
+ const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+ uintptr_t IntPtr = 0;
+ if (error_code ec = OwningObject->getRvaPtr(Dir->NameRVA, IntPtr))
+ return ec;
+ const char *Ptr = reinterpret_cast<const char *>(IntPtr);
+ Result = StringRef(Ptr);
+ return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::getImportLookupEntry(
+ const import_lookup_table_entry32 *&Result) const {
+ const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+ uintptr_t IntPtr = 0;
+ if (error_code ec = OwningObject->getRvaPtr(
+ Dir->ImportLookupTableRVA, IntPtr))
+ return ec;
+ Result = reinterpret_cast<const import_lookup_table_entry32 *>(IntPtr);
+ return object_error::success;
+}
+
namespace llvm {
ObjectFile *ObjectFile::createCOFFObjectFile(MemoryBuffer *Object) {
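Aside: getHintName above decodes a PE Hint/Name table entry, which is a 16-bit
hint stored in little-endian byte order immediately followed by a NUL-terminated
import name. A minimal standalone sketch of that layout (the struct and function
names are made up; only the byte layout matches the code in the patch):

#include <cstdint>
#include <string>

struct HintName {
  uint16_t Hint;    // hint the loader uses to speed up the export-name lookup
  std::string Name; // imported symbol name
};

// Entry must point at a Hint/Name table entry that has already been located,
// e.g. by translating its RVA to a pointer into the mapped file.
static HintName decodeHintName(const uint8_t *Entry) {
  HintName Result;
  // First two bytes: the hint, little-endian.
  Result.Hint = static_cast<uint16_t>(Entry[0] | (Entry[1] << 8));
  // The name starts at offset 2 and runs to the terminating NUL.
  Result.Name = reinterpret_cast<const char *>(Entry + 2);
  return Result;
}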
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
new file mode 100644
index 0000000..7c80d41
--- /dev/null
+++ b/lib/Object/ELF.cpp
@@ -0,0 +1,714 @@
+//===- ELF.cpp - ELF object file implementation -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELF.h"
+
+namespace llvm {
+namespace object {
+
+#define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum) \
+ case ELF::enum: \
+ return #enum; \
+
+StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
+ switch (Machine) {
+ case ELF::EM_X86_64:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32S);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPMOD64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSGD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSLD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTTPOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPLT64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLTOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32_TLSDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_IRELATIVE);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_386:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTPC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32PLT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTIE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_PUSH);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_POP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_PUSH);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_POP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDO_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_IRELATIVE);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_MIPS:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_26);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LITERAL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_DISP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_PAGE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_OFST);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SUB);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_A);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_B);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_DELETE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHER);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHEST);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SCN_DISP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_ADD_IMMEDIATE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PJUMP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_RELGOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JALR);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_LDM);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GOTTPREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_26_S1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_PC16_S1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_CALL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_DISP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_PAGE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_OFST);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NUM);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_AARCH64:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G3);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD_PREL_LO19);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_LO21);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_PG_HI21);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADD_ABS_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST8_ABS_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TSTBR14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CONDBR19);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP26);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CALL26);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST16_ABS_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_HI12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADR_PAGE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_ARM:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_HEXAGON:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_3);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B32_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_12_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_10_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_9_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_7_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_JMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_PLT_B22_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPMOD_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_PLT_B22_PCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_LO16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_HI16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_PCREL_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_11_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32_6_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16_X);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_11_X);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_PPC:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRNTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRNTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSGD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSLD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HA);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_PPC64:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRNTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRNTAKEN);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHER);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHERA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHEST);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHESTA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPMOD64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHER);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHERA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHEST);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHESTA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO_DS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHER);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHERA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHEST);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHESTA);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSGD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSLD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_LO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HA);
+ default:
+ break;
+ }
+ break;
+ case ELF::EM_S390:
+ switch (Type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_JMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16DBL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT16DBL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32DBL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32DBL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPCDBL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTENT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLTENT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LOAD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GDCALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDCALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IEENT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPMOD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_TPOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_20);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT20);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT20);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE20);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_IRELATIVE);
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ return "Unknown";
+}
+
+#undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME
+
+} // end namespace object
+} // end namespace llvm
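Aside: the LLVM_ELF_SWITCH_RELOC_TYPE_NAME macro above works by stringizing the
enumerator it is given, so each case returns the exact spelling of the
relocation constant. A minimal standalone sketch of the same pattern (the enum
and its values are made up for illustration):

#include <cstdio>

enum ExampleReloc { R_EXAMPLE_NONE = 0, R_EXAMPLE_ABS32 = 1 };

#define RELOC_NAME_CASE(value) \
  case value:                  \
    return #value;

static const char *relocName(unsigned Type) {
  switch (Type) {
    RELOC_NAME_CASE(R_EXAMPLE_NONE)
    RELOC_NAME_CASE(R_EXAMPLE_ABS32)
  default:
    return "Unknown";
  }
}

int main() {
  std::printf("%s\n", relocName(1)); // prints "R_EXAMPLE_ABS32"
  return 0;
}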
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index a6c128e..20b7307 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -11,13 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/MathExtras.h"
#include <ctype.h>
namespace llvm {
-
using namespace object;
// Creates an in-memory object-file by default: createELFObjectFile(Buffer)
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
index e530d3d..2f35cf9 100644
--- a/lib/Object/ELFYAML.cpp
+++ b/lib/Object/ELFYAML.cpp
@@ -266,6 +266,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
BCase(SHF_WRITE)
BCase(SHF_ALLOC)
+ BCase(SHF_EXCLUDE)
BCase(SHF_EXECINSTR)
BCase(SHF_MERGE)
BCase(SHF_STRINGS)
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 5d0399e..d2cb8bd 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -14,11 +14,11 @@
#include "llvm/Object/MachO.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
#include <cctype>
#include <cstring>
#include <limits>
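Aside: the MachOObjectFile.cpp hunks that follow mostly rename the fields
touched by the SwapStruct specializations (the macho::* types become MachO::*
types carrying their on-disk field names); the byte-swapping pattern itself is
unchanged -- swap every multi-byte field in place when the file's endianness
differs from the host's. A minimal standalone sketch with a made-up header
struct:

#include <cstdint>

static void SwapValue(uint32_t &V) {
  V = ((V & 0x000000ffu) << 24) | ((V & 0x0000ff00u) << 8) |
      ((V & 0x00ff0000u) >> 8)  | ((V & 0xff000000u) >> 24);
}

struct example_header { // hypothetical struct, for illustration only
  uint32_t magic;
  uint32_t ncmds;
};

static void SwapStruct(example_header &H) {
  SwapValue(H.magic); // swap each multi-byte field, one per line
  SwapValue(H.ncmds);
}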
@@ -29,16 +29,16 @@ using namespace object;
namespace llvm {
namespace object {
-struct SymbolTableEntryBase {
- uint32_t StringIndex;
- uint8_t Type;
- uint8_t SectionIndex;
- uint16_t Flags;
+struct nlist_base {
+ uint32_t n_strx;
+ uint8_t n_type;
+ uint8_t n_sect;
+ uint16_t n_desc;
};
-struct SectionBase {
- char Name[16];
- char SegmentName[16];
+struct section_base {
+ char sectname[16];
+ char segname[16];
};
template<typename T>
@@ -50,167 +50,174 @@ template<typename T>
static void SwapStruct(T &Value);
template<>
-void SwapStruct(macho::RelocationEntry &H) {
- SwapValue(H.Word0);
- SwapValue(H.Word1);
+void SwapStruct(MachO::any_relocation_info &H) {
+ SwapValue(H.r_word0);
+ SwapValue(H.r_word1);
}
template<>
-void SwapStruct(macho::LoadCommand &L) {
- SwapValue(L.Type);
- SwapValue(L.Size);
+void SwapStruct(MachO::load_command &L) {
+ SwapValue(L.cmd);
+ SwapValue(L.cmdsize);
}
template<>
-void SwapStruct(SymbolTableEntryBase &S) {
- SwapValue(S.StringIndex);
- SwapValue(S.Flags);
+void SwapStruct(nlist_base &S) {
+ SwapValue(S.n_strx);
+ SwapValue(S.n_desc);
}
template<>
-void SwapStruct(macho::Section &S) {
- SwapValue(S.Address);
- SwapValue(S.Size);
- SwapValue(S.Offset);
- SwapValue(S.Align);
- SwapValue(S.RelocationTableOffset);
- SwapValue(S.NumRelocationTableEntries);
- SwapValue(S.Flags);
- SwapValue(S.Reserved1);
- SwapValue(S.Reserved2);
+void SwapStruct(MachO::section &S) {
+ SwapValue(S.addr);
+ SwapValue(S.size);
+ SwapValue(S.offset);
+ SwapValue(S.align);
+ SwapValue(S.reloff);
+ SwapValue(S.nreloc);
+ SwapValue(S.flags);
+ SwapValue(S.reserved1);
+ SwapValue(S.reserved2);
}
template<>
-void SwapStruct(macho::Section64 &S) {
- SwapValue(S.Address);
- SwapValue(S.Size);
- SwapValue(S.Offset);
- SwapValue(S.Align);
- SwapValue(S.RelocationTableOffset);
- SwapValue(S.NumRelocationTableEntries);
- SwapValue(S.Flags);
- SwapValue(S.Reserved1);
- SwapValue(S.Reserved2);
- SwapValue(S.Reserved3);
+void SwapStruct(MachO::section_64 &S) {
+ SwapValue(S.addr);
+ SwapValue(S.size);
+ SwapValue(S.offset);
+ SwapValue(S.align);
+ SwapValue(S.reloff);
+ SwapValue(S.nreloc);
+ SwapValue(S.flags);
+ SwapValue(S.reserved1);
+ SwapValue(S.reserved2);
+ SwapValue(S.reserved3);
}
template<>
-void SwapStruct(macho::SymbolTableEntry &S) {
- SwapValue(S.StringIndex);
- SwapValue(S.Flags);
- SwapValue(S.Value);
+void SwapStruct(MachO::nlist &S) {
+ SwapValue(S.n_strx);
+ SwapValue(S.n_desc);
+ SwapValue(S.n_value);
}
template<>
-void SwapStruct(macho::Symbol64TableEntry &S) {
- SwapValue(S.StringIndex);
- SwapValue(S.Flags);
- SwapValue(S.Value);
+void SwapStruct(MachO::nlist_64 &S) {
+ SwapValue(S.n_strx);
+ SwapValue(S.n_desc);
+ SwapValue(S.n_value);
}
template<>
-void SwapStruct(macho::Header &H) {
- SwapValue(H.Magic);
- SwapValue(H.CPUType);
- SwapValue(H.CPUSubtype);
- SwapValue(H.FileType);
- SwapValue(H.NumLoadCommands);
- SwapValue(H.SizeOfLoadCommands);
- SwapValue(H.Flags);
+void SwapStruct(MachO::mach_header &H) {
+ SwapValue(H.magic);
+ SwapValue(H.cputype);
+ SwapValue(H.cpusubtype);
+ SwapValue(H.filetype);
+ SwapValue(H.ncmds);
+ SwapValue(H.sizeofcmds);
+ SwapValue(H.flags);
}
template<>
-void SwapStruct(macho::Header64Ext &E) {
- SwapValue(E.Reserved);
+void SwapStruct(MachO::mach_header_64 &H) {
+ SwapValue(H.magic);
+ SwapValue(H.cputype);
+ SwapValue(H.cpusubtype);
+ SwapValue(H.filetype);
+ SwapValue(H.ncmds);
+ SwapValue(H.sizeofcmds);
+ SwapValue(H.flags);
+ SwapValue(H.reserved);
}
template<>
-void SwapStruct(macho::SymtabLoadCommand &C) {
- SwapValue(C.Type);
- SwapValue(C.Size);
- SwapValue(C.SymbolTableOffset);
- SwapValue(C.NumSymbolTableEntries);
- SwapValue(C.StringTableOffset);
- SwapValue(C.StringTableSize);
+void SwapStruct(MachO::symtab_command &C) {
+ SwapValue(C.cmd);
+ SwapValue(C.cmdsize);
+ SwapValue(C.symoff);
+ SwapValue(C.nsyms);
+ SwapValue(C.stroff);
+ SwapValue(C.strsize);
}
template<>
-void SwapStruct(macho::DysymtabLoadCommand &C) {
- SwapValue(C.Type);
- SwapValue(C.Size);
- SwapValue(C.LocalSymbolsIndex);
- SwapValue(C.NumLocalSymbols);
- SwapValue(C.ExternalSymbolsIndex);
- SwapValue(C.NumExternalSymbols);
- SwapValue(C.UndefinedSymbolsIndex);
- SwapValue(C.NumUndefinedSymbols);
- SwapValue(C.TOCOffset);
- SwapValue(C.NumTOCEntries);
- SwapValue(C.ModuleTableOffset);
- SwapValue(C.NumModuleTableEntries);
- SwapValue(C.ReferenceSymbolTableOffset);
- SwapValue(C.NumReferencedSymbolTableEntries);
- SwapValue(C.IndirectSymbolTableOffset);
- SwapValue(C.NumIndirectSymbolTableEntries);
- SwapValue(C.ExternalRelocationTableOffset);
- SwapValue(C.NumExternalRelocationTableEntries);
- SwapValue(C.LocalRelocationTableOffset);
- SwapValue(C.NumLocalRelocationTableEntries);
+void SwapStruct(MachO::dysymtab_command &C) {
+ SwapValue(C.cmd);
+ SwapValue(C.cmdsize);
+ SwapValue(C.ilocalsym);
+ SwapValue(C.nlocalsym);
+ SwapValue(C.iextdefsym);
+ SwapValue(C.nextdefsym);
+ SwapValue(C.iundefsym);
+ SwapValue(C.nundefsym);
+ SwapValue(C.tocoff);
+ SwapValue(C.ntoc);
+ SwapValue(C.modtaboff);
+ SwapValue(C.nmodtab);
+ SwapValue(C.extrefsymoff);
+ SwapValue(C.nextrefsyms);
+ SwapValue(C.indirectsymoff);
+ SwapValue(C.nindirectsyms);
+ SwapValue(C.extreloff);
+ SwapValue(C.nextrel);
+ SwapValue(C.locreloff);
+ SwapValue(C.nlocrel);
}
template<>
-void SwapStruct(macho::LinkeditDataLoadCommand &C) {
- SwapValue(C.Type);
- SwapValue(C.Size);
- SwapValue(C.DataOffset);
- SwapValue(C.DataSize);
+void SwapStruct(MachO::linkedit_data_command &C) {
+ SwapValue(C.cmd);
+ SwapValue(C.cmdsize);
+ SwapValue(C.dataoff);
+ SwapValue(C.datasize);
}
template<>
-void SwapStruct(macho::SegmentLoadCommand &C) {
- SwapValue(C.Type);
- SwapValue(C.Size);
- SwapValue(C.VMAddress);
- SwapValue(C.VMSize);
- SwapValue(C.FileOffset);
- SwapValue(C.FileSize);
- SwapValue(C.MaxVMProtection);
- SwapValue(C.InitialVMProtection);
- SwapValue(C.NumSections);
- SwapValue(C.Flags);
+void SwapStruct(MachO::segment_command &C) {
+ SwapValue(C.cmd);
+ SwapValue(C.cmdsize);
+ SwapValue(C.vmaddr);
+ SwapValue(C.vmsize);
+ SwapValue(C.fileoff);
+ SwapValue(C.filesize);
+ SwapValue(C.maxprot);
+ SwapValue(C.initprot);
+ SwapValue(C.nsects);
+ SwapValue(C.flags);
}
template<>
-void SwapStruct(macho::Segment64LoadCommand &C) {
- SwapValue(C.Type);
- SwapValue(C.Size);
- SwapValue(C.VMAddress);
- SwapValue(C.VMSize);
- SwapValue(C.FileOffset);
- SwapValue(C.FileSize);
- SwapValue(C.MaxVMProtection);
- SwapValue(C.InitialVMProtection);
- SwapValue(C.NumSections);
- SwapValue(C.Flags);
+void SwapStruct(MachO::segment_command_64 &C) {
+ SwapValue(C.cmd);
+ SwapValue(C.cmdsize);
+ SwapValue(C.vmaddr);
+ SwapValue(C.vmsize);
+ SwapValue(C.fileoff);
+ SwapValue(C.filesize);
+ SwapValue(C.maxprot);
+ SwapValue(C.initprot);
+ SwapValue(C.nsects);
+ SwapValue(C.flags);
}
template<>
-void SwapStruct(macho::IndirectSymbolTableEntry &C) {
- SwapValue(C.Index);
+void SwapStruct(uint32_t &C) {
+ SwapValue(C);
}
template<>
-void SwapStruct(macho::LinkerOptionsLoadCommand &C) {
- SwapValue(C.Type);
- SwapValue(C.Size);
- SwapValue(C.Count);
+void SwapStruct(MachO::linker_options_command &C) {
+ SwapValue(C.cmd);
+ SwapValue(C.cmdsize);
+ SwapValue(C.count);
}
template<>
-void SwapStruct(macho::DataInCodeTableEntry &C) {
- SwapValue(C.Offset);
- SwapValue(C.Length);
- SwapValue(C.Kind);
+void SwapStruct(MachO::data_in_code_entry &C) {
+ SwapValue(C.offset);
+ SwapValue(C.length);
+ SwapValue(C.kind);
}
template<typename T>
@@ -226,11 +233,11 @@ static uint32_t
getSegmentLoadCommandNumSections(const MachOObjectFile *O,
const MachOObjectFile::LoadCommandInfo &L) {
if (O->is64Bit()) {
- macho::Segment64LoadCommand S = O->getSegment64LoadCommand(L);
- return S.NumSections;
+ MachO::segment_command_64 S = O->getSegment64LoadCommand(L);
+ return S.nsects;
}
- macho::SegmentLoadCommand S = O->getSegmentLoadCommand(L);
- return S.NumSections;
+ MachO::segment_command S = O->getSegmentLoadCommand(L);
+ return S.nsects;
}
static const char *
@@ -239,10 +246,10 @@ getSectionPtr(const MachOObjectFile *O, MachOObjectFile::LoadCommandInfo L,
uintptr_t CommandAddr = reinterpret_cast<uintptr_t>(L.Ptr);
bool Is64 = O->is64Bit();
- unsigned SegmentLoadSize = Is64 ? sizeof(macho::Segment64LoadCommand) :
- sizeof(macho::SegmentLoadCommand);
- unsigned SectionSize = Is64 ? sizeof(macho::Section64) :
- sizeof(macho::Section);
+ unsigned SegmentLoadSize = Is64 ? sizeof(MachO::segment_command_64) :
+ sizeof(MachO::segment_command);
+ unsigned SectionSize = Is64 ? sizeof(MachO::section_64) :
+ sizeof(MachO::section);
uintptr_t SectionAddr = CommandAddr + SegmentLoadSize + Sec * SectionSize;
return reinterpret_cast<const char*>(SectionAddr);
@@ -252,10 +259,10 @@ static const char *getPtr(const MachOObjectFile *O, size_t Offset) {
return O->getData().substr(Offset, 1).data();
}
-static SymbolTableEntryBase
+static nlist_base
getSymbolTableEntryBase(const MachOObjectFile *O, DataRefImpl DRI) {
const char *P = reinterpret_cast<const char *>(DRI.p);
- return getStruct<SymbolTableEntryBase>(O, P);
+ return getStruct<nlist_base>(O, P);
}
static StringRef parseSegmentOrSectionName(const char *P) {
@@ -283,11 +290,11 @@ static void advanceTo(T &it, size_t Val) {
}
static unsigned getCPUType(const MachOObjectFile *O) {
- return O->getHeader().CPUType;
+ return O->getHeader().cputype;
}
static void printRelocationTargetName(const MachOObjectFile *O,
- const macho::RelocationEntry &RE,
+ const MachO::any_relocation_info &RE,
raw_string_ostream &fmt) {
bool IsScattered = O->isRelocationScattered(RE);
@@ -355,59 +362,61 @@ static void printRelocationTargetName(const MachOObjectFile *O,
fmt << S;
}
-static uint32_t getPlainRelocationAddress(const macho::RelocationEntry &RE) {
- return RE.Word0;
+static uint32_t
+getPlainRelocationAddress(const MachO::any_relocation_info &RE) {
+ return RE.r_word0;
}
static unsigned
-getScatteredRelocationAddress(const macho::RelocationEntry &RE) {
- return RE.Word0 & 0xffffff;
+getScatteredRelocationAddress(const MachO::any_relocation_info &RE) {
+ return RE.r_word0 & 0xffffff;
}
static bool getPlainRelocationPCRel(const MachOObjectFile *O,
- const macho::RelocationEntry &RE) {
+ const MachO::any_relocation_info &RE) {
if (O->isLittleEndian())
- return (RE.Word1 >> 24) & 1;
- return (RE.Word1 >> 7) & 1;
+ return (RE.r_word1 >> 24) & 1;
+ return (RE.r_word1 >> 7) & 1;
}
static bool
getScatteredRelocationPCRel(const MachOObjectFile *O,
- const macho::RelocationEntry &RE) {
- return (RE.Word0 >> 30) & 1;
+ const MachO::any_relocation_info &RE) {
+ return (RE.r_word0 >> 30) & 1;
}
static unsigned getPlainRelocationLength(const MachOObjectFile *O,
- const macho::RelocationEntry &RE) {
+ const MachO::any_relocation_info &RE) {
if (O->isLittleEndian())
- return (RE.Word1 >> 25) & 3;
- return (RE.Word1 >> 5) & 3;
+ return (RE.r_word1 >> 25) & 3;
+ return (RE.r_word1 >> 5) & 3;
}
static unsigned
-getScatteredRelocationLength(const macho::RelocationEntry &RE) {
- return (RE.Word0 >> 28) & 3;
+getScatteredRelocationLength(const MachO::any_relocation_info &RE) {
+ return (RE.r_word0 >> 28) & 3;
}
static unsigned getPlainRelocationType(const MachOObjectFile *O,
- const macho::RelocationEntry &RE) {
+ const MachO::any_relocation_info &RE) {
if (O->isLittleEndian())
- return RE.Word1 >> 28;
- return RE.Word1 & 0xf;
+ return RE.r_word1 >> 28;
+ return RE.r_word1 & 0xf;
}
-static unsigned getScatteredRelocationType(const macho::RelocationEntry &RE) {
- return (RE.Word0 >> 24) & 0xf;
+static unsigned
+getScatteredRelocationType(const MachO::any_relocation_info &RE) {
+ return (RE.r_word0 >> 24) & 0xf;
}
static uint32_t getSectionFlags(const MachOObjectFile *O,
DataRefImpl Sec) {
if (O->is64Bit()) {
- macho::Section64 Sect = O->getSection64(Sec);
- return Sect.Flags;
+ MachO::section_64 Sect = O->getSection64(Sec);
+ return Sect.flags;
}
- macho::Section Sect = O->getSection(Sec);
- return Sect.Flags;
+ MachO::section Sect = O->getSection(Sec);
+ return Sect.flags;
}
MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
@@ -415,22 +424,22 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
error_code &ec)
: ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
SymtabLoadCmd(NULL), DysymtabLoadCmd(NULL), DataInCodeLoadCmd(NULL) {
- uint32_t LoadCommandCount = this->getHeader().NumLoadCommands;
- macho::LoadCommandType SegmentLoadType = is64Bit() ?
- macho::LCT_Segment64 : macho::LCT_Segment;
+ uint32_t LoadCommandCount = this->getHeader().ncmds;
+ MachO::LoadCommandType SegmentLoadType = is64Bit() ?
+ MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT;
MachOObjectFile::LoadCommandInfo Load = getFirstLoadCommandInfo();
for (unsigned I = 0; ; ++I) {
- if (Load.C.Type == macho::LCT_Symtab) {
+ if (Load.C.cmd == MachO::LC_SYMTAB) {
assert(!SymtabLoadCmd && "Multiple symbol tables");
SymtabLoadCmd = Load.Ptr;
- } else if (Load.C.Type == macho::LCT_Dysymtab) {
+ } else if (Load.C.cmd == MachO::LC_DYSYMTAB) {
assert(!DysymtabLoadCmd && "Multiple dynamic symbol tables");
DysymtabLoadCmd = Load.Ptr;
- } else if (Load.C.Type == macho::LCT_DataInCode) {
+ } else if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
assert(!DataInCodeLoadCmd && "Multiple data in code tables");
DataInCodeLoadCmd = Load.Ptr;
- } else if (Load.C.Type == SegmentLoadType) {
+ } else if (Load.C.cmd == SegmentLoadType) {
uint32_t NumSections = getSegmentLoadCommandNumSections(this, Load);
for (unsigned J = 0; J < NumSections; ++J) {
const char *Sec = getSectionPtr(this, Load, J);
@@ -448,8 +457,8 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
error_code MachOObjectFile::getSymbolNext(DataRefImpl Symb,
SymbolRef &Res) const {
unsigned SymbolTableEntrySize = is64Bit() ?
- sizeof(macho::Symbol64TableEntry) :
- sizeof(macho::SymbolTableEntry);
+ sizeof(MachO::nlist_64) :
+ sizeof(MachO::nlist);
Symb.p += SymbolTableEntrySize;
Res = SymbolRef(Symb, this);
return object_error::success;
@@ -458,8 +467,8 @@ error_code MachOObjectFile::getSymbolNext(DataRefImpl Symb,
error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
StringRef &Res) const {
StringRef StringTable = getStringTableData();
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
- const char *Start = &StringTable.data()[Entry.StringIndex];
+ nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+ const char *Start = &StringTable.data()[Entry.n_strx];
Res = StringRef(Start);
return object_error::success;
}
@@ -467,11 +476,11 @@ error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
uint64_t &Res) const {
if (is64Bit()) {
- macho::Symbol64TableEntry Entry = getSymbol64TableEntry(Symb);
- Res = Entry.Value;
+ MachO::nlist_64 Entry = getSymbol64TableEntry(Symb);
+ Res = Entry.n_value;
} else {
- macho::SymbolTableEntry Entry = getSymbolTableEntry(Symb);
- Res = Entry.Value;
+ MachO::nlist Entry = getSymbolTableEntry(Symb);
+ Res = Entry.n_value;
}
return object_error::success;
}
@@ -479,18 +488,18 @@ error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
error_code
MachOObjectFile::getSymbolFileOffset(DataRefImpl Symb,
uint64_t &Res) const {
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
+ nlist_base Entry = getSymbolTableEntryBase(this, Symb);
getSymbolAddress(Symb, Res);
- if (Entry.SectionIndex) {
+ if (Entry.n_sect) {
uint64_t Delta;
DataRefImpl SecRel;
- SecRel.d.a = Entry.SectionIndex-1;
+ SecRel.d.a = Entry.n_sect-1;
if (is64Bit()) {
- macho::Section64 Sec = getSection64(SecRel);
- Delta = Sec.Offset - Sec.Address;
+ MachO::section_64 Sec = getSection64(SecRel);
+ Delta = Sec.offset - Sec.addr;
} else {
- macho::Section Sec = getSection(SecRel);
- Delta = Sec.Offset - Sec.Address;
+ MachO::section Sec = getSection(SecRel);
+ Delta = Sec.offset - Sec.addr;
}
Res += Delta;
@@ -504,8 +513,8 @@ error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI,
uint32_t flags;
this->getSymbolFlags(DRI, flags);
if (flags & SymbolRef::SF_Common) {
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
- Result = 1 << MachO::GET_COMM_ALIGN(Entry.Flags);
+ nlist_base Entry = getSymbolTableEntryBase(this, DRI);
+ Result = 1 << MachO::GET_COMM_ALIGN(Entry.n_desc);
} else {
Result = 0;
}
@@ -518,13 +527,13 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
uint64_t EndOffset = 0;
uint8_t SectionIndex;
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
+ nlist_base Entry = getSymbolTableEntryBase(this, DRI);
uint64_t Value;
getSymbolAddress(DRI, Value);
BeginOffset = Value;
- SectionIndex = Entry.SectionIndex;
+ SectionIndex = Entry.n_sect;
if (!SectionIndex) {
uint32_t flags = SymbolRef::SF_None;
this->getSymbolFlags(DRI, flags);
@@ -542,7 +551,7 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
DataRefImpl DRI = I->getRawDataRefImpl();
Entry = getSymbolTableEntryBase(this, DRI);
getSymbolAddress(DRI, Value);
- if (Entry.SectionIndex == SectionIndex && Value > BeginOffset)
+ if (Entry.n_sect == SectionIndex && Value > BeginOffset)
if (!EndOffset || Value < EndOffset)
EndOffset = Value;
}
@@ -560,73 +569,47 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
SymbolRef::Type &Res) const {
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
- uint8_t n_type = Entry.Type;
+ nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+ uint8_t n_type = Entry.n_type;
Res = SymbolRef::ST_Other;
// If this is a STAB debugging symbol, we can do nothing more.
- if (n_type & MachO::NlistMaskStab) {
+ if (n_type & MachO::N_STAB) {
Res = SymbolRef::ST_Debug;
return object_error::success;
}
- switch (n_type & MachO::NlistMaskType) {
- case MachO::NListTypeUndefined :
+ switch (n_type & MachO::N_TYPE) {
+ case MachO::N_UNDF :
Res = SymbolRef::ST_Unknown;
break;
- case MachO::NListTypeSection :
+ case MachO::N_SECT :
Res = SymbolRef::ST_Function;
break;
}
return object_error::success;
}
-error_code MachOObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
- char &Res) const {
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
- uint8_t Type = Entry.Type;
- uint16_t Flags = Entry.Flags;
-
- char Char;
- switch (Type & macho::STF_TypeMask) {
- case macho::STT_Undefined:
- Char = 'u';
- break;
- case macho::STT_Absolute:
- case macho::STT_Section:
- Char = 's';
- break;
- default:
- Char = '?';
- break;
- }
-
- if (Flags & (macho::STF_External | macho::STF_PrivateExtern))
- Char = toupper(static_cast<unsigned char>(Char));
- Res = Char;
- return object_error::success;
-}
-
error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
uint32_t &Result) const {
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
+ nlist_base Entry = getSymbolTableEntryBase(this, DRI);
- uint8_t MachOType = Entry.Type;
- uint16_t MachOFlags = Entry.Flags;
+ uint8_t MachOType = Entry.n_type;
+ uint16_t MachOFlags = Entry.n_desc;
// TODO: Correctly set SF_ThreadLocal
Result = SymbolRef::SF_None;
- if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined)
+ if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF)
Result |= SymbolRef::SF_Undefined;
- if (MachOFlags & macho::STF_StabsEntryMask)
+ if (MachOType & MachO::N_STAB)
Result |= SymbolRef::SF_FormatSpecific;
- if (MachOType & MachO::NlistMaskExternal) {
+ if (MachOType & MachO::N_EXT) {
Result |= SymbolRef::SF_Global;
- if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined) {
+ if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) {
uint64_t Value;
getSymbolAddress(DRI, Value);
if (Value)
@@ -634,10 +617,10 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
}
}
- if (MachOFlags & (MachO::NListDescWeakRef | MachO::NListDescWeakDef))
+ if (MachOFlags & (MachO::N_WEAK_REF | MachO::N_WEAK_DEF))
Result |= SymbolRef::SF_Weak;
- if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeAbsolute)
+ if ((MachOType & MachO::N_TYPE) == MachO::N_ABS)
Result |= SymbolRef::SF_Absolute;
return object_error::success;
@@ -646,8 +629,8 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
error_code
MachOObjectFile::getSymbolSection(DataRefImpl Symb,
section_iterator &Res) const {
- SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
- uint8_t index = Entry.SectionIndex;
+ nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+ uint8_t index = Entry.n_sect;
if (index == 0) {
Res = end_sections();
@@ -682,11 +665,11 @@ MachOObjectFile::getSectionName(DataRefImpl Sec, StringRef &Result) const {
error_code
MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
if (is64Bit()) {
- macho::Section64 Sect = getSection64(Sec);
- Res = Sect.Address;
+ MachO::section_64 Sect = getSection64(Sec);
+ Res = Sect.addr;
} else {
- macho::Section Sect = getSection(Sec);
- Res = Sect.Address;
+ MachO::section Sect = getSection(Sec);
+ Res = Sect.addr;
}
return object_error::success;
}
@@ -694,11 +677,11 @@ MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
error_code
MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const {
if (is64Bit()) {
- macho::Section64 Sect = getSection64(Sec);
- Res = Sect.Size;
+ MachO::section_64 Sect = getSection64(Sec);
+ Res = Sect.size;
} else {
- macho::Section Sect = getSection(Sec);
- Res = Sect.Size;
+ MachO::section Sect = getSection(Sec);
+ Res = Sect.size;
}
return object_error::success;
@@ -710,13 +693,13 @@ MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const {
uint64_t Size;
if (is64Bit()) {
- macho::Section64 Sect = getSection64(Sec);
- Offset = Sect.Offset;
- Size = Sect.Size;
+ MachO::section_64 Sect = getSection64(Sec);
+ Offset = Sect.offset;
+ Size = Sect.size;
} else {
- macho::Section Sect =getSection(Sec);
- Offset = Sect.Offset;
- Size = Sect.Size;
+ MachO::section Sect = getSection(Sec);
+ Offset = Sect.offset;
+ Size = Sect.size;
}
Res = this->getData().substr(Offset, Size);
@@ -727,11 +710,11 @@ error_code
MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
uint32_t Align;
if (is64Bit()) {
- macho::Section64 Sect = getSection64(Sec);
- Align = Sect.Align;
+ MachO::section_64 Sect = getSection64(Sec);
+ Align = Sect.align;
} else {
- macho::Section Sect = getSection(Sec);
- Align = Sect.Align;
+ MachO::section Sect = getSection(Sec);
+ Align = Sect.align;
}
Res = uint64_t(1) << Align;
@@ -741,7 +724,7 @@ MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
error_code
MachOObjectFile::isSectionText(DataRefImpl Sec, bool &Res) const {
uint32_t Flags = getSectionFlags(this, Sec);
- Res = Flags & macho::SF_PureInstructions;
+ Res = Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
return object_error::success;
}
@@ -775,9 +758,9 @@ error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
error_code
MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const {
uint32_t Flags = getSectionFlags(this, Sec);
- unsigned SectionType = Flags & MachO::SectionFlagMaskSectionType;
- Res = SectionType == MachO::SectionTypeZeroFill ||
- SectionType == MachO::SectionTypeZeroFillLarge;
+ unsigned SectionType = Flags & MachO::SECTION_TYPE;
+ Res = SectionType == MachO::S_ZEROFILL ||
+ SectionType == MachO::S_GB_ZEROFILL;
return object_error::success;
}
@@ -814,14 +797,14 @@ MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
return object_error::success;
}
-relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const {
uint32_t Offset;
if (is64Bit()) {
- macho::Section64 Sect = getSection64(Sec);
- Offset = Sect.RelocationTableOffset;
+ MachO::section_64 Sect = getSection64(Sec);
+ Offset = Sect.reloff;
} else {
- macho::Section Sect = getSection(Sec);
- Offset = Sect.RelocationTableOffset;
+ MachO::section Sect = getSection(Sec);
+ Offset = Sect.reloff;
}
DataRefImpl Ret;
@@ -830,21 +813,21 @@ relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
}
relocation_iterator
-MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+MachOObjectFile::section_rel_end(DataRefImpl Sec) const {
uint32_t Offset;
uint32_t Num;
if (is64Bit()) {
- macho::Section64 Sect = getSection64(Sec);
- Offset = Sect.RelocationTableOffset;
- Num = Sect.NumRelocationTableEntries;
+ MachO::section_64 Sect = getSection64(Sec);
+ Offset = Sect.reloff;
+ Num = Sect.nreloc;
} else {
- macho::Section Sect = getSection(Sec);
- Offset = Sect.RelocationTableOffset;
- Num = Sect.NumRelocationTableEntries;
+ MachO::section Sect = getSection(Sec);
+ Offset = Sect.reloff;
+ Num = Sect.nreloc;
}
- const macho::RelocationEntry *P =
- reinterpret_cast<const macho::RelocationEntry*>(getPtr(this, Offset));
+ const MachO::any_relocation_info *P =
+ reinterpret_cast<const MachO::any_relocation_info *>(getPtr(this, Offset));
DataRefImpl Ret;
Ret.p = reinterpret_cast<uintptr_t>(P + Num);
@@ -853,8 +836,8 @@ MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
error_code MachOObjectFile::getRelocationNext(DataRefImpl Rel,
RelocationRef &Res) const {
- const macho::RelocationEntry *P =
- reinterpret_cast<const macho::RelocationEntry *>(Rel.p);
+ const MachO::any_relocation_info *P =
+ reinterpret_cast<const MachO::any_relocation_info *>(Rel.p);
Rel.p = reinterpret_cast<uintptr_t>(P + 1);
Res = RelocationRef(Rel, this);
return object_error::success;
@@ -867,24 +850,24 @@ MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const {
error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
uint64_t &Res) const {
- macho::RelocationEntry RE = getRelocation(Rel);
+ MachO::any_relocation_info RE = getRelocation(Rel);
Res = getAnyRelocationAddress(RE);
return object_error::success;
}
symbol_iterator
MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
- macho::RelocationEntry RE = getRelocation(Rel);
+ MachO::any_relocation_info RE = getRelocation(Rel);
uint32_t SymbolIdx = getPlainRelocationSymbolNum(RE);
bool isExtern = getPlainRelocationExternal(RE);
if (!isExtern)
return end_symbols();
- macho::SymtabLoadCommand S = getSymtabLoadCommand();
+ MachO::symtab_command S = getSymtabLoadCommand();
unsigned SymbolTableEntrySize = is64Bit() ?
- sizeof(macho::Symbol64TableEntry) :
- sizeof(macho::SymbolTableEntry);
- uint64_t Offset = S.SymbolTableOffset + SymbolIdx * SymbolTableEntrySize;
+ sizeof(MachO::nlist_64) :
+ sizeof(MachO::nlist);
+ uint64_t Offset = S.symoff + SymbolIdx * SymbolTableEntrySize;
DataRefImpl Sym;
Sym.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
return symbol_iterator(SymbolRef(Sym, this));
@@ -892,7 +875,7 @@ MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
uint64_t &Res) const {
- macho::RelocationEntry RE = getRelocation(Rel);
+ MachO::any_relocation_info RE = getRelocation(Rel);
Res = getAnyRelocationType(RE);
return object_error::success;
}
@@ -993,7 +976,7 @@ MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
error_code
MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
SmallVectorImpl<char> &Result) const {
- macho::RelocationEntry RE = getRelocation(Rel);
+ MachO::any_relocation_info RE = getRelocation(Rel);
unsigned Arch = this->getArch();
@@ -1010,47 +993,47 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
bool isPCRel = getAnyRelocationPCRel(RE);
switch (Type) {
- case macho::RIT_X86_64_GOTLoad: // X86_64_RELOC_GOT_LOAD
- case macho::RIT_X86_64_GOT: { // X86_64_RELOC_GOT
+ case MachO::X86_64_RELOC_GOT_LOAD:
+ case MachO::X86_64_RELOC_GOT: {
printRelocationTargetName(this, RE, fmt);
fmt << "@GOT";
if (isPCRel) fmt << "PCREL";
break;
}
- case macho::RIT_X86_64_Subtractor: { // X86_64_RELOC_SUBTRACTOR
+ case MachO::X86_64_RELOC_SUBTRACTOR: {
DataRefImpl RelNext = Rel;
RelNext.d.a++;
- macho::RelocationEntry RENext = getRelocation(RelNext);
+ MachO::any_relocation_info RENext = getRelocation(RelNext);
- // X86_64_SUBTRACTOR must be followed by a relocation of type
+ // X86_64_RELOC_SUBTRACTOR must be followed by a relocation of type
// X86_64_RELOC_UNSIGNED.
// NOTE: Scattered relocations don't exist on x86_64.
unsigned RType = getAnyRelocationType(RENext);
- if (RType != 0)
+ if (RType != MachO::X86_64_RELOC_UNSIGNED)
report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
"X86_64_RELOC_SUBTRACTOR.");
- // The X86_64_RELOC_UNSIGNED contains the minuend symbol,
- // X86_64_SUBTRACTOR contains to the subtrahend.
+ // The X86_64_RELOC_UNSIGNED contains the minuend symbol;
+ // X86_64_RELOC_SUBTRACTOR contains the subtrahend.
printRelocationTargetName(this, RENext, fmt);
fmt << "-";
printRelocationTargetName(this, RE, fmt);
break;
}
- case macho::RIT_X86_64_TLV:
+ case MachO::X86_64_RELOC_TLV:
printRelocationTargetName(this, RE, fmt);
fmt << "@TLV";
if (isPCRel) fmt << "P";
break;
- case macho::RIT_X86_64_Signed1: // X86_64_RELOC_SIGNED1
+ case MachO::X86_64_RELOC_SIGNED_1:
printRelocationTargetName(this, RE, fmt);
fmt << "-1";
break;
- case macho::RIT_X86_64_Signed2: // X86_64_RELOC_SIGNED2
+ case MachO::X86_64_RELOC_SIGNED_2:
printRelocationTargetName(this, RE, fmt);
fmt << "-2";
break;
- case macho::RIT_X86_64_Signed4: // X86_64_RELOC_SIGNED4
+ case MachO::X86_64_RELOC_SIGNED_4:
printRelocationTargetName(this, RE, fmt);
fmt << "-4";
break;
@@ -1059,21 +1042,22 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
break;
}
// X86 and ARM share some relocation types in common.
- } else if (Arch == Triple::x86 || Arch == Triple::arm) {
+ } else if (Arch == Triple::x86 || Arch == Triple::arm ||
+ Arch == Triple::ppc) {
// Generic relocation types...
switch (Type) {
- case macho::RIT_Pair: // GENERIC_RELOC_PAIR - prints no info
+ case MachO::GENERIC_RELOC_PAIR: // prints no info
return object_error::success;
- case macho::RIT_Difference: { // GENERIC_RELOC_SECTDIFF
+ case MachO::GENERIC_RELOC_SECTDIFF: {
DataRefImpl RelNext = Rel;
RelNext.d.a++;
- macho::RelocationEntry RENext = getRelocation(RelNext);
+ MachO::any_relocation_info RENext = getRelocation(RelNext);
// X86 sect diff's must be followed by a relocation of type
// GENERIC_RELOC_PAIR.
unsigned RType = getAnyRelocationType(RENext);
- if (RType != 1)
+ if (RType != MachO::GENERIC_RELOC_PAIR)
report_fatal_error("Expected GENERIC_RELOC_PAIR after "
"GENERIC_RELOC_SECTDIFF.");
@@ -1084,19 +1068,17 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
}
}
- if (Arch == Triple::x86) {
- // All X86 relocations that need special printing were already
- // handled in the generic code.
+ if (Arch == Triple::x86 || Arch == Triple::ppc) {
switch (Type) {
- case macho::RIT_Generic_LocalDifference:{// GENERIC_RELOC_LOCAL_SECTDIFF
+ case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: {
DataRefImpl RelNext = Rel;
RelNext.d.a++;
- macho::RelocationEntry RENext = getRelocation(RelNext);
+ MachO::any_relocation_info RENext = getRelocation(RelNext);
// X86 sect diff's must be followed by a relocation of type
// GENERIC_RELOC_PAIR.
unsigned RType = getAnyRelocationType(RENext);
- if (RType != 1)
+ if (RType != MachO::GENERIC_RELOC_PAIR)
report_fatal_error("Expected GENERIC_RELOC_PAIR after "
"GENERIC_RELOC_LOCAL_SECTDIFF.");
@@ -1105,7 +1087,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
printRelocationTargetName(this, RENext, fmt);
break;
}
- case macho::RIT_Generic_TLV: {
+ case MachO::GENERIC_RELOC_TLV: {
printRelocationTargetName(this, RE, fmt);
fmt << "@TLV";
if (IsPCRel) fmt << "P";
@@ -1116,8 +1098,8 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
}
} else { // ARM-specific relocations
switch (Type) {
- case macho::RIT_ARM_Half: // ARM_RELOC_HALF
- case macho::RIT_ARM_HalfDifference: { // ARM_RELOC_HALF_SECTDIFF
+ case MachO::ARM_RELOC_HALF:
+ case MachO::ARM_RELOC_HALF_SECTDIFF: {
// Half relocations steal a bit from the length field to encode
// whether this is an upper16 or a lower16 relocation.
bool isUpper = getAnyRelocationLength(RE) >> 1;
@@ -1130,14 +1112,14 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
DataRefImpl RelNext = Rel;
RelNext.d.a++;
- macho::RelocationEntry RENext = getRelocation(RelNext);
+ MachO::any_relocation_info RENext = getRelocation(RelNext);
// ARM half relocs must be followed by a relocation of type
// ARM_RELOC_PAIR.
unsigned RType = getAnyRelocationType(RENext);
- if (RType != 1)
+ if (RType != MachO::ARM_RELOC_PAIR)
report_fatal_error("Expected ARM_RELOC_PAIR after "
- "GENERIC_RELOC_HALF");
+ "ARM_RELOC_HALF");
// NOTE: The half of the target virtual address is stashed in the
// address field of the secondary relocation, but we can't reverse
@@ -1146,7 +1128,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
// ARM_RELOC_HALF_SECTDIFF encodes the second section in the
// symbol/section pointer of the follow-on relocation.
- if (Type == macho::RIT_ARM_HalfDifference) {
+ if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
fmt << "-";
printRelocationTargetName(this, RENext, fmt);
}
@@ -1177,17 +1159,17 @@ MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const {
// On arches that use the generic relocations, GENERIC_RELOC_PAIR
// is always hidden.
- if (Arch == Triple::x86 || Arch == Triple::arm) {
- if (Type == macho::RIT_Pair) Result = true;
+ if (Arch == Triple::x86 || Arch == Triple::arm || Arch == Triple::ppc) {
+ if (Type == MachO::GENERIC_RELOC_PAIR) Result = true;
} else if (Arch == Triple::x86_64) {
// On x86_64, X86_64_RELOC_UNSIGNED is hidden only when it follows
// an X86_64_RELOC_SUBTRACTOR.
- if (Type == macho::RIT_X86_64_Unsigned && Rel.d.a > 0) {
+ if (Type == MachO::X86_64_RELOC_UNSIGNED && Rel.d.a > 0) {
DataRefImpl RelPrev = Rel;
RelPrev.d.a--;
uint64_t PrevType;
getRelocationType(RelPrev, PrevType);
- if (PrevType == macho::RIT_X86_64_Subtractor)
+ if (PrevType == MachO::X86_64_RELOC_SUBTRACTOR)
Result = true;
}
}
@@ -1210,8 +1192,8 @@ symbol_iterator MachOObjectFile::begin_symbols() const {
if (!SymtabLoadCmd)
return symbol_iterator(SymbolRef(DRI, this));
- macho::SymtabLoadCommand Symtab = getSymtabLoadCommand();
- DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.SymbolTableOffset));
+ MachO::symtab_command Symtab = getSymtabLoadCommand();
+ DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.symoff));
return symbol_iterator(SymbolRef(DRI, this));
}
@@ -1220,12 +1202,12 @@ symbol_iterator MachOObjectFile::end_symbols() const {
if (!SymtabLoadCmd)
return symbol_iterator(SymbolRef(DRI, this));
- macho::SymtabLoadCommand Symtab = getSymtabLoadCommand();
+ MachO::symtab_command Symtab = getSymtabLoadCommand();
unsigned SymbolTableEntrySize = is64Bit() ?
- sizeof(macho::Symbol64TableEntry) :
- sizeof(macho::SymbolTableEntry);
- unsigned Offset = Symtab.SymbolTableOffset +
- Symtab.NumSymbolTableEntries * SymbolTableEntrySize;
+ sizeof(MachO::nlist_64) :
+ sizeof(MachO::nlist);
+ unsigned Offset = Symtab.symoff +
+ Symtab.nsyms * SymbolTableEntrySize;
DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
return symbol_iterator(SymbolRef(DRI, this));
}
@@ -1269,28 +1251,28 @@ StringRef MachOObjectFile::getFileFormatName() const {
unsigned CPUType = getCPUType(this);
if (!is64Bit()) {
switch (CPUType) {
- case llvm::MachO::CPUTypeI386:
+ case llvm::MachO::CPU_TYPE_I386:
return "Mach-O 32-bit i386";
- case llvm::MachO::CPUTypeARM:
+ case llvm::MachO::CPU_TYPE_ARM:
return "Mach-O arm";
- case llvm::MachO::CPUTypePowerPC:
+ case llvm::MachO::CPU_TYPE_POWERPC:
return "Mach-O 32-bit ppc";
default:
- assert((CPUType & llvm::MachO::CPUArchABI64) == 0 &&
+ assert((CPUType & llvm::MachO::CPU_ARCH_ABI64) == 0 &&
"64-bit object file when we're not 64-bit?");
return "Mach-O 32-bit unknown";
}
}
// Make sure the cpu type has the correct mask.
- assert((CPUType & llvm::MachO::CPUArchABI64)
- == llvm::MachO::CPUArchABI64 &&
+ assert((CPUType & llvm::MachO::CPU_ARCH_ABI64)
+ == llvm::MachO::CPU_ARCH_ABI64 &&
"32-bit object file when we're 64-bit?");
switch (CPUType) {
- case llvm::MachO::CPUTypeX86_64:
+ case llvm::MachO::CPU_TYPE_X86_64:
return "Mach-O 64-bit x86-64";
- case llvm::MachO::CPUTypePowerPC64:
+ case llvm::MachO::CPU_TYPE_POWERPC64:
return "Mach-O 64-bit ppc64";
default:
return "Mach-O 64-bit unknown";
@@ -1299,15 +1281,15 @@ StringRef MachOObjectFile::getFileFormatName() const {
Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
switch (CPUType) {
- case llvm::MachO::CPUTypeI386:
+ case llvm::MachO::CPU_TYPE_I386:
return Triple::x86;
- case llvm::MachO::CPUTypeX86_64:
+ case llvm::MachO::CPU_TYPE_X86_64:
return Triple::x86_64;
- case llvm::MachO::CPUTypeARM:
+ case llvm::MachO::CPU_TYPE_ARM:
return Triple::arm;
- case llvm::MachO::CPUTypePowerPC:
+ case llvm::MachO::CPU_TYPE_POWERPC:
return Triple::ppc;
- case llvm::MachO::CPUTypePowerPC64:
+ case llvm::MachO::CPU_TYPE_POWERPC64:
return Triple::ppc64;
default:
return Triple::UnknownArch;
@@ -1323,16 +1305,16 @@ StringRef MachOObjectFile::getLoadName() const {
report_fatal_error("get_load_name() unimplemented in MachOObjectFile");
}
-relocation_iterator MachOObjectFile::getSectionRelBegin(unsigned Index) const {
+relocation_iterator MachOObjectFile::section_rel_begin(unsigned Index) const {
DataRefImpl DRI;
DRI.d.a = Index;
- return getSectionRelBegin(DRI);
+ return section_rel_begin(DRI);
}
-relocation_iterator MachOObjectFile::getSectionRelEnd(unsigned Index) const {
+relocation_iterator MachOObjectFile::section_rel_end(unsigned Index) const {
DataRefImpl DRI;
DRI.d.a = Index;
- return getSectionRelEnd(DRI);
+ return section_rel_end(DRI);
}
dice_iterator MachOObjectFile::begin_dices() const {
@@ -1340,8 +1322,8 @@ dice_iterator MachOObjectFile::begin_dices() const {
if (!DataInCodeLoadCmd)
return dice_iterator(DiceRef(DRI, this));
- macho::LinkeditDataLoadCommand DicLC = getDataInCodeLoadCommand();
- DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, DicLC.DataOffset));
+ MachO::linkedit_data_command DicLC = getDataInCodeLoadCommand();
+ DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, DicLC.dataoff));
return dice_iterator(DiceRef(DRI, this));
}
@@ -1350,8 +1332,8 @@ dice_iterator MachOObjectFile::end_dices() const {
if (!DataInCodeLoadCmd)
return dice_iterator(DiceRef(DRI, this));
- macho::LinkeditDataLoadCommand DicLC = getDataInCodeLoadCommand();
- unsigned Offset = DicLC.DataOffset + DicLC.DataSize;
+ MachO::linkedit_data_command DicLC = getDataInCodeLoadCommand();
+ unsigned Offset = DicLC.dataoff + DicLC.datasize;
DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
return dice_iterator(DiceRef(DRI, this));
}
@@ -1364,80 +1346,82 @@ MachOObjectFile::getSectionFinalSegmentName(DataRefImpl Sec) const {
ArrayRef<char>
MachOObjectFile::getSectionRawName(DataRefImpl Sec) const {
- const SectionBase *Base =
- reinterpret_cast<const SectionBase*>(Sections[Sec.d.a]);
- return ArrayRef<char>(Base->Name);
+ const section_base *Base =
+ reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
+ return ArrayRef<char>(Base->sectname);
}
ArrayRef<char>
MachOObjectFile::getSectionRawFinalSegmentName(DataRefImpl Sec) const {
- const SectionBase *Base =
- reinterpret_cast<const SectionBase*>(Sections[Sec.d.a]);
- return ArrayRef<char>(Base->SegmentName);
+ const section_base *Base =
+ reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
+ return ArrayRef<char>(Base->segname);
}
bool
-MachOObjectFile::isRelocationScattered(const macho::RelocationEntry &RE)
+MachOObjectFile::isRelocationScattered(const MachO::any_relocation_info &RE)
const {
- if (getCPUType(this) == llvm::MachO::CPUTypeX86_64)
+ if (getCPUType(this) == MachO::CPU_TYPE_X86_64)
return false;
- return getPlainRelocationAddress(RE) & macho::RF_Scattered;
+ return getPlainRelocationAddress(RE) & MachO::R_SCATTERED;
}
unsigned MachOObjectFile::getPlainRelocationSymbolNum(
- const macho::RelocationEntry &RE) const {
+ const MachO::any_relocation_info &RE) const {
if (isLittleEndian())
- return RE.Word1 & 0xffffff;
- return RE.Word1 >> 8;
+ return RE.r_word1 & 0xffffff;
+ return RE.r_word1 >> 8;
}
bool MachOObjectFile::getPlainRelocationExternal(
- const macho::RelocationEntry &RE) const {
+ const MachO::any_relocation_info &RE) const {
if (isLittleEndian())
- return (RE.Word1 >> 27) & 1;
- return (RE.Word1 >> 4) & 1;
+ return (RE.r_word1 >> 27) & 1;
+ return (RE.r_word1 >> 4) & 1;
}
bool MachOObjectFile::getScatteredRelocationScattered(
- const macho::RelocationEntry &RE) const {
- return RE.Word0 >> 31;
+ const MachO::any_relocation_info &RE) const {
+ return RE.r_word0 >> 31;
}
uint32_t MachOObjectFile::getScatteredRelocationValue(
- const macho::RelocationEntry &RE) const {
- return RE.Word1;
+ const MachO::any_relocation_info &RE) const {
+ return RE.r_word1;
}
unsigned MachOObjectFile::getAnyRelocationAddress(
- const macho::RelocationEntry &RE) const {
+ const MachO::any_relocation_info &RE) const {
if (isRelocationScattered(RE))
return getScatteredRelocationAddress(RE);
return getPlainRelocationAddress(RE);
}
-unsigned
-MachOObjectFile::getAnyRelocationPCRel(const macho::RelocationEntry &RE) const {
+unsigned MachOObjectFile::getAnyRelocationPCRel(
+ const MachO::any_relocation_info &RE) const {
if (isRelocationScattered(RE))
return getScatteredRelocationPCRel(this, RE);
return getPlainRelocationPCRel(this, RE);
}
unsigned MachOObjectFile::getAnyRelocationLength(
- const macho::RelocationEntry &RE) const {
+ const MachO::any_relocation_info &RE) const {
if (isRelocationScattered(RE))
return getScatteredRelocationLength(RE);
return getPlainRelocationLength(this, RE);
}
unsigned
-MachOObjectFile::getAnyRelocationType(const macho::RelocationEntry &RE) const {
+MachOObjectFile::getAnyRelocationType(
+ const MachO::any_relocation_info &RE) const {
if (isRelocationScattered(RE))
return getScatteredRelocationType(RE);
return getPlainRelocationType(this, RE);
}
SectionRef
-MachOObjectFile::getRelocationSection(const macho::RelocationEntry &RE) const {
+MachOObjectFile::getRelocationSection(
+ const MachO::any_relocation_info &RE) const {
if (isRelocationScattered(RE) || getPlainRelocationExternal(RE))
return *end_sections();
unsigned SecNum = getPlainRelocationSymbolNum(RE) - 1;
@@ -1450,133 +1434,132 @@ MachOObjectFile::LoadCommandInfo
MachOObjectFile::getFirstLoadCommandInfo() const {
MachOObjectFile::LoadCommandInfo Load;
- unsigned HeaderSize = is64Bit() ? macho::Header64Size : macho::Header32Size;
+ unsigned HeaderSize = is64Bit() ? sizeof(MachO::mach_header_64) :
+ sizeof(MachO::mach_header);
Load.Ptr = getPtr(this, HeaderSize);
- Load.C = getStruct<macho::LoadCommand>(this, Load.Ptr);
+ Load.C = getStruct<MachO::load_command>(this, Load.Ptr);
return Load;
}
MachOObjectFile::LoadCommandInfo
MachOObjectFile::getNextLoadCommandInfo(const LoadCommandInfo &L) const {
MachOObjectFile::LoadCommandInfo Next;
- Next.Ptr = L.Ptr + L.C.Size;
- Next.C = getStruct<macho::LoadCommand>(this, Next.Ptr);
+ Next.Ptr = L.Ptr + L.C.cmdsize;
+ Next.C = getStruct<MachO::load_command>(this, Next.Ptr);
return Next;
}
-macho::Section MachOObjectFile::getSection(DataRefImpl DRI) const {
- return getStruct<macho::Section>(this, Sections[DRI.d.a]);
+MachO::section MachOObjectFile::getSection(DataRefImpl DRI) const {
+ return getStruct<MachO::section>(this, Sections[DRI.d.a]);
}
-macho::Section64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
- return getStruct<macho::Section64>(this, Sections[DRI.d.a]);
+MachO::section_64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
+ return getStruct<MachO::section_64>(this, Sections[DRI.d.a]);
}
-macho::Section MachOObjectFile::getSection(const LoadCommandInfo &L,
+MachO::section MachOObjectFile::getSection(const LoadCommandInfo &L,
unsigned Index) const {
const char *Sec = getSectionPtr(this, L, Index);
- return getStruct<macho::Section>(this, Sec);
+ return getStruct<MachO::section>(this, Sec);
}
-macho::Section64 MachOObjectFile::getSection64(const LoadCommandInfo &L,
- unsigned Index) const {
+MachO::section_64 MachOObjectFile::getSection64(const LoadCommandInfo &L,
+ unsigned Index) const {
const char *Sec = getSectionPtr(this, L, Index);
- return getStruct<macho::Section64>(this, Sec);
+ return getStruct<MachO::section_64>(this, Sec);
}
-macho::SymbolTableEntry
+MachO::nlist
MachOObjectFile::getSymbolTableEntry(DataRefImpl DRI) const {
const char *P = reinterpret_cast<const char *>(DRI.p);
- return getStruct<macho::SymbolTableEntry>(this, P);
+ return getStruct<MachO::nlist>(this, P);
}
-macho::Symbol64TableEntry
+MachO::nlist_64
MachOObjectFile::getSymbol64TableEntry(DataRefImpl DRI) const {
const char *P = reinterpret_cast<const char *>(DRI.p);
- return getStruct<macho::Symbol64TableEntry>(this, P);
+ return getStruct<MachO::nlist_64>(this, P);
}
-macho::LinkeditDataLoadCommand MachOObjectFile::getLinkeditDataLoadCommand(
- const MachOObjectFile::LoadCommandInfo &L) const {
- return getStruct<macho::LinkeditDataLoadCommand>(this, L.Ptr);
+MachO::linkedit_data_command
+MachOObjectFile::getLinkeditDataLoadCommand(const LoadCommandInfo &L) const {
+ return getStruct<MachO::linkedit_data_command>(this, L.Ptr);
}
-macho::SegmentLoadCommand
+MachO::segment_command
MachOObjectFile::getSegmentLoadCommand(const LoadCommandInfo &L) const {
- return getStruct<macho::SegmentLoadCommand>(this, L.Ptr);
+ return getStruct<MachO::segment_command>(this, L.Ptr);
}
-macho::Segment64LoadCommand
+MachO::segment_command_64
MachOObjectFile::getSegment64LoadCommand(const LoadCommandInfo &L) const {
- return getStruct<macho::Segment64LoadCommand>(this, L.Ptr);
+ return getStruct<MachO::segment_command_64>(this, L.Ptr);
}
-macho::LinkerOptionsLoadCommand
+MachO::linker_options_command
MachOObjectFile::getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const {
- return getStruct<macho::LinkerOptionsLoadCommand>(this, L.Ptr);
+ return getStruct<MachO::linker_options_command>(this, L.Ptr);
}
-macho::RelocationEntry
+MachO::any_relocation_info
MachOObjectFile::getRelocation(DataRefImpl Rel) const {
const char *P = reinterpret_cast<const char *>(Rel.p);
- return getStruct<macho::RelocationEntry>(this, P);
+ return getStruct<MachO::any_relocation_info>(this, P);
}
-macho::DataInCodeTableEntry
+MachO::data_in_code_entry
MachOObjectFile::getDice(DataRefImpl Rel) const {
const char *P = reinterpret_cast<const char *>(Rel.p);
- return getStruct<macho::DataInCodeTableEntry>(this, P);
+ return getStruct<MachO::data_in_code_entry>(this, P);
}
-macho::Header MachOObjectFile::getHeader() const {
- return getStruct<macho::Header>(this, getPtr(this, 0));
+MachO::mach_header MachOObjectFile::getHeader() const {
+ return getStruct<MachO::mach_header>(this, getPtr(this, 0));
}
-macho::Header64Ext MachOObjectFile::getHeader64Ext() const {
- return
- getStruct<macho::Header64Ext>(this, getPtr(this, sizeof(macho::Header)));
+MachO::mach_header_64 MachOObjectFile::getHeader64() const {
+ return getStruct<MachO::mach_header_64>(this, getPtr(this, 0));
}
-macho::IndirectSymbolTableEntry MachOObjectFile::getIndirectSymbolTableEntry(
- const macho::DysymtabLoadCommand &DLC,
- unsigned Index) const {
- uint64_t Offset = DLC.IndirectSymbolTableOffset +
- Index * sizeof(macho::IndirectSymbolTableEntry);
- return getStruct<macho::IndirectSymbolTableEntry>(this, getPtr(this, Offset));
+uint32_t MachOObjectFile::getIndirectSymbolTableEntry(
+ const MachO::dysymtab_command &DLC,
+ unsigned Index) const {
+ uint64_t Offset = DLC.indirectsymoff + Index * sizeof(uint32_t);
+ return getStruct<uint32_t>(this, getPtr(this, Offset));
}
-macho::DataInCodeTableEntry
+MachO::data_in_code_entry
MachOObjectFile::getDataInCodeTableEntry(uint32_t DataOffset,
unsigned Index) const {
- uint64_t Offset = DataOffset + Index * sizeof(macho::DataInCodeTableEntry);
- return getStruct<macho::DataInCodeTableEntry>(this, getPtr(this, Offset));
+ uint64_t Offset = DataOffset + Index * sizeof(MachO::data_in_code_entry);
+ return getStruct<MachO::data_in_code_entry>(this, getPtr(this, Offset));
}
-macho::SymtabLoadCommand MachOObjectFile::getSymtabLoadCommand() const {
- return getStruct<macho::SymtabLoadCommand>(this, SymtabLoadCmd);
+MachO::symtab_command MachOObjectFile::getSymtabLoadCommand() const {
+ return getStruct<MachO::symtab_command>(this, SymtabLoadCmd);
}
-macho::DysymtabLoadCommand MachOObjectFile::getDysymtabLoadCommand() const {
- return getStruct<macho::DysymtabLoadCommand>(this, DysymtabLoadCmd);
+MachO::dysymtab_command MachOObjectFile::getDysymtabLoadCommand() const {
+ return getStruct<MachO::dysymtab_command>(this, DysymtabLoadCmd);
}
-macho::LinkeditDataLoadCommand
+MachO::linkedit_data_command
MachOObjectFile::getDataInCodeLoadCommand() const {
if (DataInCodeLoadCmd)
- return getStruct<macho::LinkeditDataLoadCommand>(this, DataInCodeLoadCmd);
+ return getStruct<MachO::linkedit_data_command>(this, DataInCodeLoadCmd);
// If there is no DataInCodeLoadCmd return a load command with zero'ed fields.
- macho::LinkeditDataLoadCommand Cmd;
- Cmd.Type = macho::LCT_DataInCode;
- Cmd.Size = macho::LinkeditLoadCommandSize;
- Cmd.DataOffset = 0;
- Cmd.DataSize = 0;
+ MachO::linkedit_data_command Cmd;
+ Cmd.cmd = MachO::LC_DATA_IN_CODE;
+ Cmd.cmdsize = sizeof(MachO::linkedit_data_command);
+ Cmd.dataoff = 0;
+ Cmd.datasize = 0;
return Cmd;
}
StringRef MachOObjectFile::getStringTableData() const {
- macho::SymtabLoadCommand S = getSymtabLoadCommand();
- return getData().substr(S.StringTableOffset, S.StringTableSize);
+ MachO::symtab_command S = getSymtabLoadCommand();
+ return getData().substr(S.stroff, S.strsize);
}
bool MachOObjectFile::is64Bit() const {
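The bulk of the MachOObjectFile.cpp patch above replaces LLVM's old macho::* wrapper structs with MachO::* structs whose field names match <mach-o/loader.h>, and each SwapStruct specialization byte-swaps only the endian-sensitive fields of one struct. Below is a minimal standalone sketch of that pattern; nlist_base_example, swap16, swap32 and swapStructExample are illustrative names standing in for the patch's nlist_base and SwapValue helpers, not LLVM code.

#include <cstdint>

struct nlist_base_example {   // same shape as the patch's nlist_base
  uint32_t n_strx;
  uint8_t  n_type;
  uint8_t  n_sect;
  uint16_t n_desc;
};

static uint16_t swap16(uint16_t V) { return uint16_t((V >> 8) | (V << 8)); }
static uint32_t swap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0xff00u) | ((V << 8) & 0xff0000u) | (V << 24);
}

// Only the multi-byte fields change with byte order; the single-byte n_type
// and n_sect are left untouched, just like SwapStruct(nlist_base &) above.
static void swapStructExample(nlist_base_example &S) {
  S.n_strx = swap32(S.n_strx);
  S.n_desc = swap16(S.n_desc);
}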
diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
index b76f10e..75160af 100644
--- a/lib/Object/MachOUniversal.cpp
+++ b/lib/Object/MachOUniversal.cpp
@@ -31,18 +31,18 @@ template<typename T>
static void SwapStruct(T &Value);
template<>
-void SwapStruct(macho::FatHeader &H) {
- SwapValue(H.Magic);
- SwapValue(H.NumFatArch);
+void SwapStruct(MachO::fat_header &H) {
+ SwapValue(H.magic);
+ SwapValue(H.nfat_arch);
}
template<>
-void SwapStruct(macho::FatArchHeader &H) {
- SwapValue(H.CPUType);
- SwapValue(H.CPUSubtype);
- SwapValue(H.Offset);
- SwapValue(H.Size);
- SwapValue(H.Align);
+void SwapStruct(MachO::fat_arch &H) {
+ SwapValue(H.cputype);
+ SwapValue(H.cpusubtype);
+ SwapValue(H.offset);
+ SwapValue(H.size);
+ SwapValue(H.align);
}
template<typename T>
@@ -63,10 +63,10 @@ MachOUniversalBinary::ObjectForArch::ObjectForArch(
} else {
// Parse object header.
StringRef ParentData = Parent->getData();
- const char *HeaderPos = ParentData.begin() + macho::FatHeaderSize +
- Index * macho::FatArchHeaderSize;
- Header = getUniversalBinaryStruct<macho::FatArchHeader>(HeaderPos);
- if (ParentData.size() < Header.Offset + Header.Size) {
+ const char *HeaderPos = ParentData.begin() + sizeof(MachO::fat_header) +
+ Index * sizeof(MachO::fat_arch);
+ Header = getUniversalBinaryStruct<MachO::fat_arch>(HeaderPos);
+ if (ParentData.size() < Header.offset + Header.size) {
clear();
}
}
@@ -76,10 +76,10 @@ error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile(
OwningPtr<ObjectFile> &Result) const {
if (Parent) {
StringRef ParentData = Parent->getData();
- StringRef ObjectData = ParentData.substr(Header.Offset, Header.Size);
+ StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
std::string ObjectName =
Parent->getFileName().str() + ":" +
- Triple::getArchTypeName(MachOObjectFile::getArch(Header.CPUType));
+ Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype));
MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer(
ObjectData, ObjectName, false);
if (ObjectFile *Obj = ObjectFile::createMachOObjectFile(ObjBuffer)) {
@@ -96,31 +96,31 @@ MachOUniversalBinary::MachOUniversalBinary(MemoryBuffer *Source,
error_code &ec)
: Binary(Binary::ID_MachOUniversalBinary, Source),
NumberOfObjects(0) {
- if (Source->getBufferSize() < macho::FatHeaderSize) {
+ if (Source->getBufferSize() < sizeof(MachO::fat_header)) {
ec = object_error::invalid_file_type;
return;
}
// Check for magic value and sufficient header size.
StringRef Buf = getData();
- macho::FatHeader H = getUniversalBinaryStruct<macho::FatHeader>(Buf.begin());
- NumberOfObjects = H.NumFatArch;
- uint32_t MinSize = macho::FatHeaderSize +
- macho::FatArchHeaderSize * NumberOfObjects;
- if (H.Magic != macho::HM_Universal || Buf.size() < MinSize) {
+ MachO::fat_header H= getUniversalBinaryStruct<MachO::fat_header>(Buf.begin());
+ NumberOfObjects = H.nfat_arch;
+ uint32_t MinSize = sizeof(MachO::fat_header) +
+ sizeof(MachO::fat_arch) * NumberOfObjects;
+ if (H.magic != MachO::FAT_MAGIC || Buf.size() < MinSize) {
ec = object_error::parse_failed;
return;
}
ec = object_error::success;
}
-static bool getCTMForArch(Triple::ArchType Arch, mach::CPUTypeMachine &CTM) {
+static bool getCTMForArch(Triple::ArchType Arch, MachO::CPUType &CTM) {
switch (Arch) {
- case Triple::x86: CTM = mach::CTM_i386; return true;
- case Triple::x86_64: CTM = mach::CTM_x86_64; return true;
- case Triple::arm: CTM = mach::CTM_ARM; return true;
- case Triple::sparc: CTM = mach::CTM_SPARC; return true;
- case Triple::ppc: CTM = mach::CTM_PowerPC; return true;
- case Triple::ppc64: CTM = mach::CTM_PowerPC64; return true;
+ case Triple::x86: CTM = MachO::CPU_TYPE_I386; return true;
+ case Triple::x86_64: CTM = MachO::CPU_TYPE_X86_64; return true;
+ case Triple::arm: CTM = MachO::CPU_TYPE_ARM; return true;
+ case Triple::sparc: CTM = MachO::CPU_TYPE_SPARC; return true;
+ case Triple::ppc: CTM = MachO::CPU_TYPE_POWERPC; return true;
+ case Triple::ppc64: CTM = MachO::CPU_TYPE_POWERPC64; return true;
default: return false;
}
}
@@ -128,7 +128,7 @@ static bool getCTMForArch(Triple::ArchType Arch, mach::CPUTypeMachine &CTM) {
error_code
MachOUniversalBinary::getObjectForArch(Triple::ArchType Arch,
OwningPtr<ObjectFile> &Result) const {
- mach::CPUTypeMachine CTM;
+ MachO::CPUType CTM;
if (!getCTMForArch(Arch, CTM))
return object_error::arch_not_found;
for (object_iterator I = begin_objects(), E = end_objects(); I != E; ++I) {
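As a quick check of the per-arch header arithmetic in the ObjectForArch constructor above, using the standard Mach-O layout rather than anything specific to this patch: sizeof(MachO::fat_header) is 8 bytes (magic plus nfat_arch) and sizeof(MachO::fat_arch) is 20 bytes (five 32-bit fields), so for Index = 2 the per-arch header is read from offset 8 + 2 * 20 = 48 into the universal binary.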
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index 1d1dafd..0e626d6 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -49,6 +49,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
case sys::fs::file_magic::bitcode:
case sys::fs::file_magic::archive:
case sys::fs::file_magic::macho_universal_binary:
+ case sys::fs::file_magic::windows_resource:
delete Object;
return 0;
case sys::fs::file_magic::elf_relocatable:
@@ -68,6 +69,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
case sys::fs::file_magic::macho_dsym_companion:
return createMachOObjectFile(Object);
case sys::fs::file_magic::coff_object:
+ case sys::fs::file_magic::coff_import_library:
case sys::fs::file_magic::pecoff_executable:
return createCOFFObjectFile(Object);
}
diff --git a/lib/Object/YAML.cpp b/lib/Object/YAML.cpp
index 21bacb8..c527bde 100644
--- a/lib/Object/YAML.cpp
+++ b/lib/Object/YAML.cpp
@@ -15,6 +15,7 @@
#include "llvm/Object/YAML.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cctype>
using namespace llvm;
using namespace object::yaml;
diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp
index 98e63bc..6fa459a 100644
--- a/lib/Option/OptTable.cpp
+++ b/lib/Option/OptTable.cpp
@@ -14,26 +14,27 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cctype>
#include <map>
using namespace llvm;
using namespace llvm::opt;
-// Ordering on Info. The ordering is *almost* lexicographic, with two
-// exceptions. First, '\0' comes at the end of the alphabet instead of
-// the beginning (thus options precede any other options which prefix
-// them). Second, for options with the same name, the less permissive
-// version should come first; a Flag option should precede a Joined
-// option, for example.
+namespace llvm {
+namespace opt {
-static int StrCmpOptionName(const char *A, const char *B) {
- char a = *A, b = *B;
+// Ordering on Info. The ordering is *almost* case-insensitive lexicographic,
+// with one exception: '\0' comes at the end of the alphabet instead of the
+// beginning (thus options precede any other options which prefix them).
+static int StrCmpOptionNameIgnoreCase(const char *A, const char *B) {
+ const char *X = A, *Y = B;
+ char a = tolower(*A), b = tolower(*B);
while (a == b) {
if (a == '\0')
return 0;
- a = *++A;
- b = *++B;
+ a = tolower(*++X);
+ b = tolower(*++Y);
}
if (a == '\0') // A is a prefix of B.
@@ -45,21 +46,25 @@ static int StrCmpOptionName(const char *A, const char *B) {
return (a < b) ? -1 : 1;
}
-namespace llvm {
-namespace opt {
+#ifndef NDEBUG
+static int StrCmpOptionName(const char *A, const char *B) {
+ if (int N = StrCmpOptionNameIgnoreCase(A, B))
+ return N;
+ return strcmp(A, B);
+}
static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
if (&A == &B)
return false;
if (int N = StrCmpOptionName(A.Name, B.Name))
- return N == -1;
+ return N < 0;
for (const char * const *APre = A.Prefixes,
* const *BPre = B.Prefixes;
*APre != 0 && *BPre != 0; ++APre, ++BPre) {
if (int N = StrCmpOptionName(*APre, *BPre))
- return N == -1;
+ return N < 0;
}
// Names are the same, check that classes are in order; exactly one
@@ -68,22 +73,22 @@ static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
"Unexpected classes for options with same name.");
return B.Kind == Option::JoinedClass;
}
+#endif
// Support lower_bound between info and an option name.
static inline bool operator<(const OptTable::Info &I, const char *Name) {
- return StrCmpOptionName(I.Name, Name) == -1;
-}
-static inline bool operator<(const char *Name, const OptTable::Info &I) {
- return StrCmpOptionName(Name, I.Name) == -1;
+ return StrCmpOptionNameIgnoreCase(I.Name, Name) < 0;
}
}
}
OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {}
-OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos)
+OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos,
+ bool _IgnoreCase)
: OptionInfos(_OptionInfos),
NumOptionInfos(_NumOptionInfos),
+ IgnoreCase(_IgnoreCase),
TheInputOptionID(0),
TheUnknownOptionID(0),
FirstSearchableIndex(0)
@@ -171,11 +176,18 @@ static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) {
}
/// \returns Matched size. 0 means no match.
-static unsigned matchOption(const OptTable::Info *I, StringRef Str) {
+static unsigned matchOption(const OptTable::Info *I, StringRef Str,
+ bool IgnoreCase) {
for (const char * const *Pre = I->Prefixes; *Pre != 0; ++Pre) {
StringRef Prefix(*Pre);
- if (Str.startswith(Prefix) && Str.substr(Prefix.size()).startswith(I->Name))
- return Prefix.size() + StringRef(I->Name).size();
+ if (Str.startswith(Prefix)) {
+ StringRef Rest = Str.substr(Prefix.size());
+ bool Matched = IgnoreCase
+ ? Rest.startswith_lower(I->Name)
+ : Rest.startswith(I->Name);
+ if (Matched)
+ return Prefix.size() + StringRef(I->Name).size();
+ }
}
return 0;
}
@@ -210,7 +222,7 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
unsigned ArgSize = 0;
// Scan for first option which is a proper prefix.
for (; Start != End; ++Start)
- if ((ArgSize = matchOption(Start, Str)))
+ if ((ArgSize = matchOption(Start, Str, IgnoreCase)))
break;
if (Start == End)
break;
@@ -259,20 +271,6 @@ InputArgList *OptTable::ParseArgs(const char *const *ArgBegin,
continue;
}
- if (Str == "--") {
- // Everything after -- is a filename.
- ++Index;
-
- assert(TheInputOptionID != 0 && "Invalid input option ID.");
- while (Index < End) {
- Args->append(new Arg(getOption(TheInputOptionID),
- Args->getArgString(Index), Index,
- Args->getArgString(Index)));
- ++Index;
- }
- break;
- }
-
unsigned Prev = Index;
Arg *A = ParseOneArg(*Args, Index, FlagsToInclude, FlagsToExclude);
assert(Index > Prev && "Parser failed to consume argument.");
@@ -308,6 +306,7 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
break;
case Option::SeparateClass: case Option::JoinedOrSeparateClass:
+ case Option::RemainingArgsClass:
Name += ' ';
// FALLTHROUGH
case Option::JoinedClass: case Option::CommaJoinedClass:
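For illustration, a minimal standalone sketch of the prefix-plus-name matching that matchOption now performs when IgnoreCase is set; prefixes stay case-sensitive and only the option name gets the lowered comparison (the helper names here are made up, not part of the patch):

#include <cctype>
#include <string>

// Case-insensitive "starts with", in the spirit of StringRef::startswith_lower.
static bool startsWithLower(const std::string &Str, const std::string &Prefix) {
  if (Str.size() < Prefix.size())
    return false;
  for (size_t I = 0; I != Prefix.size(); ++I)
    if (std::tolower((unsigned char)Str[I]) !=
        std::tolower((unsigned char)Prefix[I]))
      return false;
  return true;
}

// Returns the number of characters matched (prefix + name), or 0 on no match.
static unsigned matchSpelling(const std::string &Arg, const std::string &Prefix,
                              const std::string &Name, bool IgnoreCase) {
  if (Arg.compare(0, Prefix.size(), Prefix) != 0) // prefix stays case-sensitive
    return 0;
  std::string Rest = Arg.substr(Prefix.size());
  bool Matched = IgnoreCase ? startsWithLower(Rest, Name)
                            : Rest.compare(0, Name.size(), Name) == 0;
  return Matched ? unsigned(Prefix.size() + Name.size()) : 0;
}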
diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp
index 1d6a3d3..7b5ff2b 100644
--- a/lib/Option/Option.cpp
+++ b/lib/Option/Option.cpp
@@ -52,6 +52,7 @@ void Option::dump() const {
P(MultiArgClass);
P(JoinedOrSeparateClass);
P(JoinedAndSeparateClass);
+ P(RemainingArgsClass);
#undef P
}
@@ -214,6 +215,16 @@ Arg *Option::accept(const ArgList &Args,
return new Arg(UnaliasedOption, Spelling, Index - 2,
Args.getArgString(Index - 2) + ArgSize,
Args.getArgString(Index - 1));
+ case RemainingArgsClass: {
+ // Matches iff this is an exact match.
+ // FIXME: Avoid strlen.
+ if (ArgSize != strlen(Args.getArgString(Index)))
+ return 0;
+ Arg *A = new Arg(UnaliasedOption, Spelling, Index++);
+ while (Index < Args.getNumInputArgStrings())
+ A->getValues().push_back(Args.getArgString(Index++));
+ return A;
+ }
default:
llvm_unreachable("Invalid option kind!");
}
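A rough standalone sketch of the RemainingArgsClass behaviour accepted above: the spelling must match the whole argument exactly, and every following argv entry becomes one of the option's values (the "--args" spelling below is hypothetical):

#include <string>
#include <vector>

struct ParsedArg {
  std::string Spelling;
  std::vector<std::string> Values;
};

// Consume "--args" and everything after it, mirroring RemainingArgsClass.
static bool acceptRemainingArgs(const std::vector<std::string> &Argv,
                                size_t &Index, ParsedArg &Out) {
  if (Argv[Index] != "--args") // exact match required, no joined value allowed
    return false;
  Out.Spelling = Argv[Index++];
  while (Index < Argv.size())
    Out.Values.push_back(Argv[Index++]);
  return true;
}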
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 34bc6b6..676e2d4 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -3546,11 +3546,14 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
// Set FormatPrecision if zero. We want to do this before we
// truncate trailing zeros, as those are part of the precision.
if (!FormatPrecision) {
- // It's an interesting question whether to use the nominal
- // precision or the active precision here for denormals.
-
- // FormatPrecision = ceil(significandBits / lg_2(10))
- FormatPrecision = (semantics->precision * 59 + 195) / 196;
+ // We use enough digits so the number can be round-tripped back to an
+ // APFloat. The formula comes from "How to Print Floating-Point Numbers
+ // Accurately" by Steele and White.
+ // FIXME: Using a formula based purely on the precision is conservative;
+ // we can print fewer digits depending on the actual value being printed.
+
+ // FormatPrecision = 2 + floor(significandBits / lg_2(10))
+ FormatPrecision = 2 + semantics->precision * 59 / 196;
}
// Ignore trailing binary zeros.
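A quick sanity check of the new formula: 59/196 ≈ log10(2), so this computes 2 + floor(significandBits · log10(2)). Feeding in the standard IEEE significand widths reproduces the familiar round-trip digit counts:

#include <cstdio>

int main() {
  // Significand widths for IEEE single, double, x87 extended and quad.
  const unsigned Prec[4] = { 24, 53, 64, 113 };
  for (unsigned i = 0; i != 4; ++i)
    std::printf("%3u significand bits -> %u decimal digits\n",
                Prec[i], 2 + Prec[i] * 59 / 196);
  // Prints 9, 17, 21 and 36 respectively.
  return 0;
}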
diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp
index 3c4191b..6e7a541 100644
--- a/lib/Support/Allocator.cpp
+++ b/lib/Support/Allocator.cpp
@@ -26,6 +26,10 @@ BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold,
: SlabSize(size), SizeThreshold(std::min(size, threshold)),
Allocator(allocator), CurSlab(0), BytesAllocated(0) { }
+BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold)
+ : SlabSize(size), SizeThreshold(std::min(size, threshold)),
+ Allocator(DefaultSlabAllocator), CurSlab(0), BytesAllocated(0) { }
+
BumpPtrAllocator::~BumpPtrAllocator() {
DeallocateSlabs(CurSlab);
}
@@ -167,9 +171,6 @@ void BumpPtrAllocator::PrintStats() const {
<< " (includes alignment, etc)\n";
}
-MallocSlabAllocator BumpPtrAllocator::DefaultSlabAllocator =
- MallocSlabAllocator();
-
SlabAllocator::~SlabAllocator() { }
MallocSlabAllocator::~MallocSlabAllocator() { }
diff --git a/lib/Support/BlockFrequency.cpp b/lib/Support/BlockFrequency.cpp
index 5e45e46..00efe90 100644
--- a/lib/Support/BlockFrequency.cpp
+++ b/lib/Support/BlockFrequency.cpp
@@ -19,52 +19,69 @@
using namespace llvm;
/// Multiply FREQ by N and store result in W array.
-static void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) {
+static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) {
uint64_t u0 = freq & UINT32_MAX;
uint64_t u1 = freq >> 32;
- // Represent 96-bit value as w[2]:w[1]:w[0];
- uint32_t w[3] = { 0, 0, 0 };
-
+ // Represent 96-bit value as W[2]:W[1]:W[0];
uint64_t t = u0 * N;
uint64_t k = t >> 32;
- w[0] = t;
+ W[0] = t;
t = u1 * N + k;
- w[1] = t;
- w[2] = t >> 32;
-
- // W[1] - higher bits.
- // W[0] - lower bits.
- W[0] = w[0] + ((uint64_t) w[1] << 32);
- W[1] = w[2];
+ W[1] = t;
+ W[2] = t >> 32;
}
-
-/// Divide 96-bit value stored in W array by D.
-/// Return 64-bit quotient, saturated to UINT64_MAX on overflow.
-static uint64_t div96bit(uint64_t W[2], uint32_t D) {
- uint64_t y = W[0];
- uint64_t x = W[1];
- unsigned i;
-
- assert(x != 0 && "This is really a 64-bit division");
-
- // This long division algorithm automatically saturates on overflow.
- for (i = 0; i < 64 && x; ++i) {
- uint32_t t = -((x >> 31) & 1); // Splat bit 31 to bits 0-31.
- x = (x << 1) | (y >> 63);
- y = y << 1;
- if ((x | t) >= D) {
- x -= D;
- ++y;
+/// Divide 96-bit value stored in W[2]:W[1]:W[0] by D. Since our word size is a
+/// 32 bit unsigned integer, we can use a short division algorithm.
+static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) {
+ // We assume that W[2] is non-zero; if it were zero, the caller should just
+ // use 64/32 hardware division instead.
+ assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is "
+ "zero, the caller should just use 64/32 hardware.");
+ uint32_t Q[3] = { 0, 0, 0 };
+
+ // The generalized short division algorithm sets i to m + n - 1, where n is
+ // the number of words in the divisor and m is the number of words by which
+ // the dividend exceeds the divisor (i.e. m + n == the length of the dividend
+ // in words). Due to our assumption that W[2] is non-zero, we know that the
+ // dividend is of length 3, implying, since n is 1, that m = 2. Thus we set i
+ // to m + n - 1 = 2 + 1 - 1 = 2.
+ uint32_t R = 0;
+ for (int i = 2; i >= 0; --i) {
+ uint64_t PartialD = uint64_t(R) << 32 | W[i];
+ if (PartialD == 0) {
+ Q[i] = 0;
+ R = 0;
+ } else if (PartialD < D) {
+ Q[i] = 0;
+ R = uint32_t(PartialD);
+ } else if (PartialD == D) {
+ Q[i] = 1;
+ R = 0;
+ } else {
+ Q[i] = uint32_t(PartialD / D);
+ R = uint32_t(PartialD - (Q[i] * D));
}
}
- return y << (64 - i);
-}
+ // If Q[2] is non-zero, then we overflowed.
+ uint64_t Result;
+ if (Q[2]) {
+ Result = UINT64_MAX;
+ R = D;
+ } else {
+ // Form the final uint64_t result, avoiding endianness issues.
+ Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32);
+ }
+
+ if (Rout)
+ *Rout = R;
+ return Result;
+}
-void BlockFrequency::scale(uint32_t N, uint32_t D) {
+uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) {
assert(D != 0 && "Division by zero");
// Calculate Frequency * N.
@@ -75,15 +92,16 @@ void BlockFrequency::scale(uint32_t N, uint32_t D) {
// If the product fits in 64 bits, just use built-in division.
if (MulHi <= UINT32_MAX && MulRes >= MulLo) {
Frequency = MulRes / D;
- return;
+ return MulRes % D;
}
// Product overflowed, use 96-bit operations.
- // 96-bit value represented as W[1]:W[0].
- uint64_t W[2];
+ // 96-bit value represented as W[2]:W[1]:W[0].
+ uint32_t W[3];
+ uint32_t R;
mult96bit(Frequency, N, W);
- Frequency = div96bit(W, D);
- return;
+ Frequency = divrem96bit(W, D, &R);
+ return R;
}
BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
@@ -127,6 +145,10 @@ BlockFrequency::operator+(const BlockFrequency &Prob) const {
return Freq;
}
+uint32_t BlockFrequency::scale(const BranchProbability &Prob) {
+ return scale(Prob.getNumerator(), Prob.getDenominator());
+}
+
void BlockFrequency::print(raw_ostream &OS) const {
// Convert fixed-point number to decimal.
OS << Frequency / getEntryFrequency() << ".";
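The short-division technique used by divrem96bit, reduced to a generic standalone helper (word array in little-endian order, divided in place by a 32-bit divisor):

#include <cassert>
#include <stdint.h>

// Divide W[NumWords-1]:...:W[0] by D in place, most significant word first;
// return the remainder.
static uint32_t shortDivide(uint32_t *W, int NumWords, uint32_t D) {
  assert(D != 0 && "Division by zero");
  uint32_t R = 0;
  for (int i = NumWords - 1; i >= 0; --i) {
    uint64_t Partial = (uint64_t(R) << 32) | W[i];
    W[i] = uint32_t(Partial / D);
    R = uint32_t(Partial % D);
  }
  return R;
}

After the call W holds the quotient, so an overflow check is just a test that the top word is non-zero, exactly as divrem96bit does before forming the 64-bit result.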
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 5823836..3aecf3f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_library(LLVMSupport
ToolOutputFile.cpp
Triple.cpp
Twine.cpp
+ Unicode.cpp
YAMLParser.cpp
YAMLTraits.cpp
raw_os_ostream.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index a47af27..44a88d8 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -60,6 +60,8 @@ TEMPLATE_INSTANTIATION(class opt<char>);
TEMPLATE_INSTANTIATION(class opt<bool>);
} } // end namespace llvm::cl
+// Pin the vtables to this file.
+void GenericOptionValue::anchor() {}
void OptionValue<boolOrDefault>::anchor() {}
void OptionValue<std::string>::anchor() {}
void Option::anchor() {}
@@ -73,6 +75,7 @@ void parser<double>::anchor() {}
void parser<float>::anchor() {}
void parser<std::string>::anchor() {}
void parser<char>::anchor() {}
+void StringSaver::anchor() {}
//===----------------------------------------------------------------------===//
diff --git a/lib/Support/Compression.cpp b/lib/Support/Compression.cpp
index fd8a874..b5ddb70 100644
--- a/lib/Support/Compression.cpp
+++ b/lib/Support/Compression.cpp
@@ -81,6 +81,10 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
return Res;
}
+uint32_t zlib::crc32(StringRef Buffer) {
+ return ::crc32(0, (const Bytef *)Buffer.data(), Buffer.size());
+}
+
#else
bool zlib::isAvailable() { return false; }
zlib::Status zlib::compress(StringRef InputBuffer,
@@ -93,5 +97,8 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
size_t UncompressedSize) {
return zlib::StatusUnsupported;
}
+uint32_t zlib::crc32(StringRef Buffer) {
+ llvm_unreachable("zlib::crc32 is unavailable");
+}
#endif
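A minimal usage sketch for the new checksum helper, assuming LLVM was configured with zlib (the stub built without zlib hits llvm_unreachable):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/raw_ostream.h"

static void printChecksum(llvm::StringRef Data) {
  if (!llvm::zlib::isAvailable())
    return; // the no-zlib stub of crc32 is unreachable
  llvm::outs() << "crc32: " << llvm::zlib::crc32(Data) << "\n";
}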
diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp
index bb38cd1..265b6e9 100644
--- a/lib/Support/ConstantRange.cpp
+++ b/lib/Support/ConstantRange.cpp
@@ -144,9 +144,6 @@ bool ConstantRange::isSignWrappedSet() const {
/// getSetSize - Return the number of elements in this set.
///
APInt ConstantRange::getSetSize() const {
- if (isEmptySet())
- return APInt(getBitWidth()+1, 0);
-
if (isFullSet()) {
APInt Size(getBitWidth()+1, 0);
Size.setBit(getBitWidth());
@@ -448,6 +445,11 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
unsigned SrcTySize = getBitWidth();
assert(SrcTySize < DstTySize && "Not a value extension");
+
+ // special case: [X, INT_MIN) -- not really wrapping around
+ if (Upper.isMinSignedValue())
+ return ConstantRange(Lower.sext(DstTySize), Upper.zext(DstTySize));
+
if (isFullSet() || isSignWrappedSet()) {
return ConstantRange(APInt::getHighBitsSet(DstTySize,DstTySize-SrcTySize+1),
APInt::getLowBitsSet(DstTySize, SrcTySize-1) + 1);
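As a concrete instance of the new special case: in i8 the range [100, 0x80) denotes {100, ..., 127}. It is not sign-wrapped, so the old code fell through to the generic path and sign-extended both bounds, giving [100, 0xFF80) in i16, a drastic over-approximation; with the Upper.isMinSignedValue() check the result is [Lower.sext(16), Upper.zext(16)) = [100, 128), i.e. exactly the same set.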
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index d2a3895..92c370d 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/ThreadLocal.h"
#include <cstdio>
@@ -21,7 +22,7 @@ namespace {
struct CrashRecoveryContextImpl;
-static sys::ThreadLocal<const CrashRecoveryContextImpl> CurrentContext;
+static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext;
struct CrashRecoveryContextImpl {
CrashRecoveryContext *CRC;
@@ -34,11 +35,11 @@ public:
CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC),
Failed(false),
SwitchedThread(false) {
- CurrentContext.set(this);
+ CurrentContext->set(this);
}
~CrashRecoveryContextImpl() {
if (!SwitchedThread)
- CurrentContext.erase();
+ CurrentContext->erase();
}
/// \brief Called when the separate crash-recovery thread was finished, to
@@ -48,7 +49,7 @@ public:
void HandleCrash() {
// Eliminate the current context entry, to avoid re-entering in case the
// cleanup code crashes.
- CurrentContext.erase();
+ CurrentContext->erase();
assert(!Failed && "Crash recovery context already failed!");
Failed = true;
@@ -62,10 +63,10 @@ public:
}
-static sys::Mutex gCrashRecoveryContexMutex;
+static ManagedStatic<sys::Mutex> gCrashRecoveryContextMutex;
static bool gCrashRecoveryEnabled = false;
-static sys::ThreadLocal<const CrashRecoveryContextCleanup>
+static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextCleanup> >
tlIsRecoveringFromCrash;
CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
@@ -73,7 +74,7 @@ CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
CrashRecoveryContext::~CrashRecoveryContext() {
// Reclaim registered resources.
CrashRecoveryContextCleanup *i = head;
- tlIsRecoveringFromCrash.set(head);
+ tlIsRecoveringFromCrash->set(head);
while (i) {
CrashRecoveryContextCleanup *tmp = i;
i = tmp->next;
@@ -81,21 +82,21 @@ CrashRecoveryContext::~CrashRecoveryContext() {
tmp->recoverResources();
delete tmp;
}
- tlIsRecoveringFromCrash.erase();
+ tlIsRecoveringFromCrash->erase();
CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
delete CRCI;
}
bool CrashRecoveryContext::isRecoveringFromCrash() {
- return tlIsRecoveringFromCrash.get() != 0;
+ return tlIsRecoveringFromCrash->get() != 0;
}
CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
if (!gCrashRecoveryEnabled)
return 0;
- const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+ const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
if (!CRCI)
return 0;
@@ -154,7 +155,7 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
{
// Lookup the current thread local recovery object.
- const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+ const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
if (!CRCI) {
// Something has gone horribly wrong, so let's just tell everyone
@@ -182,7 +183,7 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
static sys::ThreadLocal<const void> sCurrentExceptionHandle;
void CrashRecoveryContext::Enable() {
- sys::ScopedLock L(gCrashRecoveryContexMutex);
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
if (gCrashRecoveryEnabled)
return;
@@ -198,7 +199,7 @@ void CrashRecoveryContext::Enable() {
}
void CrashRecoveryContext::Disable() {
- sys::ScopedLock L(gCrashRecoveryContexMutex);
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
if (!gCrashRecoveryEnabled)
return;
@@ -236,7 +237,7 @@ static struct sigaction PrevActions[NumSignals];
static void CrashRecoverySignalHandler(int Signal) {
// Lookup the current thread local recovery object.
- const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+ const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
if (!CRCI) {
// We didn't find a crash recovery context -- this means either we got a
@@ -267,7 +268,7 @@ static void CrashRecoverySignalHandler(int Signal) {
}
void CrashRecoveryContext::Enable() {
- sys::ScopedLock L(gCrashRecoveryContexMutex);
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
if (gCrashRecoveryEnabled)
return;
@@ -286,7 +287,7 @@ void CrashRecoveryContext::Enable() {
}
void CrashRecoveryContext::Disable() {
- sys::ScopedLock L(gCrashRecoveryContexMutex);
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
if (!gCrashRecoveryEnabled)
return;
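The conversion pattern applied here (and to DynamicLibrary and PrettyStackTrace below), sketched in isolation: a ManagedStatic is constructed lazily on first dereference and destroyed by llvm::llvm_shutdown(), which sidesteps static construction and destruction order problems for these globals (the global name below is hypothetical):

#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"

// Hypothetical global, following the pattern introduced above.
static llvm::ManagedStatic<llvm::sys::Mutex> SomeMutex;

static void doWorkLocked() {
  // First dereference constructs the mutex; nothing runs at static-init time.
  llvm::sys::ScopedLock Lock(*SomeMutex);
  // ... critical section ...
}

// llvm::llvm_shutdown() later destroys every ManagedStatic in reverse order.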
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 8a80139..c000b63 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+
using namespace llvm;
using namespace dwarf;
@@ -59,8 +61,8 @@ const char *llvm::dwarf::TagString(unsigned Tag) {
case DW_TAG_namelist_item: return "DW_TAG_namelist_item";
case DW_TAG_packed_type: return "DW_TAG_packed_type";
case DW_TAG_subprogram: return "DW_TAG_subprogram";
- case DW_TAG_template_type_parameter: return "DW_TAG_template_type_parameter";
- case DW_TAG_template_value_parameter:return "DW_TAG_template_value_parameter";
+ case DW_TAG_template_type_parameter: return "DW_TAG_template_type_parameter";
+ case DW_TAG_template_value_parameter: return "DW_TAG_template_value_parameter";
case DW_TAG_thrown_type: return "DW_TAG_thrown_type";
case DW_TAG_try_block: return "DW_TAG_try_block";
case DW_TAG_variant_part: return "DW_TAG_variant_part";
@@ -454,10 +456,11 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
case DW_OP_bit_piece: return "DW_OP_bit_piece";
case DW_OP_implicit_value: return "DW_OP_implicit_value";
case DW_OP_stack_value: return "DW_OP_stack_value";
- case DW_OP_lo_user: return "DW_OP_lo_user";
- case DW_OP_hi_user: return "DW_OP_hi_user";
- // DWARF5 Fission Proposal Op Extensions
+ // GNU thread-local storage
+ case DW_OP_GNU_push_tls_address: return "DW_OP_GNU_push_tls_address";
+
+ // DWARF5 Fission Proposal Op Extensions
case DW_OP_GNU_addr_index: return "DW_OP_GNU_addr_index";
case DW_OP_GNU_const_index: return "DW_OP_GNU_const_index";
}
@@ -723,3 +726,51 @@ const char *llvm::dwarf::CallFrameString(unsigned Encoding) {
}
return 0;
}
+
+const char *llvm::dwarf::AtomTypeString(unsigned AT) {
+ switch (AT) {
+ case dwarf::DW_ATOM_null:
+ return "DW_ATOM_null";
+ case dwarf::DW_ATOM_die_offset:
+ return "DW_ATOM_die_offset";
+ case DW_ATOM_cu_offset:
+ return "DW_ATOM_cu_offset";
+ case DW_ATOM_die_tag:
+ return "DW_ATOM_die_tag";
+ case DW_ATOM_type_flags:
+ return "DW_ATOM_type_flags";
+ }
+ return 0;
+}
+
+const char *llvm::dwarf::GDBIndexEntryKindString(GDBIndexEntryKind Kind) {
+ switch (Kind) {
+ case GIEK_NONE:
+ return "NONE";
+ case GIEK_TYPE:
+ return "TYPE";
+ case GIEK_VARIABLE:
+ return "VARIABLE";
+ case GIEK_FUNCTION:
+ return "FUNCTION";
+ case GIEK_OTHER:
+ return "OTHER";
+ case GIEK_UNUSED5:
+ return "UNUSED5";
+ case GIEK_UNUSED6:
+ return "UNUSED6";
+ case GIEK_UNUSED7:
+ return "UNUSED7";
+ }
+ llvm_unreachable("Unknown GDBIndexEntryKind value");
+}
+
+const char *llvm::dwarf::GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage) {
+ switch (Linkage) {
+ case GIEL_EXTERNAL:
+ return "EXTERNAL";
+ case GIEL_STATIC:
+ return "STATIC";
+ }
+ llvm_unreachable("Unknown GDBIndexEntryLinkage value");
+}
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index f14cb45..a825c68 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -14,39 +14,22 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Mutex.h"
+#include "llvm-c/Support.h"
#include <cstdio>
#include <cstring>
// Collection of symbol name/value pairs to be searched prior to any libraries.
-static llvm::StringMap<void *> *ExplicitSymbols = 0;
-
-namespace {
-
-struct ExplicitSymbolsDeleter {
- ~ExplicitSymbolsDeleter() {
- delete ExplicitSymbols;
- }
-};
-
-}
-
-static ExplicitSymbolsDeleter Dummy;
-
-
-static llvm::sys::SmartMutex<true>& getMutex() {
- static llvm::sys::SmartMutex<true> HandlesMutex;
- return HandlesMutex;
-}
+static llvm::ManagedStatic<llvm::StringMap<void *> > ExplicitSymbols;
+static llvm::ManagedStatic<llvm::sys::SmartMutex<true> > SymbolsMutex;
void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName,
void *symbolValue) {
- SmartScopedLock<true> lock(getMutex());
- if (ExplicitSymbols == 0)
- ExplicitSymbols = new StringMap<void*>();
+ SmartScopedLock<true> lock(*SymbolsMutex);
(*ExplicitSymbols)[symbolName] = symbolValue;
}
@@ -72,7 +55,7 @@ static DenseSet<void *> *OpenedHandles = 0;
DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
std::string *errMsg) {
- SmartScopedLock<true> lock(getMutex());
+ SmartScopedLock<true> lock(*SymbolsMutex);
void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL);
if (handle == 0) {
@@ -126,10 +109,10 @@ void *SearchForAddressOfSpecialSymbol(const char* symbolName);
}
void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
- SmartScopedLock<true> Lock(getMutex());
+ SmartScopedLock<true> Lock(*SymbolsMutex);
// First check symbols added via AddSymbol().
- if (ExplicitSymbols) {
+ if (ExplicitSymbols.isConstructed()) {
StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
if (i != ExplicitSymbols->end())
@@ -187,3 +170,11 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
}
#endif // LLVM_ON_WIN32
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMLoadLibraryPermanently(const char* Filename) {
+ return llvm::sys::DynamicLibrary::LoadLibraryPermanently(Filename);
+}
diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp
index ed17f60..1eefa3e 100644
--- a/lib/Support/Errno.cpp
+++ b/lib/Support/Errno.cpp
@@ -39,28 +39,27 @@ std::string StrError(int errnum) {
char buffer[MaxErrStrLen];
buffer[0] = '\0';
std::string str;
+ if (errnum == 0)
+ return str;
+
#ifdef HAVE_STRERROR_R
// strerror_r is thread-safe.
- if (errnum)
-# if defined(__GLIBC__) && defined(_GNU_SOURCE)
- // glibc defines its own incompatible version of strerror_r
- // which may not use the buffer supplied.
- str = strerror_r(errnum,buffer,MaxErrStrLen-1);
-# else
- strerror_r(errnum,buffer,MaxErrStrLen-1);
- str = buffer;
-# endif
+#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+ // glibc defines its own incompatible version of strerror_r
+ // which may not use the buffer supplied.
+ str = strerror_r(errnum, buffer, MaxErrStrLen - 1);
+#else
+ strerror_r(errnum, buffer, MaxErrStrLen - 1);
+ str = buffer;
+#endif
#elif HAVE_DECL_STRERROR_S // "Windows Secure API"
- if (errnum) {
- strerror_s(buffer, MaxErrStrLen - 1, errnum);
- str = buffer;
- }
+ strerror_s(buffer, MaxErrStrLen - 1, errnum);
+ str = buffer;
#elif defined(HAVE_STRERROR)
// Copy the thread un-safe result of strerror into
// the buffer as fast as possible to minimize impact
// of collision of strerror in multiple threads.
- if (errnum)
- str = strerror(errnum);
+ str = strerror(errnum);
#else
// Strange that this system doesn't even have strerror
// but, oh well, just use a generic message
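A small usage sketch of the unchanged public interface, now that the errnum == 0 early return is hoisted out of the per-platform branches:

#include "llvm/Support/Errno.h"
#include "llvm/Support/raw_ostream.h"
#include <cerrno>

static void reportOpenFailure(const char *Path) {
  // StrError(0) returns an empty string on every platform.
  llvm::errs() << Path << ": " << llvm::sys::StrError(errno) << "\n";
}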
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index 9425445..1eafb96 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -20,6 +20,7 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
#include <cassert>
#include <cstdlib>
@@ -102,3 +103,19 @@ void llvm::llvm_unreachable_internal(const char *msg, const char *file,
LLVM_BUILTIN_UNREACHABLE;
#endif
}
+
+static void bindingsErrorHandler(void *user_data, const std::string& reason,
+ bool gen_crash_diag) {
+ LLVMFatalErrorHandler handler =
+ LLVM_EXTENSION reinterpret_cast<LLVMFatalErrorHandler>(user_data);
+ handler(reason.c_str());
+}
+
+void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler) {
+ install_fatal_error_handler(bindingsErrorHandler,
+ LLVM_EXTENSION reinterpret_cast<void *>(Handler));
+}
+
+void LLVMResetFatalErrorHandler() {
+ remove_fatal_error_handler();
+}
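A hypothetical C-API client of the new hooks; per the bindingsErrorHandler trampoline above, the handler receives just the reason string:

#include "llvm-c/Core.h"
#include <stdio.h>
#include <stdlib.h>

static void MyFatalHandler(const char *Reason) {
  fprintf(stderr, "LLVM fatal error: %s\n", Reason);
  exit(1);
}

int main(void) {
  LLVMInstallFatalErrorHandler(MyFatalHandler);
  /* ... use the C API ... */
  LLVMResetFatalErrorHandler();
  return 0;
}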
diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
index 7a9400d..85be415 100644
--- a/lib/Support/GraphWriter.cpp
+++ b/lib/Support/GraphWriter.cpp
@@ -219,5 +219,8 @@ void llvm::DisplayGraph(StringRef FilenameRef, bool wait,
errs() << "Running 'dotty' program... ";
if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg))
return;
+#else
+ (void)Filename;
+ (void)ErrMsg;
#endif
}
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 90e4389..6e9a5c9 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -52,8 +52,54 @@ using namespace llvm;
/// GetX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
/// specified arguments. If we can't run cpuid on the host, return true.
-static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
- unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX) {
+#if defined(__GNUC__) || defined(__clang__)
+ #if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+ // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+ #elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+// pedantic #else returns to appease -Wunreachable-code (so we don't generate
+// postprocessed code that looks like "return true; return false;")
+ #else
+ return true;
+ #endif
+#elif defined(_MSC_VER)
+ // The MSVC intrinsic is portable across x86 and x64.
+ int registers[4];
+ __cpuid(registers, value);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+#else
+ return true;
+#endif
+}
+
+/// GetX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
+/// 4 values in the specified arguments. If we can't run cpuid on the host,
+/// return true.
+bool GetX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+ unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
#if defined(__GNUC__)
// gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
@@ -64,16 +110,22 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
"=S" (*rEBX),
"=c" (*rECX),
"=d" (*rEDX)
- : "a" (value));
+ : "a" (value),
+ "c" (subleaf));
return false;
#elif defined(_MSC_VER)
- int registers[4];
- __cpuid(registers, value);
- *rEAX = registers[0];
- *rEBX = registers[1];
- *rECX = registers[2];
- *rEDX = registers[3];
- return false;
+ // __cpuidex was added in MSVC++ 9.0 SP1
+ #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
+ int registers[4];
+ __cpuidex(registers, value, subleaf);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
+ return false;
+ #else
+ return true;
+ #endif
#else
return true;
#endif
@@ -86,11 +138,13 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
"=S" (*rEBX),
"=c" (*rECX),
"=d" (*rEDX)
- : "a" (value));
+ : "a" (value),
+ "c" (subleaf));
return false;
#elif defined(_MSC_VER)
__asm {
mov eax,value
+ mov ecx,subleaf
cpuid
mov esi,rEAX
mov dword ptr [esi],eax
@@ -102,8 +156,6 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
mov dword ptr [esi],edx
}
return false;
-// pedantic #else returns to appease -Wunreachable-code (so we don't generate
-// postprocessed code that looks like "return true; return false;")
#else
return true;
#endif
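The subleaf-aware query reduced to a self-contained feature probe (assumptions: GCC/Clang inline asm on x86-64 only; the function name is made up). Bit 5 of EBX from leaf 7, subleaf 0 is the AVX2 flag that getHostCPUName tests below; the real check additionally requires OS support for saving AVX state:

#include <stdint.h>

static bool hostHasAVX2() {
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
  unsigned EAX, EBX, ECX, EDX;
  // Maximum supported basic leaf comes back in EAX of leaf 0. The rbx
  // save/restore mirrors the asm above, since gcc may use rbx for PIC.
  __asm__("movq\t%%rbx, %%rsi\n\t"
          "cpuid\n\t"
          "xchgq\t%%rbx, %%rsi\n\t"
          : "=a"(EAX), "=S"(EBX), "=c"(ECX), "=d"(EDX)
          : "a"(0u));
  if (EAX < 7)
    return false;
  __asm__("movq\t%%rbx, %%rsi\n\t"
          "cpuid\n\t"
          "xchgq\t%%rbx, %%rsi\n\t"
          : "=a"(EAX), "=S"(EBX), "=c"(ECX), "=d"(EDX)
          : "a"(7u), "c"(0u));
  return (EBX >> 5) & 1;
#else
  return false;
#endif
}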
@@ -148,6 +200,14 @@ std::string sys::getHostCPUName() {
unsigned Model = 0;
DetectX86FamilyModel(EAX, Family, Model);
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+
+ unsigned MaxLeaf = EAX;
bool HasSSE3 = (ECX & 0x1);
bool HasSSE41 = (ECX & 0x80000);
// If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
@@ -155,15 +215,12 @@ std::string sys::getHostCPUName() {
// switch, then we have full AVX support.
const unsigned AVXBits = (1 << 27) | (1 << 28);
bool HasAVX = ((ECX & AVXBits) == AVXBits) && OSHasAVXSupport();
+ bool HasAVX2 = HasAVX && MaxLeaf >= 0x7 &&
+ !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX) &&
+ (EBX & 0x20);
GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
bool Em64T = (EDX >> 29) & 0x1;
- union {
- unsigned u[3];
- char c[12];
- } text;
-
- GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
if (memcmp(text.c, "GenuineIntel", 12) == 0) {
switch (Family) {
case 3:
@@ -271,10 +328,20 @@ std::string sys::getHostCPUName() {
// Ivy Bridge:
case 58:
+ case 62: // Ivy Bridge EP
// Not all Ivy Bridge processors support AVX (such as the Pentium
// versions instead of the i7 versions).
return HasAVX ? "core-avx-i" : "corei7";
+ // Haswell:
+ case 60:
+ case 63:
+ case 69:
+ case 70:
+ // Like Ivy Bridge, not all Haswell processors support AVX (such as the
+ // Pentium versions instead of the i7 versions).
+ return HasAVX2 ? "core-avx2" : "corei7";
+
case 28: // Most 45 nm Intel Atom processors
case 38: // 45 nm Atom Lincroft
case 39: // 32 nm Atom Medfield
@@ -282,6 +349,12 @@ std::string sys::getHostCPUName() {
case 54: // 32 nm Atom Midview
return "atom";
+ // Atom Silvermont codes from the Intel software optimization guide.
+ case 55:
+ case 74:
+ case 77:
+ return "slm";
+
default: return (Em64T) ? "x86-64" : "i686";
}
case 15: {
@@ -359,9 +432,11 @@ std::string sys::getHostCPUName() {
case 21:
if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
return "btver1";
- if (Model > 15 && Model <= 31)
- return "bdver2";
- return "bdver1";
+ if (Model >= 0x30)
+ return "bdver3"; // 30h-3Fh: Steamroller
+ if (Model >= 0x10)
+ return "bdver2"; // 10h-1Fh: Piledriver
+ return "bdver1"; // 00h-0Fh: Bulldozer
case 22:
if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
return "btver1";
@@ -546,6 +621,48 @@ std::string sys::getHostCPUName() {
return "generic";
}
+#elif defined(__linux__) && defined(__s390x__)
+std::string sys::getHostCPUName() {
+ // STIDP is a privileged operation, so use /proc/cpuinfo instead.
+ // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+ // memory buffer because the 'file' has 0 size (it can only be read as a
+ // stream).
+
+ std::string Err;
+ DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+ if (!DS) {
+ DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+ return "generic";
+ }
+
+ // The "processor 0:" line comes after a fair amount of other information,
+ // including a cache breakdown, but this should be plenty.
+ char buffer[2048];
+ size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+ delete DS;
+
+ StringRef Str(buffer, CPUInfoSize);
+ SmallVector<StringRef, 32> Lines;
+ Str.split(Lines, "\n");
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("processor ")) {
+ size_t Pos = Lines[I].find("machine = ");
+ if (Pos != StringRef::npos) {
+ Pos += sizeof("machine = ") - 1;
+ unsigned int Id;
+ if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
+ if (Id >= 2827)
+ return "zEC12";
+ if (Id >= 2817)
+ return "z196";
+ }
+ }
+ break;
+ }
+ }
+
+ return "generic";
+}
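The same machine-number lookup written against plain iostreams, assuming the "processor 0: ... machine = NNNN ..." line layout (the patch itself goes through DataStreamer because /proc/cpuinfo reports a zero size):

#include <cstdlib>
#include <fstream>
#include <string>

static std::string s390xHostCPU() {
  std::ifstream In("/proc/cpuinfo");
  std::string Line;
  while (std::getline(In, Line)) {
    if (Line.compare(0, 10, "processor ") != 0)
      continue;
    std::string::size_type Pos = Line.find("machine = ");
    if (Pos == std::string::npos)
      break;
    unsigned Id = std::strtoul(Line.c_str() + Pos + 10, 0, 10);
    if (Id >= 2827) return "zEC12";
    if (Id >= 2817) return "z196";
    break;
  }
  return "generic";
}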
#else
std::string sys::getHostCPUName() {
return "generic";
diff --git a/lib/Support/Locale.cpp b/lib/Support/Locale.cpp
index 17b9b6c..35ddf7f 100644
--- a/lib/Support/Locale.cpp
+++ b/lib/Support/Locale.cpp
@@ -1,10 +1,31 @@
#include "llvm/Support/Locale.h"
-#include "llvm/Config/config.h"
+#include "llvm/Support/Unicode.h"
-#ifdef __APPLE__
-#include "LocaleXlocale.inc"
-#elif LLVM_ON_WIN32
-#include "LocaleWindows.inc"
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef Text) {
+#if LLVM_ON_WIN32
+ return Text.size();
#else
-#include "LocaleGeneric.inc"
+ return llvm::sys::unicode::columnWidthUTF8(Text);
#endif
+}
+
+bool isPrint(int UCS) {
+#if LLVM_ON_WIN32
+ // Restrict characters that we'll try to print to the lower part of ASCII,
+ // excluding the control characters (i.e. 0x20 - 0x7E). In general one cannot
+ // reliably output code points U+0080 and higher using narrow character C/C++
+ // output functions in Windows, because the meaning of the upper 128 codes is
+ // determined by the active code page in the console.
+ return ' ' <= UCS && UCS <= '~';
+#else
+ return llvm::sys::unicode::isPrintable(UCS);
+#endif
+}
+
+} // namespace locale
+} // namespace sys
+} // namespace llvm
diff --git a/lib/Support/LocaleGeneric.inc b/lib/Support/LocaleGeneric.inc
deleted file mode 100644
index 3a939b8..0000000
--- a/lib/Support/LocaleGeneric.inc
+++ /dev/null
@@ -1,382 +0,0 @@
-//===- llvm/Support/LocaleGeneric.inc - Locale-dependent stuff -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements llvm::sys::locale::columnWidth and
-// llvm::sys::locale::isPrint functions for UTF-8 locales.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/ConvertUTF.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/UnicodeCharRanges.h"
-
-namespace llvm {
-namespace sys {
-namespace locale {
-
-enum ColumnWidthErrors {
- ErrorInvalidUTF8 = -2,
- ErrorNonPrintableCharacter = -1
-};
-
-/// Determines if a character is likely to be displayed correctly on the
-/// terminal. Exact implementation would have to depend on the specific
-/// terminal, so we define the semantic that should be suitable for generic case
-/// of a terminal capable to output Unicode characters.
-/// All characters from the Unicode codepoint range are considered printable
-/// except for:
-/// * C0 and C1 control character ranges;
-/// * default ignorable code points as per 5.21 of
-/// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
-/// * format characters (category = Cf);
-/// * surrogates (category = Cs);
-/// * unassigned characters (category = Cn).
-/// \return true if the character is considered printable.
-bool isPrint(int UCS) {
- // Sorted list of non-overlapping intervals of code points that are not
- // supposed to be printable.
- static const UnicodeCharRange NonPrintableRanges[] = {
- { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x00AD, 0x00AD },
- { 0x034F, 0x034F }, { 0x0378, 0x0379 }, { 0x037F, 0x0383 },
- { 0x038B, 0x038B }, { 0x038D, 0x038D }, { 0x03A2, 0x03A2 },
- { 0x0528, 0x0530 }, { 0x0557, 0x0558 }, { 0x0560, 0x0560 },
- { 0x0588, 0x0588 }, { 0x058B, 0x058E }, { 0x0590, 0x0590 },
- { 0x05C8, 0x05CF }, { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 },
- { 0x061C, 0x061D }, { 0x06DD, 0x06DD }, { 0x070E, 0x070F },
- { 0x074B, 0x074C }, { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF },
- { 0x082E, 0x082F }, { 0x083F, 0x083F }, { 0x085C, 0x085D },
- { 0x085F, 0x089F }, { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 },
- { 0x08FF, 0x08FF }, { 0x0978, 0x0978 }, { 0x0980, 0x0980 },
- { 0x0984, 0x0984 }, { 0x098D, 0x098E }, { 0x0991, 0x0992 },
- { 0x09A9, 0x09A9 }, { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 },
- { 0x09BA, 0x09BB }, { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA },
- { 0x09CF, 0x09D6 }, { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE },
- { 0x09E4, 0x09E5 }, { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 },
- { 0x0A0B, 0x0A0E }, { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 },
- { 0x0A31, 0x0A31 }, { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 },
- { 0x0A3A, 0x0A3B }, { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 },
- { 0x0A49, 0x0A4A }, { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 },
- { 0x0A5D, 0x0A5D }, { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 },
- { 0x0A84, 0x0A84 }, { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 },
- { 0x0AA9, 0x0AA9 }, { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 },
- { 0x0ABA, 0x0ABB }, { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA },
- { 0x0ACE, 0x0ACF }, { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 },
- { 0x0AF2, 0x0B00 }, { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E },
- { 0x0B11, 0x0B12 }, { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 },
- { 0x0B34, 0x0B34 }, { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 },
- { 0x0B49, 0x0B4A }, { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B },
- { 0x0B5E, 0x0B5E }, { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 },
- { 0x0B84, 0x0B84 }, { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 },
- { 0x0B96, 0x0B98 }, { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D },
- { 0x0BA0, 0x0BA2 }, { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD },
- { 0x0BBA, 0x0BBD }, { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 },
- { 0x0BCE, 0x0BCF }, { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 },
- { 0x0BFB, 0x0C00 }, { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D },
- { 0x0C11, 0x0C11 }, { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 },
- { 0x0C3A, 0x0C3C }, { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 },
- { 0x0C4E, 0x0C54 }, { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F },
- { 0x0C64, 0x0C65 }, { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 },
- { 0x0C84, 0x0C84 }, { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 },
- { 0x0CA9, 0x0CA9 }, { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB },
- { 0x0CC5, 0x0CC5 }, { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 },
- { 0x0CD7, 0x0CDD }, { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 },
- { 0x0CF0, 0x0CF0 }, { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 },
- { 0x0D0D, 0x0D0D }, { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C },
- { 0x0D45, 0x0D45 }, { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 },
- { 0x0D58, 0x0D5F }, { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 },
- { 0x0D80, 0x0D81 }, { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 },
- { 0x0DB2, 0x0DB2 }, { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF },
- { 0x0DC7, 0x0DC9 }, { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 },
- { 0x0DD7, 0x0DD7 }, { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 },
- { 0x0E3B, 0x0E3E }, { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 },
- { 0x0E85, 0x0E86 }, { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C },
- { 0x0E8E, 0x0E93 }, { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 },
- { 0x0EA4, 0x0EA4 }, { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 },
- { 0x0EAC, 0x0EAC }, { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF },
- { 0x0EC5, 0x0EC5 }, { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF },
- { 0x0EDA, 0x0EDB }, { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 },
- { 0x0F6D, 0x0F70 }, { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD },
- { 0x0FCD, 0x0FCD }, { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 },
- { 0x10C8, 0x10CC }, { 0x10CE, 0x10CF }, { 0x115F, 0x1160 },
- { 0x1249, 0x1249 }, { 0x124E, 0x124F }, { 0x1257, 0x1257 },
- { 0x1259, 0x1259 }, { 0x125E, 0x125F }, { 0x1289, 0x1289 },
- { 0x128E, 0x128F }, { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 },
- { 0x12BF, 0x12BF }, { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 },
- { 0x12D7, 0x12D7 }, { 0x1311, 0x1311 }, { 0x1316, 0x1317 },
- { 0x135B, 0x135C }, { 0x137D, 0x137F }, { 0x139A, 0x139F },
- { 0x13F5, 0x13FF }, { 0x169D, 0x169F }, { 0x16F1, 0x16FF },
- { 0x170D, 0x170D }, { 0x1715, 0x171F }, { 0x1737, 0x173F },
- { 0x1754, 0x175F }, { 0x176D, 0x176D }, { 0x1771, 0x1771 },
- { 0x1774, 0x177F }, { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF },
- { 0x17EA, 0x17EF }, { 0x17FA, 0x17FF }, { 0x180B, 0x180D },
- { 0x180F, 0x180F }, { 0x181A, 0x181F }, { 0x1878, 0x187F },
- { 0x18AB, 0x18AF }, { 0x18F6, 0x18FF }, { 0x191D, 0x191F },
- { 0x192C, 0x192F }, { 0x193C, 0x193F }, { 0x1941, 0x1943 },
- { 0x196E, 0x196F }, { 0x1975, 0x197F }, { 0x19AC, 0x19AF },
- { 0x19CA, 0x19CF }, { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D },
- { 0x1A5F, 0x1A5F }, { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F },
- { 0x1A9A, 0x1A9F }, { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F },
- { 0x1B7D, 0x1B7F }, { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A },
- { 0x1C4A, 0x1C4C }, { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF },
- { 0x1CF7, 0x1CFF }, { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 },
- { 0x1F1E, 0x1F1F }, { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F },
- { 0x1F58, 0x1F58 }, { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C },
- { 0x1F5E, 0x1F5E }, { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 },
- { 0x1FC5, 0x1FC5 }, { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC },
- { 0x1FF0, 0x1FF1 }, { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF },
- { 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x2060, 0x206F },
- { 0x2072, 0x2073 }, { 0x208F, 0x208F }, { 0x209D, 0x209F },
- { 0x20BB, 0x20CF }, { 0x20F1, 0x20FF }, { 0x218A, 0x218F },
- { 0x23F4, 0x23FF }, { 0x2427, 0x243F }, { 0x244B, 0x245F },
- { 0x2700, 0x2700 }, { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF },
- { 0x2C2F, 0x2C2F }, { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 },
- { 0x2D26, 0x2D26 }, { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F },
- { 0x2D68, 0x2D6E }, { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F },
- { 0x2DA7, 0x2DA7 }, { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 },
- { 0x2DBF, 0x2DBF }, { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF },
- { 0x2DD7, 0x2DD7 }, { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F },
- { 0x2E9A, 0x2E9A }, { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF },
- { 0x2FFC, 0x2FFF }, { 0x3040, 0x3040 }, { 0x3097, 0x3098 },
- { 0x3100, 0x3104 }, { 0x312E, 0x3130 }, { 0x3164, 0x3164 },
- { 0x318F, 0x318F }, { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF },
- { 0x321F, 0x321F }, { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF },
- { 0x9FCD, 0x9FFF }, { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF },
- { 0xA62C, 0xA63F }, { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF },
- { 0xA78F, 0xA78F }, { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 },
- { 0xA82C, 0xA82F }, { 0xA83A, 0xA83F }, { 0xA878, 0xA87F },
- { 0xA8C5, 0xA8CD }, { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF },
- { 0xA954, 0xA95E }, { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE },
- { 0xA9DA, 0xA9DD }, { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F },
- { 0xAA4E, 0xAA4F }, { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F },
- { 0xAAC3, 0xAADA }, { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 },
- { 0xAB0F, 0xAB10 }, { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 },
- { 0xAB2F, 0xABBF }, { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF },
- { 0xD7A4, 0xD7AF }, { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF },
- { 0xFA6E, 0xFA6F }, { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 },
- { 0xFB18, 0xFB1C }, { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D },
- { 0xFB3F, 0xFB3F }, { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 },
- { 0xFBC2, 0xFBD2 }, { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 },
- { 0xFDC8, 0xFDEF }, { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F },
- { 0xFE27, 0xFE2F }, { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 },
- { 0xFE6C, 0xFE6F }, { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF },
- { 0xFF00, 0xFF00 }, { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 },
- { 0xFFC8, 0xFFC9 }, { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 },
- { 0xFFDD, 0xFFDF }, { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB },
- { 0xFFFE, 0xFFFF }, { 0x1000C, 0x1000C }, { 0x10027, 0x10027 },
- { 0x1003B, 0x1003B }, { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F },
- { 0x1005E, 0x1007F }, { 0x100FB, 0x100FF }, { 0x10103, 0x10106 },
- { 0x10134, 0x10136 }, { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF },
- { 0x101FE, 0x1027F }, { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF },
- { 0x1031F, 0x1031F }, { 0x10324, 0x1032F }, { 0x1034B, 0x1037F },
- { 0x1039E, 0x1039E }, { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF },
- { 0x1049E, 0x1049F }, { 0x104AA, 0x107FF }, { 0x10806, 0x10807 },
- { 0x10809, 0x10809 }, { 0x10836, 0x10836 }, { 0x10839, 0x1083B },
- { 0x1083D, 0x1083E }, { 0x10856, 0x10856 }, { 0x10860, 0x108FF },
- { 0x1091C, 0x1091E }, { 0x1093A, 0x1093E }, { 0x10940, 0x1097F },
- { 0x109B8, 0x109BD }, { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 },
- { 0x10A07, 0x10A0B }, { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 },
- { 0x10A34, 0x10A37 }, { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F },
- { 0x10A59, 0x10A5F }, { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 },
- { 0x10B56, 0x10B57 }, { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF },
- { 0x10C49, 0x10E5F }, { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 },
- { 0x11070, 0x1107F }, { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF },
- { 0x110E9, 0x110EF }, { 0x110FA, 0x110FF }, { 0x11135, 0x11135 },
- { 0x11144, 0x1117F }, { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F },
- { 0x116B8, 0x116BF }, { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF },
- { 0x12463, 0x1246F }, { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF },
- { 0x16A39, 0x16EFF }, { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E },
- { 0x16FA0, 0x1AFFF }, { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF },
- { 0x1D127, 0x1D128 }, { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF },
- { 0x1D246, 0x1D2FF }, { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF },
- { 0x1D455, 0x1D455 }, { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 },
- { 0x1D4A3, 0x1D4A4 }, { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD },
- { 0x1D4BA, 0x1D4BA }, { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 },
- { 0x1D506, 0x1D506 }, { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 },
- { 0x1D51D, 0x1D51D }, { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F },
- { 0x1D545, 0x1D545 }, { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 },
- { 0x1D6A6, 0x1D6A7 }, { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF },
- { 0x1EE04, 0x1EE04 }, { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 },
- { 0x1EE25, 0x1EE26 }, { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 },
- { 0x1EE38, 0x1EE38 }, { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 },
- { 0x1EE43, 0x1EE46 }, { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A },
- { 0x1EE4C, 0x1EE4C }, { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 },
- { 0x1EE55, 0x1EE56 }, { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A },
- { 0x1EE5C, 0x1EE5C }, { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 },
- { 0x1EE63, 0x1EE63 }, { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B },
- { 0x1EE73, 0x1EE73 }, { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D },
- { 0x1EE7F, 0x1EE7F }, { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 },
- { 0x1EEA4, 0x1EEA4 }, { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF },
- { 0x1EEF2, 0x1EFFF }, { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F },
- { 0x1F0AF, 0x1F0B0 }, { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 },
- { 0x1F0E0, 0x1F0FF }, { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F },
- { 0x1F16C, 0x1F16F }, { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F },
- { 0x1F23B, 0x1F23F }, { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF },
- { 0x1F321, 0x1F32F }, { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F },
- { 0x1F394, 0x1F39F }, { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF },
- { 0x1F3F1, 0x1F3FF }, { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 },
- { 0x1F4F8, 0x1F4F8 }, { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F },
- { 0x1F544, 0x1F54F }, { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 },
- { 0x1F650, 0x1F67F }, { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF },
- { 0x2A6D7, 0x2A6FF }, { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF },
- { 0x2FA1E, 0xF0000 }, { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }
- };
-
- return UCS >= 0 && UCS <= 0x10FFFF && !isCharInSet(UCS, NonPrintableRanges);
-}
-
-/// Gets the number of positions a character is likely to occupy when output
-/// on a terminal ("character width"). This depends on the implementation of the
-/// terminal, and there's no standard definition of character width.
-/// The implementation defines it in a way that is expected to be compatible
-/// with a generic Unicode-capable terminal.
-/// \return Character width:
-/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as
-/// identified by isPrint);
-/// * 0 for non-spacing and enclosing combining marks;
-/// * 2 for CJK characters excluding halfwidth forms;
-/// * 1 for all remaining characters.
-static inline int charWidth(int UCS)
-{
- if (!isPrint(UCS))
- return ErrorNonPrintableCharacter;
-
- // Sorted list of non-spacing and enclosing combining mark intervals as
- // defined in "3.6 Combination" of
- // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
- static const UnicodeCharRange CombiningCharacters[] = {
- { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
- { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
- { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F },
- { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 },
- { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 },
- { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 },
- { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
- { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE },
- { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C },
- { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 },
- { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },
- { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },
- { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },
- { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 },
- { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 },
- { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
- { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
- { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
- { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 },
- { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
- { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
- { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC },
- { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
- { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
- { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
- { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
- { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
- { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
- { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
- { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
- { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
- { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
- { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
- { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
- { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
- { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
- { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
- { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
- { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
- { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
- { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
- { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
- { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 },
- { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
- { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 },
- { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB },
- { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED },
- { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 },
- { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 },
- { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 },
- { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 },
- { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
- { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
- { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
- { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
- { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D },
- { 0xA947, 0xA951 }, { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 },
- { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E },
- { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
- { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 },
- { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 },
- { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 },
- { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E },
- { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD },
- { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
- { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 },
- { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 },
- { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B },
- { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE },
- { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
- { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 },
- { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
- { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF },
- };
-
- if (isCharInSet(UCS, CombiningCharacters))
- return 0;
-
- static const UnicodeCharRange DoubleWidthCharacters[] = {
- // Hangul Jamo
- { 0x1100, 0x11FF },
- // Deprecated fullwidth angle brackets
- { 0x2329, 0x232A },
- // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
- // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
- { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
- // Hangul
- { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
- // CJK Unified Ideographs
- { 0xF900, 0xFAFF },
- // Vertical forms
- { 0xFE10, 0xFE19 },
- // CJK Compatibility Forms + Small Form Variants
- { 0xFE30, 0xFE6F },
- // Fullwidth forms
- { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
- // CJK Unified Ideographs
- { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
- };
-
- if (isCharInSet(UCS, DoubleWidthCharacters))
- return 2;
- return 1;
-}
-
-int columnWidth(StringRef Text) {
- unsigned ColumnWidth = 0;
- unsigned Length;
- for (size_t i = 0, e = Text.size(); i < e; i += Length) {
- Length = getNumBytesForUTF8(Text[i]);
- if (Length <= 0 || i + Length > Text.size())
- return ErrorInvalidUTF8;
- UTF32 buf[1];
- const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i);
- UTF32 *Target = &buf[0];
- if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target,
- Target + 1, strictConversion))
- return ErrorInvalidUTF8;
- int Width = charWidth(buf[0]);
- if (Width < 0)
- return ErrorNonPrintableCharacter;
- ColumnWidth += Width;
- }
- return ColumnWidth;
-}
-
-}
-}
-}
diff --git a/lib/Support/LocaleWindows.inc b/lib/Support/LocaleWindows.inc
deleted file mode 100644
index 28e429c..0000000
--- a/lib/Support/LocaleWindows.inc
+++ /dev/null
@@ -1,15 +0,0 @@
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
- return s.size();
-}
-
-bool isPrint(int c) {
- return ' ' <= c && c <= '~';
-}
-
-}
-}
-}
diff --git a/lib/Support/LocaleXlocale.inc b/lib/Support/LocaleXlocale.inc
deleted file mode 100644
index 389fe3d..0000000
--- a/lib/Support/LocaleXlocale.inc
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ManagedStatic.h"
-#include <cassert>
-#include <xlocale.h>
-
-
-namespace {
- struct locale_holder {
- locale_holder()
- : l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
- {
- assert(NULL!=l);
- }
- ~locale_holder() {
- freelocale(l);
- }
-
- int mbswidth(llvm::SmallString<16> s) const {
- // this implementation assumes no '\0' in s
- assert(s.size()==strlen(s.c_str()));
-
- size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
- assert(size!=(size_t)-1);
- if (size==0)
- return 0;
- llvm::SmallVector<wchar_t,200> ws(size);
- size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
- assert(ws.size()==size);
- return wcswidth_l(&ws[0],ws.size(),l);
- }
-
- int isprint(int c) const {
- return iswprint_l(c,l);
- }
-
- private:
-
- locale_t l;
- };
-
- llvm::ManagedStatic<locale_holder> l;
-}
-
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
- int width = l->mbswidth(s);
- assert(width>=0);
- return width;
-}
-
-bool isPrint(int c) {
- return l->isprint(c);
-}
-
-}
-}
-}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index cab45c7..dcd5529 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -177,7 +177,7 @@ error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename,
//===----------------------------------------------------------------------===//
namespace {
-/// \brief Memorry maps a file descriptor using sys::fs::mapped_file_region.
+/// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.
///
/// This handles converting the offset into a legal offset on the platform.
class MemoryBufferMMapFile : public MemoryBuffer {
@@ -217,7 +217,7 @@ public:
};
}
-static error_code getMemoryBufferForStream(int FD,
+static error_code getMemoryBufferForStream(int FD,
StringRef BufferName,
OwningPtr<MemoryBuffer> &result) {
const ssize_t ChunkSize = 4096*4;
@@ -238,14 +238,19 @@ static error_code getMemoryBufferForStream(int FD,
return error_code::success();
}
-error_code MemoryBuffer::getFile(StringRef Filename,
+static error_code getFileAux(const char *Filename,
+ OwningPtr<MemoryBuffer> &result, int64_t FileSize,
+ bool RequiresNullTerminator);
+
+error_code MemoryBuffer::getFile(Twine Filename,
OwningPtr<MemoryBuffer> &result,
int64_t FileSize,
bool RequiresNullTerminator) {
// Ensure the path is null terminated.
- SmallString<256> PathBuf(Filename.begin(), Filename.end());
- return MemoryBuffer::getFile(PathBuf.c_str(), result, FileSize,
- RequiresNullTerminator);
+ SmallString<256> PathBuf;
+ StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf);
+ return getFileAux(NullTerminatedName.data(), result, FileSize,
+ RequiresNullTerminator);
}
static error_code getOpenFileImpl(int FD, const char *Filename,
@@ -253,10 +258,9 @@ static error_code getOpenFileImpl(int FD, const char *Filename,
uint64_t FileSize, uint64_t MapSize,
int64_t Offset, bool RequiresNullTerminator);
-error_code MemoryBuffer::getFile(const char *Filename,
- OwningPtr<MemoryBuffer> &result,
- int64_t FileSize,
- bool RequiresNullTerminator) {
+static error_code getFileAux(const char *Filename,
+ OwningPtr<MemoryBuffer> &result, int64_t FileSize,
+ bool RequiresNullTerminator) {
int FD;
error_code EC = sys::fs::openFileForRead(Filename, FD);
if (EC)
@@ -276,7 +280,7 @@ static bool shouldUseMmap(int FD,
int PageSize) {
// We don't use mmap for small files because this can severely fragment our
// address space.
- if (MapSize < 4096*4)
+ if (MapSize < 4 * 4096 || MapSize < (unsigned)PageSize)
return false;
if (!RequiresNullTerminator)
@@ -302,6 +306,15 @@ static bool shouldUseMmap(int FD,
if (End != FileSize)
return false;
+#if defined(_WIN32) || defined(__CYGWIN__)
+ // Don't peek at the next page if the file size is a multiple of the
+ // *physical* page size (4k) but not a multiple of the AllocationGranularity
+ // (64k), when a null terminator is required.
+ // FIXME: It's not good to hardcode 4096 here. dwPageSize shows 4096.
+ if ((FileSize & (4096 - 1)) == 0)
+ return false;
+#endif
+
// Don't try to map files that are exactly a multiple of the system page size
// if we need a null terminator.
if ((FileSize & (PageSize -1)) == 0)
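
The hunk above tightens the heuristic for when a file is worth memory-mapping: very small mappings, and mappings that would end exactly on a page boundary when a null terminator is needed, fall back to a plain read. A minimal standalone sketch of that decision follows; the function name and parameters are illustrative, not the exact LLVM signature.

    #include <cstdint>

    // Decide whether to mmap a file or read it into an allocated buffer.
    // Sketch only: thresholds mirror the patch above (at least 4 pages, and
    // no mapping that ends exactly on a page boundary when a terminating
    // NUL must be readable one byte past the end).
    static bool shouldUseMmapSketch(uint64_t FileSize, uint64_t MapSize,
                                    unsigned PageSize,
                                    bool RequiresNullTerminator) {
      // Small mappings fragment the address space for little benefit.
      if (MapSize < 4 * 4096 || MapSize < PageSize)
        return false;

      if (!RequiresNullTerminator)
        return true;

      // If the file ends exactly on a page boundary, the byte just past the
      // mapping is not guaranteed to be readable (or zero), so mmap cannot
      // be relied on to provide the NUL terminator.
      if ((FileSize & (PageSize - 1)) == 0)
        return false;

      return true;
    }
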
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index cfd9ed6..c869b30 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -77,7 +77,7 @@ namespace {
return path.substr(0, 1);
// * {file,directory}name
- size_t end = path.find_first_of(separators, 2);
+ size_t end = path.find_first_of(separators);
return path.substr(0, end);
}
@@ -449,23 +449,18 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
}
void native(const Twine &path, SmallVectorImpl<char> &result) {
+ assert((!path.isSingleStringRef() ||
+ path.getSingleStringRef().data() != result.data()) &&
+ "path and result are not allowed to overlap!");
// Clear result.
result.clear();
-#ifdef LLVM_ON_WIN32
- SmallString<128> path_storage;
- StringRef p = path.toStringRef(path_storage);
- result.reserve(p.size());
- for (StringRef::const_iterator i = p.begin(),
- e = p.end();
- i != e;
- ++i) {
- if (*i == '/')
- result.push_back('\\');
- else
- result.push_back(*i);
- }
-#else
path.toVector(result);
+ native(result);
+}
+
+void native(SmallVectorImpl<char> &path) {
+#ifdef LLVM_ON_WIN32
+ std::replace(path.begin(), path.end(), '/', '\\');
#endif
}
@@ -852,6 +847,21 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
if (Magic.size() < 4)
return file_magic::unknown;
switch ((unsigned char)Magic[0]) {
+ case 0x00: {
+ // COFF short import library file
+ if (Magic[1] == (char)0x00 && Magic[2] == (char)0xff &&
+ Magic[3] == (char)0xff)
+ return file_magic::coff_import_library;
+ // Windows resource file
+ const char Expected[] = { 0, 0, 0, 0, '\x20', 0, 0, 0, '\xff' };
+ if (Magic.size() >= sizeof(Expected) &&
+ memcmp(Magic.data(), Expected, sizeof(Expected)) == 0)
+ return file_magic::windows_resource;
+ // 0x0000 = COFF unknown machine type
+ if (Magic[1] == 0)
+ return file_magic::coff_object;
+ break;
+ }
   case 0xDE:  // 0x0B17C0DE = BC wrapper
if (Magic[1] == (char)0xC0 && Magic[2] == (char)0x17 &&
Magic[3] == (char)0x0B)
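
The new `case 0x00` above distinguishes three formats that all begin with a zero byte. A small standalone sketch of the same sniffing logic over a raw byte buffer; the enum names here are illustrative, not LLVM's `file_magic` values.

    #include <cstddef>
    #include <cstring>

    enum class Kind { Unknown, CoffImportLibrary, WindowsResource, CoffObject };

    // Classify a buffer whose first byte is 0x00, following the patch above.
    static Kind sniffZeroPrefixed(const unsigned char *Buf, size_t Len) {
      if (Len < 4 || Buf[0] != 0x00)
        return Kind::Unknown;
      // COFF short import library: 00 00 FF FF.
      if (Buf[1] == 0x00 && Buf[2] == 0xff && Buf[3] == 0xff)
        return Kind::CoffImportLibrary;
      // Windows .res file: starts with a "null resource" header
      // 00 00 00 00 20 00 00 00 FF ...
      static const unsigned char Res[] = {0, 0, 0, 0, 0x20, 0, 0, 0, 0xff};
      if (Len >= sizeof(Res) && std::memcmp(Buf, Res, sizeof(Res)) == 0)
        return Kind::WindowsResource;
      // 0x0000 is the COFF "unknown machine" type, so treat it as a COFF object.
      if (Buf[1] == 0x00)
        return Kind::CoffObject;
      return Kind::Unknown;
    }
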
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 23ee5ab..722f4ca 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -15,10 +15,12 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Config/config.h" // Get autoconf configuration settings
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/ThreadLocal.h"
#include "llvm/Support/Watchdog.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
#ifdef HAVE_CRASHREPORTERCLIENT_H
#include <CrashReporterClient.h>
@@ -26,12 +28,7 @@
using namespace llvm;
-namespace llvm {
- bool DisablePrettyStackTrace = false;
-}
-
-// FIXME: This should be thread local when llvm supports threads.
-static sys::ThreadLocal<const PrettyStackTraceEntry> PrettyStackTraceHead;
+static ManagedStatic<sys::ThreadLocal<const PrettyStackTraceEntry> > PrettyStackTraceHead;
static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
unsigned NextID = 0;
@@ -49,12 +46,12 @@ static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
/// PrintCurStackTrace - Print the current stack trace to the specified stream.
static void PrintCurStackTrace(raw_ostream &OS) {
// Don't print an empty trace.
- if (PrettyStackTraceHead.get() == 0) return;
+ if (PrettyStackTraceHead->get() == 0) return;
// If there are pretty stack frames registered, walk and emit them.
OS << "Stack dump:\n";
- PrintStack(PrettyStackTraceHead.get(), OS);
+ PrintStack(PrettyStackTraceHead->get(), OS);
OS.flush();
}
@@ -102,26 +99,28 @@ static void CrashHandler(void *) {
#endif
}
-static bool RegisterCrashPrinter() {
- if (!DisablePrettyStackTrace)
- sys::AddSignalHandler(CrashHandler, 0);
- return false;
-}
-
PrettyStackTraceEntry::PrettyStackTraceEntry() {
- // The first time this is called, we register the crash printer.
- static bool HandlerRegistered = RegisterCrashPrinter();
- (void)HandlerRegistered;
-
// Link ourselves.
- NextEntry = PrettyStackTraceHead.get();
- PrettyStackTraceHead.set(this);
+ NextEntry = PrettyStackTraceHead->get();
+ PrettyStackTraceHead->set(this);
}
PrettyStackTraceEntry::~PrettyStackTraceEntry() {
- assert(PrettyStackTraceHead.get() == this &&
+ // Do nothing if PrettyStackTraceHead is uninitialized. This can only happen
+ // if a shutdown occurred after we created the PrettyStackTraceEntry. That
+ // does occur in the following idiom:
+ //
+ // PrettyStackTraceProgram X(...);
+ // llvm_shutdown_obj Y;
+ //
+ // Without this check, we may end up removing ourselves from the stack trace
+ // after PrettyStackTraceHead has already been destroyed.
+ if (!PrettyStackTraceHead.isConstructed())
+ return;
+
+ assert(PrettyStackTraceHead->get() == this &&
"Pretty stack trace entry destruction is out of order");
- PrettyStackTraceHead.set(getNextEntry());
+ PrettyStackTraceHead->set(getNextEntry());
}
void PrettyStackTraceString::print(raw_ostream &OS) const {
@@ -135,3 +134,18 @@ void PrettyStackTraceProgram::print(raw_ostream &OS) const {
OS << ArgV[i] << ' ';
OS << '\n';
}
+
+static bool RegisterCrashPrinter() {
+ sys::AddSignalHandler(CrashHandler, 0);
+ return false;
+}
+
+void llvm::EnablePrettyStackTrace() {
+ // The first time this is called, we register the crash printer.
+ static bool HandlerRegistered = RegisterCrashPrinter();
+ (void)HandlerRegistered;
+}
+
+void LLVMEnablePrettyStackTrace() {
+ EnablePrettyStackTrace();
+}
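
The patch above replaces a plain static head pointer with a ManagedStatic thread-local and moves crash-printer registration behind an explicit EnablePrettyStackTrace() call. The underlying idea is an intrusive, per-thread stack of scope entries that a crash handler can walk. A minimal sketch of that idea using plain C++11 thread_local (not LLVM's ManagedStatic/ThreadLocal machinery):

    #include <cstdio>

    class ScopedTraceEntry {
      const char *Msg;
      ScopedTraceEntry *Next;          // link to the entry pushed before us
      static thread_local ScopedTraceEntry *Head;

    public:
      explicit ScopedTraceEntry(const char *M) : Msg(M), Next(Head) {
        Head = this;                   // push in the constructor
      }
      ~ScopedTraceEntry() {
        Head = Next;                   // pop in the destructor (LIFO order)
      }
      // Called from a crash handler: walk and print the current stack.
      static void dump() {
        for (ScopedTraceEntry *E = Head; E; E = E->Next)
          std::fprintf(stderr, "\t%s\n", E->Msg);
      }
    };

    thread_local ScopedTraceEntry *ScopedTraceEntry::Head = nullptr;

    int main() {
      ScopedTraceEntry A("parsing input");
      {
        ScopedTraceEntry B("lowering function 'f'");
        ScopedTraceEntry::dump();      // prints both entries, innermost first
      }
      ScopedTraceEntry::dump();        // only "parsing input" remains
    }
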
diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp
index 2c0d37b..d5168f0 100644
--- a/lib/Support/Process.cpp
+++ b/lib/Support/Process.cpp
@@ -80,6 +80,24 @@ TimeValue self_process::get_wall_time() const {
#endif
+#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
+
+#define ALLCOLORS(FGBG,BOLD) {\
+ COLOR(FGBG, "0", BOLD),\
+ COLOR(FGBG, "1", BOLD),\
+ COLOR(FGBG, "2", BOLD),\
+ COLOR(FGBG, "3", BOLD),\
+ COLOR(FGBG, "4", BOLD),\
+ COLOR(FGBG, "5", BOLD),\
+ COLOR(FGBG, "6", BOLD),\
+ COLOR(FGBG, "7", BOLD)\
+ }
+
+static const char colorcodes[2][2][8][10] = {
+ { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
+ { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
+};
+
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Process.inc"
diff --git a/lib/Support/Program.cpp b/lib/Support/Program.cpp
index 79f7e5f..83f2ec4 100644
--- a/lib/Support/Program.cpp
+++ b/lib/Support/Program.cpp
@@ -22,30 +22,40 @@ using namespace sys;
//=== independent code.
//===----------------------------------------------------------------------===//
-static bool Execute(void **Data, StringRef Program, const char **args,
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
const char **env, const StringRef **Redirects,
unsigned memoryLimit, std::string *ErrMsg);
-static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
- std::string *ErrMsg);
-
int sys::ExecuteAndWait(StringRef Program, const char **args, const char **envp,
const StringRef **redirects, unsigned secondsToWait,
unsigned memoryLimit, std::string *ErrMsg,
bool *ExecutionFailed) {
- void *Data = 0;
- if (Execute(&Data, Program, args, envp, redirects, memoryLimit, ErrMsg)) {
- if (ExecutionFailed) *ExecutionFailed = false;
- return Wait(Data, Program, secondsToWait, ErrMsg);
+ ProcessInfo PI;
+ if (Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg)) {
+ if (ExecutionFailed)
+ *ExecutionFailed = false;
+ ProcessInfo Result = Wait(PI, secondsToWait, true, ErrMsg);
+ return Result.ReturnCode;
}
- if (ExecutionFailed) *ExecutionFailed = true;
+
+ if (ExecutionFailed)
+ *ExecutionFailed = true;
+
return -1;
}
-void sys::ExecuteNoWait(StringRef Program, const char **args, const char **envp,
- const StringRef **redirects, unsigned memoryLimit,
- std::string *ErrMsg) {
- Execute(/*Data*/ 0, Program, args, envp, redirects, memoryLimit, ErrMsg);
+ProcessInfo sys::ExecuteNoWait(StringRef Program, const char **args,
+ const char **envp, const StringRef **redirects,
+ unsigned memoryLimit, std::string *ErrMsg,
+ bool *ExecutionFailed) {
+ ProcessInfo PI;
+ if (ExecutionFailed)
+ *ExecutionFailed = false;
+ if (!Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg))
+ if (ExecutionFailed)
+ *ExecutionFailed = true;
+
+ return PI;
}
// Include the platform-specific parts of this class.
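
With the change above, both the blocking and the non-blocking launch paths report results through a ProcessInfo value instead of an opaque void pointer. A short usage sketch of the blocking path, with the signature taken from this diff (header name and error handling are illustrative):

    #include "llvm/Support/Program.h"
    #include <string>

    // Run "Tool --version", waiting at most 10 seconds for it to finish.
    int runTool(const char *Tool) {
      const char *Args[] = {Tool, "--version", nullptr};
      std::string Err;
      bool Failed = false;
      int RC = llvm::sys::ExecuteAndWait(Tool, Args, /*envp=*/nullptr,
                                         /*redirects=*/nullptr,
                                         /*secondsToWait=*/10,
                                         /*memoryLimit=*/0, &Err, &Failed);
      if (Failed)
        return -1; // could not even start the process; Err has the reason
      return RC;   // the child's exit code (or -1/-2 on wait errors/timeouts)
    }
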
diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp
index dec967e..5413641 100644
--- a/lib/Support/Regex.cpp
+++ b/lib/Support/Regex.cpp
@@ -43,7 +43,7 @@ bool Regex::isValid(std::string &Error) {
size_t len = llvm_regerror(error, preg, NULL, 0);
- Error.resize(len);
+ Error.resize(len - 1);
llvm_regerror(error, preg, &Error[0], len);
return false;
}
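
The one-line fix above accounts for llvm_regerror() reporting the required buffer size *including* the terminating NUL, so the std::string should hold one character fewer. The same two-call sizing pattern with the POSIX regerror(3), as a sketch:

    #include <regex.h>
    #include <string>

    // Build an error string for a failed regcomp()/regexec() call.
    std::string regexErrorString(int Error, const regex_t *Preg) {
      size_t Len = regerror(Error, Preg, nullptr, 0); // length includes the NUL
      if (Len == 0)
        return std::string();
      std::string Msg(Len, '\0');
      regerror(Error, Preg, &Msg[0], Len);
      Msg.resize(Len - 1);                            // drop the embedded NUL
      return Msg;
    }
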
diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
index f0fed77..dd417b4 100644
--- a/lib/Support/SmallPtrSet.cpp
+++ b/lib/Support/SmallPtrSet.cpp
@@ -202,8 +202,13 @@ void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) {
} else if (CurArraySize != RHS.CurArraySize) {
if (isSmall())
CurArray = (const void**)malloc(sizeof(void*) * RHS.CurArraySize);
- else
- CurArray = (const void**)realloc(CurArray, sizeof(void*)*RHS.CurArraySize);
+ else {
+ const void **T = (const void**)realloc(CurArray,
+ sizeof(void*) * RHS.CurArraySize);
+ if (!T)
+ free(CurArray);
+ CurArray = T;
+ }
assert(CurArray && "Failed to allocate memory?");
}
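
The change above is the standard fix for the classic `p = realloc(p, n)` leak: on failure realloc returns null and leaves the old block allocated, so assigning the result straight back loses the only pointer to it (the patch frees the old block because it is about to assert anyway). A minimal sketch of the more common form of the idiom, where the caller keeps the old block on failure:

    #include <cstdlib>

    // Grow an array of pointers without leaking the old block on failure.
    // Returns false (and leaves *Buf untouched) if allocation fails.
    static bool growPointerArray(const void ***Buf, size_t NewCapacity) {
      const void **T =
          (const void **)std::realloc(*Buf, sizeof(void *) * NewCapacity);
      if (!T)
        return false;   // *Buf is still valid and still owned by the caller
      *Buf = T;
      return true;
    }
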
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 51162dd..d4b94f8 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -211,7 +211,8 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
LineStr, ColRanges, FixIts);
}
-void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
+ SourceMgr::DiagKind Kind,
const Twine &Msg, ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts);
@@ -222,8 +223,6 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
return;
}
- raw_ostream &OS = errs();
-
if (Loc != SMLoc()) {
int CurBuf = FindBufferContainingLoc(Loc);
assert(CurBuf != -1 && "Invalid or unspecified location!");
@@ -233,6 +232,12 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
Diagnostic.print(0, OS, ShowColors);
}
+void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+ const Twine &Msg, ArrayRef<SMRange> Ranges,
+ ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+ PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+}
+
//===----------------------------------------------------------------------===//
// SMDiagnostic Implementation
//===----------------------------------------------------------------------===//
@@ -465,7 +470,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S,
if (FixItInsertionLine.empty())
return;
- for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i != e; ++i) {
+ for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
if (i >= LineContents.size() || LineContents[i] != '\t') {
S << FixItInsertionLine[i];
++OutCol;
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index d7a0bfa..bfae754 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -37,20 +37,39 @@ static bool ascii_isdigit(char x) {
return x >= '0' && x <= '9';
}
-/// compare_lower - Compare strings, ignoring case.
-int StringRef::compare_lower(StringRef RHS) const {
- for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
- unsigned char LHC = ascii_tolower(Data[I]);
- unsigned char RHC = ascii_tolower(RHS.Data[I]);
+// strncasecmp() is not available on non-POSIX systems, so define an
+// alternative function here.
+static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
+ for (size_t I = 0; I < Length; ++I) {
+ unsigned char LHC = ascii_tolower(LHS[I]);
+ unsigned char RHC = ascii_tolower(RHS[I]);
if (LHC != RHC)
return LHC < RHC ? -1 : 1;
}
+ return 0;
+}
+/// compare_lower - Compare strings, ignoring case.
+int StringRef::compare_lower(StringRef RHS) const {
+ if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length)))
+ return Res;
if (Length == RHS.Length)
return 0;
return Length < RHS.Length ? -1 : 1;
}
+/// Check if this string starts with the given \p Prefix, ignoring case.
+bool StringRef::startswith_lower(StringRef Prefix) const {
+ return Length >= Prefix.Length &&
+ ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
+}
+
+/// Check if this string ends with the given \p Suffix, ignoring case.
+bool StringRef::endswith_lower(StringRef Suffix) const {
+ return Length >= Suffix.Length &&
+ ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+}
+
/// compare_numeric - Compare strings, handle embedded numbers.
int StringRef::compare_numeric(StringRef RHS) const {
for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
@@ -85,7 +104,7 @@ int StringRef::compare_numeric(StringRef RHS) const {
// Compute the edit distance between the two given strings.
unsigned StringRef::edit_distance(llvm::StringRef Other,
bool AllowReplacements,
- unsigned MaxEditDistance) {
+ unsigned MaxEditDistance) const {
return llvm::ComputeEditDistance(
llvm::ArrayRef<char>(data(), size()),
llvm::ArrayRef<char>(Other.data(), Other.size()),
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index 9c81327..0c90c17 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -135,9 +135,9 @@ const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) {
return TheTarget;
}
-static int TargetArraySortFn(const void *LHS, const void *RHS) {
- typedef std::pair<StringRef, const Target*> pair_ty;
- return ((const pair_ty*)LHS)->first.compare(((const pair_ty*)RHS)->first);
+static int TargetArraySortFn(const std::pair<StringRef, const Target *> *LHS,
+ const std::pair<StringRef, const Target *> *RHS) {
+ return LHS->first.compare(RHS->first);
}
void TargetRegistry::printRegisteredTargetsForVersion() {
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp
index 0587aae..868b6ea 100644
--- a/lib/Support/ThreadLocal.cpp
+++ b/lib/Support/ThreadLocal.cpp
@@ -23,7 +23,7 @@
// Define all methods as no-ops if threading is explicitly disabled
namespace llvm {
using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::ThreadLocalImpl() : data() { }
ThreadLocalImpl::~ThreadLocalImpl() { }
void ThreadLocalImpl::setInstance(const void* d) {
typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1];
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index d0d0e14..6c978a0 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -221,7 +221,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Cases("i386", "i486", "i586", "i686", Triple::x86)
// FIXME: Do we need to support these?
.Cases("i786", "i886", "i986", Triple::x86)
- .Cases("amd64", "x86_64", Triple::x86_64)
+ .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64)
.Case("powerpc", Triple::ppc)
.Cases("powerpc64", "ppu", Triple::ppc64)
.Case("powerpc64le", Triple::ppc64le)
diff --git a/lib/Support/Unicode.cpp b/lib/Support/Unicode.cpp
new file mode 100644
index 0000000..b719bd8
--- /dev/null
+++ b/lib/Support/Unicode.cpp
@@ -0,0 +1,367 @@
+//===- llvm/Support/Unicode.cpp - Unicode character properties -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions that allow querying certain properties of
+// Unicode characters.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Unicode.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/UnicodeCharRanges.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+
+bool isPrintable(int UCS) {
+ // Sorted list of non-overlapping intervals of code points that are not
+ // supposed to be printable.
+ static const UnicodeCharRange NonPrintableRanges[] = {
+ { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
+ { 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B },
+ { 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 },
+ { 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 },
+ { 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF },
+ { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D },
+ { 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C },
+ { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F },
+ { 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F },
+ { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF },
+ { 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 },
+ { 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 },
+ { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB },
+ { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 },
+ { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 },
+ { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E },
+ { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 },
+ { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B },
+ { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A },
+ { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D },
+ { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 },
+ { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 },
+ { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB },
+ { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF },
+ { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 },
+ { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 },
+ { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 },
+ { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A },
+ { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E },
+ { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 },
+ { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 },
+ { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 },
+ { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD },
+ { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF },
+ { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 },
+ { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 0x0C11 },
+ { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C },
+ { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 },
+ { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 },
+ { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 },
+ { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 },
+ { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 },
+ { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD },
+ { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 },
+ { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D },
+ { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 },
+ { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F },
+ { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 },
+ { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 },
+ { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 },
+ { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 },
+ { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E },
+ { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 },
+ { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 },
+ { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 },
+ { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC },
+ { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 },
+ { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB },
+ { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 },
+ { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD },
+ { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC },
+ { 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 },
+ { 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 },
+ { 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F },
+ { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF },
+ { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 },
+ { 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C },
+ { 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF },
+ { 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D },
+ { 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F },
+ { 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F },
+ { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF },
+ { 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F },
+ { 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF },
+ { 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F },
+ { 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F },
+ { 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF },
+ { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F },
+ { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F },
+ { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F },
+ { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C },
+ { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF },
+ { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F },
+ { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 },
+ { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E },
+ { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 },
+ { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 },
+ { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F },
+ { 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 },
+ { 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF },
+ { 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF },
+ { 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 },
+ { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F },
+ { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 },
+ { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E },
+ { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 },
+ { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF },
+ { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 },
+ { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A },
+ { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF },
+ { 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 },
+ { 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F },
+ { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F },
+ { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF },
+ { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F },
+ { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F },
+ { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F },
+ { 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD },
+ { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E },
+ { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD },
+ { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F },
+ { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA },
+ { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 },
+ { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF },
+ { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF },
+ { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F },
+ { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C },
+ { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F },
+ { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 },
+ { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF },
+ { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F },
+ { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F },
+ { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 },
+ { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 },
+ { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF },
+ { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF },
+ { 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B },
+ { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F },
+ { 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 },
+ { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F },
+ { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F },
+ { 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E },
+ { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F },
+ { 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 },
+ { 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E },
+ { 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E },
+ { 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD },
+ { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B },
+ { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 },
+ { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F },
+ { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 },
+ { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F },
+ { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F },
+ { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF },
+ { 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F },
+ { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF },
+ { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F },
+ { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF },
+ { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF },
+ { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 },
+ { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF },
+ { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 },
+ { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 },
+ { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA },
+ { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 },
+ { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D },
+ { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 },
+ { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 },
+ { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 },
+ { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 },
+ { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 },
+ { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 },
+ { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C },
+ { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 },
+ { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C },
+ { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 },
+ { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 },
+ { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F },
+ { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 },
+ { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF },
+ { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 },
+ { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF },
+ { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F },
+ { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F },
+ { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F },
+ { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F },
+ { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF },
+ { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 },
+ { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F },
+ { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F },
+ { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF },
+ { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 },
+ { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }
+ };
+ static const UnicodeCharSet NonPrintables(NonPrintableRanges);
+
+ return UCS >= 0 && UCS <= 0x10FFFF && !NonPrintables.contains(UCS);
+}
+
+/// Gets the number of positions a character is likely to occupy when output
+/// on a terminal ("character width"). This depends on the implementation of the
+/// terminal, and there's no standard definition of character width.
+/// The implementation defines it in a way that is expected to be compatible
+/// with a generic Unicode-capable terminal.
+/// \return Character width:
+/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as
+/// identified by isPrintable);
+/// * 0 for non-spacing and enclosing combining marks;
+/// * 2 for CJK characters excluding halfwidth forms;
+/// * 1 for all remaining characters.
+static inline int charWidth(int UCS)
+{
+ if (!isPrintable(UCS))
+ return ErrorNonPrintableCharacter;
+
+ // Sorted list of non-spacing and enclosing combining mark intervals as
+ // defined in "3.6 Combination" of
+ // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
+ static const UnicodeCharRange CombiningCharacterRanges[] = {
+ { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
+ { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
+ { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F },
+ { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 },
+ { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 },
+ { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 },
+ { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
+ { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE },
+ { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C },
+ { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 },
+ { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },
+ { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },
+ { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },
+ { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 },
+ { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 },
+ { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
+ { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
+ { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
+ { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 },
+ { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
+ { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
+ { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC },
+ { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
+ { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
+ { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
+ { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
+ { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
+ { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
+ { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
+ { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
+ { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
+ { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
+ { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
+ { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
+ { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
+ { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
+ { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
+ { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
+ { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
+ { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
+ { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
+ { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
+ { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 },
+ { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
+ { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 },
+ { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB },
+ { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED },
+ { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 },
+ { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 },
+ { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 },
+ { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 },
+ { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
+ { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
+ { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
+ { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
+ { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D },
+ { 0xA947, 0xA951 }, { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 },
+ { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E },
+ { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
+ { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 },
+ { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 },
+ { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 },
+ { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E },
+ { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD },
+ { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
+ { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 },
+ { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 },
+ { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B },
+ { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE },
+ { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
+ { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 },
+ { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
+ { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF },
+ };
+ static const UnicodeCharSet CombiningCharacters(CombiningCharacterRanges);
+
+ if (CombiningCharacters.contains(UCS))
+ return 0;
+
+ static const UnicodeCharRange DoubleWidthCharacterRanges[] = {
+ // Hangul Jamo
+ { 0x1100, 0x11FF },
+ // Deprecated fullwidth angle brackets
+ { 0x2329, 0x232A },
+ // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
+ // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
+ { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
+ // Hangul
+ { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
+ // CJK Unified Ideographs
+ { 0xF900, 0xFAFF },
+ // Vertical forms
+ { 0xFE10, 0xFE19 },
+ // CJK Compatibility Forms + Small Form Variants
+ { 0xFE30, 0xFE6F },
+ // Fullwidth forms
+ { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
+ // CJK Unified Ideographs
+ { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
+ };
+ static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges);
+
+ if (DoubleWidthCharacters.contains(UCS))
+ return 2;
+ return 1;
+}
+
+int columnWidthUTF8(StringRef Text) {
+ unsigned ColumnWidth = 0;
+ unsigned Length;
+ for (size_t i = 0, e = Text.size(); i < e; i += Length) {
+ Length = getNumBytesForUTF8(Text[i]);
+ if (Length <= 0 || i + Length > Text.size())
+ return ErrorInvalidUTF8;
+ UTF32 buf[1];
+ const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i);
+ UTF32 *Target = &buf[0];
+ if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target,
+ Target + 1, strictConversion))
+ return ErrorInvalidUTF8;
+ int Width = charWidth(buf[0]);
+ if (Width < 0)
+ return ErrorNonPrintableCharacter;
+ ColumnWidth += Width;
+ }
+ return ColumnWidth;
+}
+
+} // namespace unicode
+} // namespace sys
+} // namespace llvm
+
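
Both tables in the new file are sorted, non-overlapping code point intervals, and UnicodeCharSet::contains reduces to a binary search over them. A standalone sketch of that lookup; the struct and function names below are illustrative, not LLVM's UnicodeCharRanges API.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    struct CharRange {
      uint32_t Lower; // inclusive
      uint32_t Upper; // inclusive
    };

    // True if Cp falls inside one of the sorted, non-overlapping Ranges.
    static bool rangesContain(const CharRange *Ranges, size_t N, uint32_t Cp) {
      // Find the first range whose upper bound is >= Cp ...
      const CharRange *It = std::lower_bound(
          Ranges, Ranges + N, Cp,
          [](const CharRange &R, uint32_t C) { return R.Upper < C; });
      // ... and check that its lower bound does not exceed Cp.
      return It != Ranges + N && It->Lower <= Cp;
    }

    int main() {
      static const CharRange Combining[] = {{0x0300, 0x036F}, {0x0483, 0x0489}};
      size_t N = sizeof(Combining) / sizeof(Combining[0]);
      return rangesContain(Combining, N, 0x0301) ? 0 : 1; // COMBINING ACUTE ACCENT
    }
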
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 4dcfa09..c9dc871 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -182,7 +182,7 @@ namespace sys {
namespace fs {
#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
- defined(__linux__) || defined(__CYGWIN__)
+ defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__)
static int
test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
const char *dir, const char *bin)
@@ -251,7 +251,8 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
return link_path;
}
#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
+ defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
+ defined(__FreeBSD_kernel__)
char exe_path[PATH_MAX];
if (getprogpath(exe_path, argv0) != NULL)
@@ -298,6 +299,18 @@ UniqueID file_status::getUniqueID() const {
}
error_code current_path(SmallVectorImpl<char> &result) {
+ result.clear();
+
+ const char *pwd = ::getenv("PWD");
+ llvm::sys::fs::file_status PWDStatus, DotStatus;
+ if (pwd && llvm::sys::path::is_absolute(pwd) &&
+ !llvm::sys::fs::status(pwd, PWDStatus) &&
+ !llvm::sys::fs::status(".", DotStatus) &&
+ PWDStatus.getUniqueID() == DotStatus.getUniqueID()) {
+ result.append(pwd, pwd + strlen(pwd));
+ return error_code::success();
+ }
+
#ifdef MAXPATHLEN
result.reserve(MAXPATHLEN);
#else
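
The added block prefers $PWD over getcwd() so that paths keep whatever symlinked spelling the user navigated through, but only after checking that $PWD really still names the current directory. The same check expressed directly with POSIX stat(2), as a sketch:

    #include <cstdlib>
    #include <string>
    #include <sys/stat.h>

    // Return $PWD if it is an absolute path naming the same directory as ".",
    // otherwise an empty string (the caller falls back to getcwd()).
    static std::string preferredCwd() {
      const char *Pwd = ::getenv("PWD");
      if (!Pwd || Pwd[0] != '/')
        return std::string();
      struct stat PwdStat, DotStat;
      if (::stat(Pwd, &PwdStat) != 0 || ::stat(".", &DotStat) != 0)
        return std::string();
      // Same device and inode => same directory, even through symlinks.
      if (PwdStat.st_dev == DotStat.st_dev && PwdStat.st_ino == DotStat.st_ino)
        return std::string(Pwd);
      return std::string();
    }
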
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 0a797f6..c5778e7 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -13,6 +13,7 @@
#include "Unix.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/MutexGuard.h"
#include "llvm/Support/TimeValue.h"
@@ -38,25 +39,6 @@
# include <termios.h>
#endif
-// See if we can use curses to detect information about a terminal when
-// connected to one.
-#ifdef HAVE_CURSES
-# if defined(HAVE_CURSES_H)
-# include <curses.h>
-# elif defined(HAVE_NCURSES_H)
-# include <ncurses.h>
-# elif defined(HAVE_NCURSESW_H)
-# include <ncursesw.h>
-# elif defined(HAVE_NCURSES_CURSES_H)
-# include <ncurses/curses.h>
-# elif defined(HAVE_NCURSESW_CURSES_H)
-# include <ncursesw/curses.h>
-# else
-# error Have a curses library but unable to find a curses header!
-# endif
-# include <term.h>
-#endif
-
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only generic UNIX code that
//=== is guaranteed to work on *all* UNIX variants.
@@ -107,13 +89,10 @@ TimeValue self_process::get_system_time() const {
return getRUsageTimes().second;
}
+// On Cygwin, getpagesize() returns 64k (the AllocationGranularity), and the
+// offset passed to mmap(3) must be aligned to that AllocationGranularity.
static unsigned getPageSize() {
-#if defined(__CYGWIN__)
- // On Cygwin, getpagesize() returns 64k but the page size for the purposes of
- // memory protection and mmap() is 4k.
- // See http://www.cygwin.com/ml/cygwin/2009-01/threads.html#00492
- const int page_size = 0x1000;
-#elif defined(HAVE_GETPAGESIZE)
+#if defined(HAVE_GETPAGESIZE)
const int page_size = ::getpagesize();
#elif defined(HAVE_SYSCONF)
long page_size = ::sysconf(_SC_PAGE_SIZE);
@@ -159,14 +138,6 @@ void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
llvm::tie(user_time, sys_time) = getRUsageTimes();
}
-int Process::GetCurrentUserId() {
- return getuid();
-}
-
-int Process::GetCurrentGroupId() {
- return getgid();
-}
-
#if defined(HAVE_MACH_MACH_H) && !defined(__GNU__)
#include <mach/mach.h>
#endif
@@ -211,6 +182,22 @@ void Process::PreventCoreFiles() {
#endif
}
+Optional<std::string> Process::GetEnv(StringRef Name) {
+ std::string NameStr = Name.str();
+ const char *Val = ::getenv(NameStr.c_str());
+ if (!Val)
+ return None;
+ return std::string(Val);
+}
+
+error_code Process::GetArgumentVector(SmallVectorImpl<const char *> &ArgsOut,
+ ArrayRef<const char *> ArgsIn,
+ SpecificBumpPtrAllocator<char> &) {
+ ArgsOut.append(ArgsIn.begin(), ArgsIn.end());
+
+ return error_code::success();
+}
+
bool Process::StandardInIsUserInput() {
return FileDescriptorIsDisplayed(STDIN_FILENO);
}
@@ -266,21 +253,50 @@ unsigned Process::StandardErrColumns() {
return getColumns(2);
}
+#ifdef HAVE_TERMINFO
+// We manually declare these extern functions because finding the correct
+// headers from various terminfo, curses, or other sources is harder than
+// writing their specs down.
+extern "C" int setupterm(char *term, int filedes, int *errret);
+extern "C" struct term *set_curterm(struct term *termp);
+extern "C" int del_curterm(struct term *termp);
+extern "C" int tigetnum(char *capname);
+#endif
+
static bool terminalHasColors(int fd) {
-#ifdef HAVE_CURSES
- // First, acquire a global lock because the curses C routines are thread
- // hostile.
+#ifdef HAVE_TERMINFO
+ // First, acquire a global lock because these C routines are thread hostile.
static sys::Mutex M;
MutexGuard G(M);
int errret = 0;
- if (setupterm((char *)0, fd, &errret) != OK)
+ if (setupterm((char *)0, fd, &errret) != 0)
// Regardless of why, if we can't get terminfo, we shouldn't try to print
// colors.
return false;
- // Test whether the terminal as set up supports color output.
- if (has_colors() == TRUE)
+ // Test whether the terminal as set up supports color output. How to do this
+ // isn't entirely obvious. We can use the curses routine 'has_colors' but it
+ // would be nice to avoid a dependency on curses proper when we can make do
+ // with a minimal terminfo parsing library. Also, we don't really care whether
+ // the terminal supports the curses-specific color changing routines, merely
+ // if it will interpret ANSI color escape codes in a reasonable way. Thus, the
+ // strategy here is just to query the baseline colors capability and if it
+ // supports colors at all to assume it will translate the escape codes into
+ // whatever range of colors it does support. We can add more detailed tests
+ // here if users report them as necessary.
+ //
+ // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
+ // the terminfo says that no colors are supported.
+ bool HasColors = tigetnum(const_cast<char *>("colors")) > 0;
+
+ // Now extract the structure allocated by setupterm and free its memory
+ // through a really silly dance.
+ struct term *termp = set_curterm((struct term *)0);
+ (void)del_curterm(termp); // Drop any errors here.
+
+  // Return true if we found a color capability for the current terminal.
+ if (HasColors)
return true;
#endif
@@ -302,29 +318,15 @@ bool Process::StandardErrHasColors() {
return FileDescriptorHasColors(STDERR_FILENO);
}
+void Process::UseANSIEscapeCodes(bool /*enable*/) {
+ // No effect.
+}
+
bool Process::ColorNeedsFlush() {
// No, we use ANSI escape sequences.
return false;
}
-#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
-
-#define ALLCOLORS(FGBG,BOLD) {\
- COLOR(FGBG, "0", BOLD),\
- COLOR(FGBG, "1", BOLD),\
- COLOR(FGBG, "2", BOLD),\
- COLOR(FGBG, "3", BOLD),\
- COLOR(FGBG, "4", BOLD),\
- COLOR(FGBG, "5", BOLD),\
- COLOR(FGBG, "6", BOLD),\
- COLOR(FGBG, "7", BOLD)\
- }
-
-static const char colorcodes[2][2][8][10] = {
- { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
- { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
-};
-
const char *Process::OutputColor(char code, bool bold, bool bg) {
return colorcodes[bg?1:0][bold?1:0][code&7];
}
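
The rewritten terminalHasColors() above drops the full curses dependency and asks terminfo directly for the baseline "colors" capability. A standalone sketch of that query using the same four entry points the patch declares by hand; it assumes a terminfo implementation is linked in (e.g. -ltinfo or -lcurses) and, unlike the patched code, takes no lock, so it is not thread-safe.

    #include <unistd.h>

    extern "C" int setupterm(char *term, int filedes, int *errret);
    extern "C" struct term *set_curterm(struct term *termp);
    extern "C" int del_curterm(struct term *termp);
    extern "C" int tigetnum(char *capname);

    // True if the terminal on FD advertises at least one color.
    static bool fdSupportsColors(int FD) {
      if (!isatty(FD))
        return false;
      int ErrRet = 0;
      if (setupterm(nullptr, FD, &ErrRet) != 0)
        return false;
      // tigetnum returns -2/-1 on error and may return 0 for "no colors".
      bool HasColors = tigetnum(const_cast<char *>("colors")) > 0;
      // Release the terminal description that setupterm allocated.
      del_curterm(set_curterm(nullptr));
      return HasColors;
    }
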
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index a93a912..78b2971 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -36,6 +36,9 @@
#include <unistd.h>
#endif
#ifdef HAVE_POSIX_SPAWN
+#ifdef __sun__
+#define _RESTRICT_KYWD
+#endif
#include <spawn.h>
#if !defined(__APPLE__)
extern char **environ;
@@ -47,6 +50,8 @@
namespace llvm {
using namespace sys;
+ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {}
+
// This function just uses the PATH environment variable to find the program.
std::string
sys::FindProgramByName(const std::string& progName) {
@@ -175,9 +180,16 @@ static void SetMemoryLimits (unsigned size)
}
-static bool Execute(void **Data, StringRef Program, const char **args,
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
const char **envp, const StringRef **redirects,
unsigned memoryLimit, std::string *ErrMsg) {
+ if (!llvm::sys::fs::exists(Program)) {
+ if (ErrMsg)
+ *ErrMsg = std::string("Executable \"") + Program.str() +
+ std::string("\" doesn't exist!");
+ return false;
+ }
+
// If this OS has posix_spawn and there is no memory limit being implied, use
// posix_spawn. It is more efficient than fork/exec.
#ifdef HAVE_POSIX_SPAWN
@@ -239,8 +251,8 @@ static bool Execute(void **Data, StringRef Program, const char **args,
if (Err)
return !MakeErrMsg(ErrMsg, "posix_spawn failed", Err);
- if (Data)
- *Data = reinterpret_cast<void*>(PID);
+ PI.Pid = PID;
+
return true;
}
#endif
@@ -303,56 +315,71 @@ static bool Execute(void **Data, StringRef Program, const char **args,
break;
}
- if (Data)
- *Data = reinterpret_cast<void*>(child);
+ PI.Pid = child;
return true;
}
-static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
- std::string *ErrMsg) {
+namespace llvm {
+
+ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+ bool WaitUntilTerminates, std::string *ErrMsg) {
#ifdef HAVE_SYS_WAIT_H
struct sigaction Act, Old;
- assert(Data && "invalid pid to wait on, process not started?");
-
- // Install a timeout handler. The handler itself does nothing, but the simple
- // fact of having a handler at all causes the wait below to return with EINTR,
- // unlike if we used SIG_IGN.
- if (secondsToWait) {
+ assert(PI.Pid && "invalid pid to wait on, process not started?");
+
+ int WaitPidOptions = 0;
+ pid_t ChildPid = PI.Pid;
+ if (WaitUntilTerminates) {
+ SecondsToWait = 0;
+ ChildPid = -1; // mimic a wait() using waitpid()
+ } else if (SecondsToWait) {
+ // Install a timeout handler. The handler itself does nothing, but the
+ // simple fact of having a handler at all causes the wait below to return
+ // with EINTR, unlike if we used SIG_IGN.
memset(&Act, 0, sizeof(Act));
Act.sa_handler = TimeOutHandler;
sigemptyset(&Act.sa_mask);
sigaction(SIGALRM, &Act, &Old);
- alarm(secondsToWait);
- }
+ alarm(SecondsToWait);
+ } else if (SecondsToWait == 0)
+ WaitPidOptions = WNOHANG;
// Parent process: Wait for the child process to terminate.
int status;
- uint64_t pid = reinterpret_cast<uint64_t>(Data);
- pid_t child = static_cast<pid_t>(pid);
- while (waitpid(pid, &status, 0) != child)
- if (secondsToWait && errno == EINTR) {
- // Kill the child.
- kill(child, SIGKILL);
-
- // Turn off the alarm and restore the signal handler
- alarm(0);
- sigaction(SIGALRM, &Old, 0);
-
- // Wait for child to die
- if (wait(&status) != child)
- MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
- else
- MakeErrMsg(ErrMsg, "Child timed out", 0);
-
- return -2; // Timeout detected
- } else if (errno != EINTR) {
- MakeErrMsg(ErrMsg, "Error waiting for child process");
- return -1;
+ ProcessInfo WaitResult;
+ WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions);
+ if (WaitResult.Pid != PI.Pid) {
+ if (WaitResult.Pid == 0) {
+ // Non-blocking wait.
+ return WaitResult;
+ } else {
+ if (SecondsToWait && errno == EINTR) {
+ // Kill the child.
+ kill(PI.Pid, SIGKILL);
+
+ // Turn off the alarm and restore the signal handler
+ alarm(0);
+ sigaction(SIGALRM, &Old, 0);
+
+ // Wait for child to die
+ if (wait(&status) != ChildPid)
+ MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
+ else
+ MakeErrMsg(ErrMsg, "Child timed out", 0);
+
+ WaitResult.ReturnCode = -2; // Timeout detected
+ return WaitResult;
+ } else if (errno != EINTR) {
+ MakeErrMsg(ErrMsg, "Error waiting for child process");
+ WaitResult.ReturnCode = -1;
+ return WaitResult;
+ }
}
+ }
// We exited normally without timeout, so turn off the timer.
- if (secondsToWait) {
+ if (SecondsToWait && !WaitUntilTerminates) {
alarm(0);
sigaction(SIGALRM, &Old, 0);
}
@@ -362,24 +389,19 @@ static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
int result = 0;
if (WIFEXITED(status)) {
result = WEXITSTATUS(status);
-#ifdef HAVE_POSIX_SPAWN
- // The posix_spawn child process returns 127 on any kind of error.
- // Following the POSIX convention for command-line tools (which posix_spawn
- // itself apparently does not), check to see if the failure was due to some
- // reason other than the file not existing, and return 126 in this case.
- bool Exists;
- if (result == 127 && !llvm::sys::fs::exists(Program, Exists) && Exists)
- result = 126;
-#endif
+ WaitResult.ReturnCode = result;
+
if (result == 127) {
if (ErrMsg)
*ErrMsg = llvm::sys::StrError(ENOENT);
- return -1;
+ WaitResult.ReturnCode = -1;
+ return WaitResult;
}
if (result == 126) {
if (ErrMsg)
*ErrMsg = "Program could not be executed";
- return -1;
+ WaitResult.ReturnCode = -1;
+ return WaitResult;
}
} else if (WIFSIGNALED(status)) {
if (ErrMsg) {
@@ -391,18 +413,16 @@ static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
}
// Return a special value to indicate that the process received an unhandled
// signal during execution as opposed to failing to execute.
- return -2;
+ WaitResult.ReturnCode = -2;
}
- return result;
#else
if (ErrMsg)
*ErrMsg = "Program::Wait is not implemented on this platform yet!";
- return -1;
+ WaitResult.ReturnCode = -2;
#endif
+ return WaitResult;
}
-namespace llvm {
-
error_code sys::ChangeStdinToBinary(){
// Do nothing, as Unix doesn't differentiate between text and binary.
return make_error_code(errc::success);
@@ -438,5 +458,4 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
}
return true;
}
-
}
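
The rewritten Wait() above covers three modes: block until the child exits, poll with WNOHANG, or a bounded wait implemented by installing a do-nothing SIGALRM handler so that waitpid() returns with EINTR when the alarm fires. A standalone sketch of just the bounded-wait idiom (simplified: any EINTR during the wait is treated as the timeout, as in the patch):

    #include <cstring>
    #include <errno.h>
    #include <signal.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void onAlarm(int) {} // exists only so waitpid() returns with EINTR

    // Wait up to Seconds for Child to exit; kill it on timeout.
    // Returns the exit status, or -2 if the timeout fired, or -1 on error.
    static int waitWithTimeout(pid_t Child, unsigned Seconds) {
      struct sigaction Act, Old;
      std::memset(&Act, 0, sizeof(Act));
      Act.sa_handler = onAlarm;
      sigemptyset(&Act.sa_mask);
      sigaction(SIGALRM, &Act, &Old);
      alarm(Seconds);

      int Status = 0;
      int Result = -1;
      for (;;) {
        pid_t Ret = waitpid(Child, &Status, 0);
        if (Ret == Child) {
          Result = WIFEXITED(Status) ? WEXITSTATUS(Status) : -1;
          break;
        }
        if (Ret < 0 && errno == EINTR) { // the alarm fired: give up on the child
          kill(Child, SIGKILL);
          waitpid(Child, &Status, 0);    // reap the killed child
          Result = -2;
          break;
        }
        if (Ret < 0)                     // some other error; stop waiting
          break;
      }

      alarm(0);                          // cancel the timer, restore the handler
      sigaction(SIGALRM, &Old, nullptr);
      return Result;
    }
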
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 800a6a7..13ae862 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -333,7 +333,7 @@ static void PrintStackTraceSignalHandler(void *) {
void llvm::sys::PrintStackTraceOnErrorSignal() {
AddSignalHandler(PrintStackTraceSignalHandler, 0);
-#if defined(__APPLE__) && !defined(ANDROID)
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
// Environment variable to disable any kind of crash dialog.
if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
mach_port_t self = mach_task_self();
@@ -359,7 +359,7 @@ void llvm::sys::PrintStackTraceOnErrorSignal() {
// the same linkage unit by just defining our own versions of the assert handler
// and abort.
-#if defined(__APPLE__) && !defined(ANDROID)
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
#include <signal.h>
#include <pthread.h>
diff --git a/lib/Support/Unix/ThreadLocal.inc b/lib/Support/Unix/ThreadLocal.inc
index 2b4c901..f14d0fa 100644
--- a/lib/Support/Unix/ThreadLocal.inc
+++ b/lib/Support/Unix/ThreadLocal.inc
@@ -18,7 +18,7 @@
namespace llvm {
using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::ThreadLocalImpl() : data() { }
ThreadLocalImpl::~ThreadLocalImpl() { }
void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);}
const void* ThreadLocalImpl::getInstance() { return data; }
diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h
index dd11c04..ba688e3 100644
--- a/lib/Support/Unix/Unix.h
+++ b/lib/Support/Unix/Unix.h
@@ -47,6 +47,10 @@
# include <sys/wait.h>
#endif
+#ifdef HAVE_DLFCN_H
+# include <dlfcn.h>
+#endif
+
#ifndef WEXITSTATUS
# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
#endif
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 83da82a..5a7b219 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -71,7 +71,7 @@ extern "C" {
DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
std::string *errMsg) {
- SmartScopedLock<true> lock(getMutex());
+ SmartScopedLock<true> lock(*SymbolsMutex);
if (!filename) {
// When no file is specified, enumerate all DLLs and EXEs in the process.
@@ -83,8 +83,15 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
// This is mostly to ensure that the return value still shows up as "valid".
return DynamicLibrary(&OpenedHandles);
}
+
+ SmallVector<wchar_t, MAX_PATH> filenameUnicode;
+ if (error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) {
+ SetLastError(ec.value());
+ MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: ");
+ return DynamicLibrary();
+ }
- HMODULE a_handle = LoadLibrary(filename);
+ HMODULE a_handle = LoadLibraryW(filenameUnicode.data());
if (a_handle == 0) {
MakeErrMsg(errMsg, std::string(filename) + ": Can't open : ");
@@ -114,10 +121,10 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
#undef EXPLICIT_SYMBOL2
void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
- SmartScopedLock<true> Lock(getMutex());
+ SmartScopedLock<true> Lock(*SymbolsMutex);
// First check symbols added via AddSymbol().
- if (ExplicitSymbols) {
+ if (ExplicitSymbols.isConstructed()) {
StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
if (i != ExplicitSymbols->end())
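
The Windows hunks above route narrow strings through an explicit UTF-8 to UTF-16 conversion before calling the wide-character Win32 APIs, instead of the ANSI variants that depend on the active code page. A standalone sketch of that pattern with MultiByteToWideChar and LoadLibraryW, standing in for LLVM's windows::UTF8ToUTF16 helper:

    #include <windows.h>
    #include <string>
    #include <vector>

    // Load a DLL given a UTF-8 path, going through the wide-char API so that
    // non-ANSI file names work regardless of the active code page.
    static HMODULE loadLibraryUTF8(const std::string &Utf8Path) {
      int Len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
                                      Utf8Path.c_str(), -1, nullptr, 0);
      if (Len == 0)
        return nullptr;                  // invalid UTF-8 or other failure
      std::vector<wchar_t> Wide(Len);    // Len includes the terminating NUL
      if (::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, Utf8Path.c_str(),
                                -1, Wide.data(), Len) == 0)
        return nullptr;
      return ::LoadLibraryW(Wide.data());
    }
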
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index 4c5aebd..1260452 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -82,7 +82,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
NearBlock->size()
- : NULL;
+ : 0;
// If the requested address is not aligned to the allocation granularity,
// round up to get beyond NearBlock. VirtualAlloc would have rounded down.
@@ -106,7 +106,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
MemoryBlock Result;
Result.Address = PA;
Result.Size = NumBlocks*Granularity;
- ;
+
if (Flags & MF_EXEC)
Memory::InvalidateInstructionCache(Result.Address, Result.Size);
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 1694cb2..0b39198 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -37,70 +37,18 @@ typedef int errno_t;
using namespace llvm;
+using llvm::sys::windows::UTF8ToUTF16;
+using llvm::sys::windows::UTF16ToUTF8;
+
namespace {
typedef BOOLEAN (WINAPI *PtrCreateSymbolicLinkW)(
/*__in*/ LPCWSTR lpSymlinkFileName,
/*__in*/ LPCWSTR lpTargetFileName,
/*__in*/ DWORD dwFlags);
- PtrCreateSymbolicLinkW create_symbolic_link_api = PtrCreateSymbolicLinkW(
- ::GetProcAddress(::GetModuleHandleA("kernel32.dll"),
- "CreateSymbolicLinkW"));
-
- error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16) {
- int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
- utf8.begin(), utf8.size(),
- utf16.begin(), 0);
-
- if (len == 0)
- return windows_error(::GetLastError());
-
- utf16.reserve(len + 1);
- utf16.set_size(len);
-
- len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
- utf8.begin(), utf8.size(),
- utf16.begin(), utf16.size());
-
- if (len == 0)
- return windows_error(::GetLastError());
-
- // Make utf16 null terminated.
- utf16.push_back(0);
- utf16.pop_back();
-
- return error_code::success();
- }
-
- error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
- SmallVectorImpl<char> &utf8) {
- // Get length.
- int len = ::WideCharToMultiByte(CP_UTF8, 0,
- utf16, utf16_len,
- utf8.begin(), 0,
- NULL, NULL);
-
- if (len == 0)
- return windows_error(::GetLastError());
-
- utf8.reserve(len);
- utf8.set_size(len);
-
- // Now do the actual conversion.
- len = ::WideCharToMultiByte(CP_UTF8, 0,
- utf16, utf16_len,
- utf8.data(), utf8.size(),
- NULL, NULL);
-
- if (len == 0)
- return windows_error(::GetLastError());
-
- // Make utf8 null terminated.
- utf8.push_back(0);
- utf8.pop_back();
-
- return error_code::success();
- }
+ PtrCreateSymbolicLinkW create_symbolic_link_api =
+ PtrCreateSymbolicLinkW(::GetProcAddress(
+ ::GetModuleHandleW(L"Kernel32.dll"), "CreateSymbolicLinkW"));
error_code TempDir(SmallVectorImpl<wchar_t> &result) {
retry_temp_dir:
@@ -180,7 +128,7 @@ retry_random_path:
BYTE val = 0;
if (!::CryptGenRandom(CryptoProvider, 1, &val))
return windows_error(::GetLastError());
- random_path_utf16.push_back("0123456789abcdef"[val & 15]);
+ random_path_utf16.push_back(L"0123456789abcdef"[val & 15]);
}
else
random_path_utf16.push_back(*i);
@@ -268,9 +216,28 @@ namespace sys {
namespace fs {
std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
- char pathname[MAX_PATH];
- DWORD ret = ::GetModuleFileNameA(NULL, pathname, MAX_PATH);
- return ret != MAX_PATH ? pathname : "";
+ SmallVector<wchar_t, MAX_PATH> PathName;
+ DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.capacity());
+
+ // A zero return value indicates a failure other than insufficient space.
+ if (Size == 0)
+ return "";
+
+ // Insufficient space is determined by a return value equal to the size of
+ // the buffer passed in.
+ if (Size == PathName.capacity())
+ return "";
+
+ // On success, GetModuleFileNameW returns the number of characters written to
+ // the buffer not including the NULL terminator.
+ PathName.set_size(Size);
+
+ // Convert the result from UTF-16 to UTF-8.
+ SmallVector<char, MAX_PATH> PathNameUTF8;
+ if (UTF16ToUTF8(PathName.data(), PathName.size(), PathNameUTF8))
+ return "";
+
+ return std::string(PathNameUTF8.data());
}
UniqueID file_status::getUniqueID() const {
@@ -293,47 +260,25 @@ TimeValue file_status::getLastModificationTime() const {
}
error_code current_path(SmallVectorImpl<char> &result) {
- SmallVector<wchar_t, 128> cur_path;
- cur_path.reserve(128);
-retry_cur_dir:
- DWORD len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
-
- // A zero return value indicates a failure other than insufficient space.
- if (len == 0)
- return windows_error(::GetLastError());
+ SmallVector<wchar_t, MAX_PATH> cur_path;
+ DWORD len = MAX_PATH;
- // If there's insufficient space, the len returned is larger than the len
- // given.
- if (len > cur_path.capacity()) {
+ do {
cur_path.reserve(len);
- goto retry_cur_dir;
- }
-
- cur_path.set_size(len);
- // cur_path now holds the current directory in utf-16. Convert to utf-8.
+ len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
- // Find out how much space we need. Sadly, this function doesn't return the
- // size needed unless you tell it the result size is 0, which means you
- // _always_ have to call it twice.
- len = ::WideCharToMultiByte(CP_UTF8, 0,
- cur_path.data(), cur_path.size(),
- result.data(), 0,
- NULL, NULL);
-
- if (len == 0)
- return make_error_code(windows_error(::GetLastError()));
+ // A zero return value indicates a failure other than insufficient space.
+ if (len == 0)
+ return windows_error(::GetLastError());
- result.reserve(len);
- result.set_size(len);
- // Now do the actual conversion.
- len = ::WideCharToMultiByte(CP_UTF8, 0,
- cur_path.data(), cur_path.size(),
- result.data(), result.size(),
- NULL, NULL);
- if (len == 0)
- return windows_error(::GetLastError());
+ // If there's insufficient space, the len returned is larger than the len
+ // given.
+ } while (len > cur_path.capacity());
- return error_code::success();
+ // On success, GetCurrentDirectoryW returns the number of characters not
+ // including the null-terminator.
+ cur_path.set_size(len);
+ return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
}
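The rewritten current_path() above is the grow-and-retry idiom this patch adopts for Win32 calls that report a required buffer size. A minimal sketch of the pattern, assuming a hypothetical WinAPILikeCall with GetCurrentDirectoryW-style semantics (0 on failure, the required size when the buffer is too small, the count of characters written otherwise):

    SmallVector<wchar_t, MAX_PATH> Buf;
    DWORD Len = MAX_PATH;
    do {
      Buf.reserve(Len);                                  // grow to at least Len
      Len = WinAPILikeCall(Buf.data(), Buf.capacity());  // hypothetical call
      if (Len == 0)
        return windows_error(::GetLastError());          // real failure
    } while (Len > Buf.capacity());                      // too small; Len is the size needed
    Buf.set_size(Len);                                   // Len excludes the null terminator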
error_code create_directory(const Twine &path, bool &existed) {
@@ -746,12 +691,11 @@ error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
case priv: flprotect = PAGE_WRITECOPY; break;
}
- FileMappingHandle = ::CreateFileMapping(FileHandle,
- 0,
- flprotect,
- Size >> 32,
- Size & 0xffffffff,
- 0);
+ FileMappingHandle =
+ ::CreateFileMappingW(FileHandle, 0, flprotect,
+ (Offset + Size) >> 32,
+ (Offset + Size) & 0xffffffff,
+ 0);
if (FileMappingHandle == NULL) {
error_code ec = windows_error(GetLastError());
if (FileDescriptor) {
@@ -816,7 +760,7 @@ mapped_file_region::mapped_file_region(const Twine &path,
mapmode mode,
uint64_t length,
uint64_t offset,
- error_code &ec)
+ error_code &ec)
: Mode(mode)
, Size(length)
, Mapping()
@@ -1018,7 +962,7 @@ error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return error_code::success();
}
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
bool map_writable, void *&result) {
assert(0 && "NOT IMPLEMENTED");
return windows_error::invalid_function;
@@ -1078,7 +1022,7 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
DWORD CreationDisposition;
if (Flags & F_Excl)
CreationDisposition = CREATE_NEW;
- else if (Flags & F_Append)
+ else if (Flags & F_Append)
CreationDisposition = OPEN_ALWAYS;
else
CreationDisposition = CREATE_ALWAYS;
@@ -1115,7 +1059,64 @@ error_code openFileForWrite(const Twine &Name, int &ResultFD,
ResultFD = FD;
return error_code::success();
}
-
} // end namespace fs
+
+namespace windows {
+llvm::error_code UTF8ToUTF16(llvm::StringRef utf8,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+ utf8.begin(), utf8.size(),
+ utf16.begin(), 0);
+
+ if (len == 0)
+ return llvm::windows_error(::GetLastError());
+
+ utf16.reserve(len + 1);
+ utf16.set_size(len);
+
+ len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+ utf8.begin(), utf8.size(),
+ utf16.begin(), utf16.size());
+
+ if (len == 0)
+ return llvm::windows_error(::GetLastError());
+
+ // Make utf16 null terminated.
+ utf16.push_back(0);
+ utf16.pop_back();
+
+ return llvm::error_code::success();
+}
+
+llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+ llvm::SmallVectorImpl<char> &utf8) {
+ // Get length.
+ int len = ::WideCharToMultiByte(CP_UTF8, 0,
+ utf16, utf16_len,
+ utf8.begin(), 0,
+ NULL, NULL);
+
+ if (len == 0)
+ return llvm::windows_error(::GetLastError());
+
+ utf8.reserve(len);
+ utf8.set_size(len);
+
+ // Now do the actual conversion.
+ len = ::WideCharToMultiByte(CP_UTF8, 0,
+ utf16, utf16_len,
+ utf8.data(), utf8.size(),
+ NULL, NULL);
+
+ if (len == 0)
+ return llvm::windows_error(::GetLastError());
+
+ // Make utf8 null terminated.
+ utf8.push_back(0);
+ utf8.pop_back();
+
+ return llvm::error_code::success();
+}
+} // end namespace windows
} // end namespace sys
} // end namespace llvm
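With the conversion routines now exported from sys::windows (declared in Windows.h further down), the other Win32 support files can share one implementation instead of carrying private copies. A minimal usage sketch, assuming PathUTF8 is a UTF-8 StringRef destined for a wide-character API:

    llvm::SmallVector<wchar_t, 128> PathUTF16;
    if (llvm::error_code ec = llvm::sys::windows::UTF8ToUTF16(PathUTF8, PathUTF16))
      return ec;  // conversion failed; propagate the error
    // The helper leaves the buffer null-terminated (push_back(0)/pop_back()),
    // so data() can be handed straight to a *W API.
    HANDLE H = ::CreateFileW(PathUTF16.data(), GENERIC_READ, FILE_SHARE_READ,
                             NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);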
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 359b99f..f9a3db9 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -11,18 +11,25 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Allocator.h"
+
#include "Windows.h"
#include <direct.h>
#include <io.h>
#include <malloc.h>
#include <psapi.h>
+#include <shellapi.h>
#ifdef __MINGW32__
#if (HAVE_LIBPSAPI != 1)
#error "libpsapi.a should be present"
#endif
+ #if (HAVE_LIBSHELL32 != 1)
+ #error "libshell32.a should be present"
+ #endif
#else
#pragma comment(lib, "psapi.lib")
+ #pragma comment(lib, "shell32.lib")
#endif
//===----------------------------------------------------------------------===//
@@ -83,6 +90,8 @@ static unsigned getPageSize() {
// that LLVM ought to run as 64-bits on a 64-bit system, anyway.
SYSTEM_INFO info;
GetSystemInfo(&info);
+ // FIXME: FileOffset in MapViewOfFile() should be aligned not to dwPageSize,
+ // but to dwAllocationGranularity.
return static_cast<unsigned>(info.dwPageSize);
}
@@ -119,28 +128,89 @@ void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
sys_time = getTimeValueFromFILETIME(KernelTime);
}
-int Process::GetCurrentUserId()
-{
- return 65536;
-}
-
-int Process::GetCurrentGroupId()
-{
- return 65536;
-}
-
// Some LLVM programs such as bugpoint produce core files as a normal part of
-// their operation. To prevent the disk from filling up, this configuration item
-// does what's necessary to prevent their generation.
+// their operation. To prevent the disk from filling up, this configuration
+// item does what's necessary to prevent their generation.
void Process::PreventCoreFiles() {
- // Windows doesn't do core files, but it does do modal pop-up message
- // boxes. As this method is used by bugpoint, preventing these pop-ups
- // is the moral equivalent of suppressing core files.
+ // Windows does have the concept of core files, called minidumps. However,
+ // disabling minidumps for a particular application extends past the lifetime
+ // of that application, which is the incorrect behavior for this API.
+ // Additionally, the APIs require elevated privileges to disable and re-
+ // enable minidumps, which makes this untenable. For more information, see
+ // WerAddExcludedApplication and WerRemoveExcludedApplication (Vista and
+ // later).
+ //
+ // Windows also has modal pop-up message boxes. As this method is used by
+ // bugpoint, preventing these pop-ups is additionally important.
SetErrorMode(SEM_FAILCRITICALERRORS |
SEM_NOGPFAULTERRORBOX |
SEM_NOOPENFILEERRORBOX);
}
+/// Returns the environment variable \arg Name's value as a string encoded in
+/// UTF-8. \arg Name is assumed to be in UTF-8 encoding.
+Optional<std::string> Process::GetEnv(StringRef Name) {
+ // Convert the argument to UTF-16 to pass it to _wgetenv().
+ SmallVector<wchar_t, 128> NameUTF16;
+ if (error_code ec = windows::UTF8ToUTF16(Name, NameUTF16))
+ return None;
+
+ // An environment variable may be stored in a non-UTF-8 encoding, and there's
+ // no way to know what that encoding is. The only reliable way to look up a
+ // multibyte environment variable is to use GetEnvironmentVariableW().
+ SmallVector<wchar_t, MAX_PATH> Buf;
+ size_t Size = MAX_PATH;
+ do {
+ Buf.reserve(Size);
+ Size =
+ GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity());
+ if (Size == 0)
+ return None;
+
+ // Try again with larger buffer.
+ } while (Size > Buf.capacity());
+ Buf.set_size(Size);
+
+ // Convert the result from UTF-16 to UTF-8.
+ SmallVector<char, MAX_PATH> Res;
+ if (error_code ec = windows::UTF16ToUTF8(Buf.data(), Size, Res))
+ return None;
+ return std::string(Res.data());
+}
+
+error_code
+Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
+ ArrayRef<const char *>,
+ SpecificBumpPtrAllocator<char> &ArgAllocator) {
+ int NewArgCount;
+ error_code ec;
+
+ wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(),
+ &NewArgCount);
+ if (!UnicodeCommandLine)
+ return windows_error(::GetLastError());
+
+ Args.reserve(NewArgCount);
+
+ for (int i = 0; i < NewArgCount; ++i) {
+ SmallVector<char, MAX_PATH> NewArgString;
+ ec = windows::UTF16ToUTF8(UnicodeCommandLine[i],
+ wcslen(UnicodeCommandLine[i]),
+ NewArgString);
+ if (ec)
+ break;
+
+ char *Buffer = ArgAllocator.Allocate(NewArgString.size() + 1);
+ ::memcpy(Buffer, NewArgString.data(), NewArgString.size() + 1);
+ Args.push_back(Buffer);
+ }
+ LocalFree(UnicodeCommandLine);
+ if (ec)
+ return ec;
+
+ return error_code::success();
+}
+
bool Process::StandardInIsUserInput() {
return FileDescriptorIsDisplayed(0);
}
@@ -187,6 +257,11 @@ bool Process::StandardErrHasColors() {
return FileDescriptorHasColors(2);
}
+static bool UseANSI = false;
+void Process::UseANSIEscapeCodes(bool enable) {
+ UseANSI = enable;
+}
+
namespace {
class DefaultColors
{
@@ -208,10 +283,12 @@ DefaultColors defaultColors;
}
bool Process::ColorNeedsFlush() {
- return true;
+ return !UseANSI;
}
const char *Process::OutputBold(bool bg) {
+ if (UseANSI) return "\033[1m";
+
WORD colors = DefaultColors::GetCurrentColor();
if (bg)
colors |= BACKGROUND_INTENSITY;
@@ -222,6 +299,8 @@ const char *Process::OutputBold(bool bg) {
}
const char *Process::OutputColor(char code, bool bold, bool bg) {
+ if (UseANSI) return colorcodes[bg?1:0][bold?1:0][code&7];
+
WORD colors;
if (bg) {
colors = ((code&1) ? BACKGROUND_RED : 0) |
@@ -247,6 +326,8 @@ static WORD GetConsoleTextAttribute(HANDLE hConsoleOutput) {
}
const char *Process::OutputReverse() {
+ if (UseANSI) return "\033[7m";
+
const WORD attributes
= GetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE));
@@ -273,6 +354,7 @@ const char *Process::OutputReverse() {
}
const char *Process::ResetColor() {
+ if (UseANSI) return "\033[0m";
SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), defaultColors());
return 0;
}
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 8165ef4..dc09738 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -24,16 +24,11 @@
//=== and must not be UNIX code
//===----------------------------------------------------------------------===//
-namespace {
- struct Win32ProcessInfo {
- HANDLE hProcess;
- DWORD dwProcessId;
- };
-}
-
namespace llvm {
using namespace sys;
+ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
+
// This function just uses the PATH environment variable to find the program.
std::string sys::FindProgramByName(const std::string &progName) {
// Check some degenerate cases
@@ -47,42 +42,39 @@ std::string sys::FindProgramByName(const std::string &progName) {
// At this point, the file name is valid and does not contain slashes.
// Let Windows search for it.
- std::string buffer;
- buffer.resize(MAX_PATH);
- char *dummy = NULL;
- DWORD len = SearchPath(NULL, progName.c_str(), ".exe", MAX_PATH,
- &buffer[0], &dummy);
-
- // See if it wasn't found.
- if (len == 0)
+ SmallVector<wchar_t, MAX_PATH> progNameUnicode;
+ if (windows::UTF8ToUTF16(progName, progNameUnicode))
return "";
- // See if we got the entire path.
- if (len < MAX_PATH)
- return buffer;
+ SmallVector<wchar_t, MAX_PATH> buffer;
+ DWORD len = MAX_PATH;
+ do {
+ buffer.reserve(len);
+ len = ::SearchPathW(NULL, progNameUnicode.data(), L".exe",
+ buffer.capacity(), buffer.data(), NULL);
- // Buffer was too small; grow and retry.
- while (true) {
- buffer.resize(len+1);
- DWORD len2 = SearchPath(NULL, progName.c_str(), ".exe", len+1, &buffer[0], &dummy);
-
- // It is unlikely the search failed, but it's always possible some file
- // was added or removed since the last search, so be paranoid...
- if (len2 == 0)
+ // See if it wasn't found.
+ if (len == 0)
return "";
- else if (len2 <= len)
- return buffer;
- len = len2;
- }
+ // Buffer was too small; grow and retry.
+ } while (len > buffer.capacity());
+
+ buffer.set_size(len);
+ SmallVector<char, MAX_PATH> result;
+ if (windows::UTF16ToUTF8(buffer.begin(), buffer.size(), result))
+ return "";
+
+ return std::string(result.data(), result.size());
}
static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
HANDLE h;
if (path == 0) {
- DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
- GetCurrentProcess(), &h,
- 0, TRUE, DUPLICATE_SAME_ACCESS);
+ if (!DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
+ GetCurrentProcess(), &h,
+ 0, TRUE, DUPLICATE_SAME_ACCESS))
+ return INVALID_HANDLE_VALUE;
return h;
}
@@ -97,9 +89,13 @@ static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
sa.lpSecurityDescriptor = 0;
sa.bInheritHandle = TRUE;
- h = CreateFile(fname.c_str(), fd ? GENERIC_WRITE : GENERIC_READ,
- FILE_SHARE_READ, &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
- FILE_ATTRIBUTE_NORMAL, NULL);
+ SmallVector<wchar_t, 128> fnameUnicode;
+ if (windows::UTF8ToUTF16(fname, fnameUnicode))
+ return INVALID_HANDLE_VALUE;
+
+ h = CreateFileW(fnameUnicode.data(), fd ? GENERIC_WRITE : GENERIC_READ,
+ FILE_SHARE_READ, &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
+ FILE_ATTRIBUTE_NORMAL, NULL);
if (h == INVALID_HANDLE_VALUE) {
MakeErrMsg(ErrMsg, std::string(fname) + ": Can't open file for " +
(fd ? "input: " : "output: "));
@@ -171,13 +167,9 @@ static unsigned int ArgLenWithQuotes(const char *Str) {
}
-static bool Execute(void **Data,
- StringRef Program,
- const char** args,
- const char** envp,
- const StringRef** redirects,
- unsigned memoryLimit,
- std::string* ErrMsg) {
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
+ const char **envp, const StringRef **redirects,
+ unsigned memoryLimit, std::string *ErrMsg) {
if (!sys::fs::can_execute(Program)) {
if (ErrMsg)
*ErrMsg = "program not executable";
@@ -227,34 +219,28 @@ static bool Execute(void **Data,
*p = 0;
// The pointer to the environment block for the new process.
- OwningArrayPtr<char> envblock;
+ std::vector<wchar_t> EnvBlock;
if (envp) {
// An environment block consists of a null-terminated block of
// null-terminated strings. Convert the array of environment variables to
// an environment block by concatenating them.
+ for (unsigned i = 0; envp[i]; ++i) {
+ SmallVector<wchar_t, MAX_PATH> EnvString;
+ if (error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) {
+ SetLastError(ec.value());
+ MakeErrMsg(ErrMsg, "Unable to convert environment variable to UTF-16");
+ return false;
+ }
- // First, determine the length of the environment block.
- len = 0;
- for (unsigned i = 0; envp[i]; i++)
- len += strlen(envp[i]) + 1;
-
- // Now build the environment block.
- envblock.reset(new char[len+1]);
- p = envblock.get();
-
- for (unsigned i = 0; envp[i]; i++) {
- const char *ev = envp[i];
- size_t len = strlen(ev) + 1;
- memcpy(p, ev, len);
- p += len;
+ EnvBlock.insert(EnvBlock.end(), EnvString.begin(), EnvString.end());
+ EnvBlock.push_back(0);
}
-
- *p = 0;
+ EnvBlock.push_back(0);
}
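For reference, the block handed to CreateProcessW is a run of null-terminated UTF-16 strings followed by one extra terminating null, which is exactly what the loop above builds. A small self-contained illustration with hypothetical variables:

    std::vector<wchar_t> EnvBlock;
    const wchar_t *Vars[] = { L"PATH=C:\\bin", L"LANG=C" };
    for (unsigned i = 0; i < 2; ++i) {
      EnvBlock.insert(EnvBlock.end(), Vars[i], Vars[i] + wcslen(Vars[i]));
      EnvBlock.push_back(0);   // terminate this variable
    }
    EnvBlock.push_back(0);     // terminate the whole block
    // Layout: P A T H = C : \ b i n \0 L A N G = C \0 \0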
// Create a child process.
- STARTUPINFO si;
+ STARTUPINFOW si;
memset(&si, 0, sizeof(si));
si.cb = sizeof(si);
si.hStdInput = INVALID_HANDLE_VALUE;
@@ -278,9 +264,14 @@ static bool Execute(void **Data,
if (redirects[1] && redirects[2] && *(redirects[1]) == *(redirects[2])) {
// If stdout and stderr should go to the same place, redirect stderr
// to the handle already open for stdout.
- DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
- GetCurrentProcess(), &si.hStdError,
- 0, TRUE, DUPLICATE_SAME_ACCESS);
+ if (!DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
+ GetCurrentProcess(), &si.hStdError,
+ 0, TRUE, DUPLICATE_SAME_ACCESS)) {
+ CloseHandle(si.hStdInput);
+ CloseHandle(si.hStdOutput);
+ MakeErrMsg(ErrMsg, "can't dup stderr to stdout");
+ return false;
+ }
} else {
// Just redirect stderr
si.hStdError = RedirectIO(redirects[2], 2, ErrMsg);
@@ -298,9 +289,27 @@ static bool Execute(void **Data,
fflush(stdout);
fflush(stderr);
- std::string ProgramStr = Program;
- BOOL rc = CreateProcess(ProgramStr.c_str(), command.get(), NULL, NULL, TRUE,
- 0, envblock.get(), NULL, &si, &pi);
+
+ SmallVector<wchar_t, MAX_PATH> ProgramUtf16;
+ if (error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) {
+ SetLastError(ec.value());
+ MakeErrMsg(ErrMsg,
+ std::string("Unable to convert application name to UTF-16"));
+ return false;
+ }
+
+ SmallVector<wchar_t, MAX_PATH> CommandUtf16;
+ if (error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) {
+ SetLastError(ec.value());
+ MakeErrMsg(ErrMsg,
+ std::string("Unable to convert command-line to UTF-16"));
+ return false;
+ }
+
+ BOOL rc = CreateProcessW(ProgramUtf16.data(), CommandUtf16.data(), 0, 0,
+ TRUE, CREATE_UNICODE_ENVIRONMENT,
+ EnvBlock.empty() ? 0 : EnvBlock.data(), 0, &si,
+ &pi);
DWORD err = GetLastError();
// Regardless of whether the process got created or not, we are done with
@@ -313,15 +322,12 @@ static bool Execute(void **Data,
if (!rc) {
SetLastError(err);
MakeErrMsg(ErrMsg, std::string("Couldn't execute program '") +
- ProgramStr + "'");
+ Program.str() + "'");
return false;
}
- if (Data) {
- Win32ProcessInfo* wpi = new Win32ProcessInfo;
- wpi->hProcess = pi.hProcess;
- wpi->dwProcessId = pi.dwProcessId;
- *Data = wpi;
- }
+
+ PI.Pid = pi.dwProcessId;
+ PI.ProcessHandle = pi.hProcess;
// Make sure these get closed no matter what.
ScopedCommonHandle hThread(pi.hThread);
@@ -329,7 +335,7 @@ static bool Execute(void **Data,
// Assign the process to a job if a memory limit is defined.
ScopedJobHandle hJob;
if (memoryLimit != 0) {
- hJob = CreateJobObject(0, 0);
+ hJob = CreateJobObjectW(0, 0);
bool success = false;
if (hJob) {
JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli;
@@ -351,68 +357,72 @@ static bool Execute(void **Data,
}
}
- // Don't leak the handle if the caller doesn't want it.
- if (!Data)
- CloseHandle(pi.hProcess);
-
return true;
}
-static int WaitAux(Win32ProcessInfo *wpi, unsigned secondsToWait,
- std::string *ErrMsg) {
- // Wait for the process to terminate.
- HANDLE hProcess = wpi->hProcess;
- DWORD millisecondsToWait = INFINITE;
- if (secondsToWait > 0)
- millisecondsToWait = secondsToWait * 1000;
-
- if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
- if (!TerminateProcess(hProcess, 1)) {
- MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
- // -2 indicates a crash or timeout as opposed to failure to execute.
- return -2;
+namespace llvm {
+ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+ bool WaitUntilChildTerminates, std::string *ErrMsg) {
+ assert(PI.Pid && "invalid pid to wait on, process not started?");
+ assert(PI.ProcessHandle &&
+ "invalid process handle to wait on, process not started?");
+ DWORD milliSecondsToWait = 0;
+ if (WaitUntilChildTerminates)
+ milliSecondsToWait = INFINITE;
+ else if (SecondsToWait > 0)
+ milliSecondsToWait = SecondsToWait * 1000;
+
+ ProcessInfo WaitResult = PI;
+ DWORD WaitStatus = WaitForSingleObject(PI.ProcessHandle, milliSecondsToWait);
+ if (WaitStatus == WAIT_TIMEOUT) {
+ if (SecondsToWait) {
+ if (!TerminateProcess(PI.ProcessHandle, 1)) {
+ if (ErrMsg)
+ MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
+
+ // -2 indicates a crash or timeout as opposed to failure to execute.
+ WaitResult.ReturnCode = -2;
+ CloseHandle(PI.ProcessHandle);
+ return WaitResult;
+ }
+ WaitForSingleObject(PI.ProcessHandle, INFINITE);
+ CloseHandle(PI.ProcessHandle);
+ } else {
+ // Non-blocking wait.
+ return ProcessInfo();
}
- WaitForSingleObject(hProcess, INFINITE);
}
// Get its exit status.
DWORD status;
- BOOL rc = GetExitCodeProcess(hProcess, &status);
+ BOOL rc = GetExitCodeProcess(PI.ProcessHandle, &status);
DWORD err = GetLastError();
+ CloseHandle(PI.ProcessHandle);
if (!rc) {
SetLastError(err);
- MakeErrMsg(ErrMsg, "Failed getting status for program.");
+ if (ErrMsg)
+ MakeErrMsg(ErrMsg, "Failed getting status for program.");
+
// -2 indicates a crash or timeout as opposed to failure to execute.
- return -2;
+ WaitResult.ReturnCode = -2;
+ return WaitResult;
}
if (!status)
- return 0;
+ return WaitResult;
// Pass 10(Warning) and 11(Error) to the callee as negative value.
if ((status & 0xBFFF0000U) == 0x80000000U)
- return (int)status;
-
- if (status & 0xFF)
- return status & 0x7FFFFFFF;
-
- return 1;
-}
-
-static int Wait(void *&Data, StringRef Program, unsigned secondsToWait,
- std::string *ErrMsg) {
- Win32ProcessInfo *wpi = reinterpret_cast<Win32ProcessInfo *>(Data);
- int Ret = WaitAux(wpi, secondsToWait, ErrMsg);
-
- CloseHandle(wpi->hProcess);
- delete wpi;
- Data = 0;
+ WaitResult.ReturnCode = static_cast<int>(status);
+ else if (status & 0xFF)
+ WaitResult.ReturnCode = status & 0x7FFFFFFF;
+ else
+ WaitResult.ReturnCode = 1;
- return Ret;
+ return WaitResult;
}
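A note on the new convention: callers get back a ProcessInfo whose ReturnCode is 0 on success, the (masked) exit status on ordinary failure, and -2 when the child crashed or had to be killed after a timeout. A hedged usage sketch, assuming PI was filled in by a successful Execute call:

    std::string Err;
    ProcessInfo Result = sys::Wait(PI, /*SecondsToWait=*/0,
                                   /*WaitUntilChildTerminates=*/true, &Err);
    if (Result.ReturnCode == -2)
      ;  // the child crashed, or its status could not be retrieved
    else if (Result.ReturnCode != 0)
      ;  // the child exited with a failure code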
-namespace llvm {
error_code sys::ChangeStdinToBinary(){
int result = _setmode( _fileno(stdin), _O_BINARY );
if (result == -1)
@@ -449,5 +459,4 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
}
return true;
}
-
}
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 9593923..c431844 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -48,8 +48,7 @@ static bool loadSRW() {
if (!sChecked) {
sChecked = true;
- HMODULE hLib = ::LoadLibrary(TEXT("Kernel32"));
- if (hLib) {
+ if (HMODULE hLib = ::GetModuleHandleW(L"Kernel32.dll")) {
fpInitializeSRWLock =
(VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
"InitializeSRWLock");
@@ -65,7 +64,6 @@ static bool loadSRW() {
fpReleaseSRWLockShared =
(VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
"ReleaseSRWLockShared");
- ::FreeLibrary(hLib);
if (fpInitializeSRWLock != NULL) {
sHasSRW = true;
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index bce83b9..4b40d51 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -135,7 +135,7 @@ typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64);
static fpSymFunctionTableAccess64 SymFunctionTableAccess64;
static bool load64BitDebugHelp(void) {
- HMODULE hLib = ::LoadLibrary("Dbghelp.dll");
+ HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
if (hLib) {
StackWalk64 = (fpStackWalk64)
::GetProcAddress(hLib, "StackWalk64");
diff --git a/lib/Support/Windows/TimeValue.inc b/lib/Support/Windows/TimeValue.inc
index 96f5579..98b07d6 100644
--- a/lib/Support/Windows/TimeValue.inc
+++ b/lib/Support/Windows/TimeValue.inc
@@ -12,10 +12,11 @@
//===----------------------------------------------------------------------===//
#include "Windows.h"
+#include <cctype>
#include <time.h>
-namespace llvm {
-using namespace sys;
+using namespace llvm;
+using namespace llvm::sys;
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only Win32 specific code.
@@ -49,13 +50,10 @@ std::string TimeValue::str() const {
char Buffer[25];
// FIXME: the windows version of strftime doesn't support %e
strftime(Buffer, 25, "%b %d %H:%M %Y", LT);
- assert((Buffer[3] == ' ' && isdigit(Buffer[5]) && Buffer[6] == ' ') ||
+ assert((Buffer[3] == ' ' && isdigit(Buffer[5]) && Buffer[6] == ' ') &&
"Unexpected format in strftime()!");
// Emulate %e on %d to mute '0'.
if (Buffer[4] == '0')
Buffer[4] = ' ';
return std::string(Buffer);
}
-
-
-}
diff --git a/lib/Support/Windows/Windows.h b/lib/Support/Windows/Windows.h
index 4cdac78..1f3417d 100644
--- a/lib/Support/Windows/Windows.h
+++ b/lib/Support/Windows/Windows.h
@@ -24,23 +24,31 @@
#define _WIN32_IE 0x0600 // MinGW at it again.
#define WIN32_LEAN_AND_MEAN
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h" // Get build system configuration settings
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/system_error.h"
#include <windows.h>
#include <wincrypt.h>
-#include <shlobj.h>
#include <cassert>
#include <string>
+#include <vector>
inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
if (!ErrMsg)
return true;
char *buffer = NULL;
- FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
- NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
- *ErrMsg = prefix + buffer;
+ DWORD R = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+ FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
+ if (R)
+ *ErrMsg = prefix + buffer;
+ else
+ *ErrMsg = prefix + "Unknown error";
+
LocalFree(buffer);
- return true;
+ return R != 0;
}
template <typename HandleTraits>
@@ -148,4 +156,13 @@ c_str(SmallVectorImpl<T> &str) {
str.pop_back();
return str.data();
}
+
+namespace sys {
+namespace windows {
+error_code UTF8ToUTF16(StringRef utf8,
+ SmallVectorImpl<wchar_t> &utf16);
+error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+ SmallVectorImpl<char> &utf8);
+} // end namespace windows
+} // end namespace sys
} // end namespace llvm.
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 213f5e1..9495cd4 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -96,6 +96,15 @@ static EncodingInfo getUnicodeEncoding(StringRef Input) {
namespace llvm {
namespace yaml {
+/// Pin the vtables to this file.
+void Node::anchor() {}
+void NullNode::anchor() {}
+void ScalarNode::anchor() {}
+void KeyValueNode::anchor() {}
+void MappingNode::anchor() {}
+void SequenceNode::anchor() {}
+void AliasNode::anchor() {}
+
/// Token - A single YAML token.
struct Token : ilist_node<Token> {
enum TokenKind {
@@ -1070,14 +1079,22 @@ bool Scanner::scanDirective() {
Current = skip_while(&Scanner::skip_ns_char, Current);
StringRef Name(NameStart, Current - NameStart);
Current = skip_while(&Scanner::skip_s_white, Current);
-
+
+ Token T;
if (Name == "YAML") {
Current = skip_while(&Scanner::skip_ns_char, Current);
- Token T;
T.Kind = Token::TK_VersionDirective;
T.Range = StringRef(Start, Current - Start);
TokenQueue.push_back(T);
return true;
+ } else if(Name == "TAG") {
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ Current = skip_while(&Scanner::skip_s_white, Current);
+ Current = skip_while(&Scanner::skip_ns_char, Current);
+ T.Kind = Token::TK_TagDirective;
+ T.Range = StringRef(Start, Current - Start);
+ TokenQueue.push_back(T);
+ return true;
}
return false;
}
@@ -1564,10 +1581,6 @@ void Stream::printError(Node *N, const Twine &Msg) {
, Ranges);
}
-void Stream::handleYAMLDirective(const Token &t) {
- // TODO: Ensure version is 1.x.
-}
-
document_iterator Stream::begin() {
if (CurrentDoc)
report_fatal_error("Can only iterate over the stream once");
@@ -1588,14 +1601,59 @@ void Stream::skip() {
i->skip();
}
-Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A)
+Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T)
: Doc(D)
, TypeID(Type)
- , Anchor(A) {
+ , Anchor(A)
+ , Tag(T) {
SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
SourceRange = SMRange(Start, Start);
}
+std::string Node::getVerbatimTag() const {
+ StringRef Raw = getRawTag();
+ if (!Raw.empty() && Raw != "!") {
+ std::string Ret;
+ if (Raw.find_last_of('!') == 0) {
+ Ret = Doc->getTagMap().find("!")->second;
+ Ret += Raw.substr(1);
+ return llvm_move(Ret);
+ } else if (Raw.startswith("!!")) {
+ Ret = Doc->getTagMap().find("!!")->second;
+ Ret += Raw.substr(2);
+ return llvm_move(Ret);
+ } else {
+ StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
+ std::map<StringRef, StringRef>::const_iterator It =
+ Doc->getTagMap().find(TagHandle);
+ if (It != Doc->getTagMap().end())
+ Ret = It->second;
+ else {
+ Token T;
+ T.Kind = Token::TK_Tag;
+ T.Range = TagHandle;
+ setError(Twine("Unknown tag handle ") + TagHandle, T);
+ }
+ Ret += Raw.substr(Raw.find_last_of('!') + 1);
+ return llvm_move(Ret);
+ }
+ }
+
+ switch (getType()) {
+ case NK_Null:
+ return "tag:yaml.org,2002:null";
+ case NK_Scalar:
+ // TODO: Tag resolution.
+ return "tag:yaml.org,2002:str";
+ case NK_Mapping:
+ return "tag:yaml.org,2002:map";
+ case NK_Sequence:
+ return "tag:yaml.org,2002:seq";
+ }
+
+ return "";
+}
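For the built-in handles this means, for example, that a node written as !!str foo resolves via the default "!!" mapping to the verbatim tag tag:yaml.org,2002:str, while a completely untagged node falls through to the type-based defaults in the switch above (null, str, map or seq).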
+
Token &Node::peekNext() {
return Doc->peekNext();
}
@@ -1999,6 +2057,10 @@ void SequenceNode::increment() {
}
Document::Document(Stream &S) : stream(S), Root(0) {
+ // The tag map starts with two default mappings.
+ TagMap["!"] = "!";
+ TagMap["!!"] = "tag:yaml.org,2002:";
+
if (parseDirectives())
expectToken(Token::TK_DocumentStart);
Token &T = peekNext();
@@ -2042,6 +2104,7 @@ Node *Document::parseBlockNode() {
Token T = peekNext();
// Handle properties.
Token AnchorInfo;
+ Token TagInfo;
parse_property:
switch (T.Kind) {
case Token::TK_Alias:
@@ -2056,7 +2119,11 @@ parse_property:
T = peekNext();
goto parse_property;
case Token::TK_Tag:
- getNext(); // Skip TK_Tag.
+ if (TagInfo.Kind == Token::TK_Tag) {
+ setError("Already encountered a tag for this node!", T);
+ return 0;
+ }
+ TagInfo = getNext(); // Consume TK_Tag.
T = peekNext();
goto parse_property;
default:
@@ -2070,42 +2137,49 @@ parse_property:
// Don't eat the TK_BlockEntry, SequenceNode needs it.
return new (NodeAllocator) SequenceNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, SequenceNode::ST_Indentless);
case Token::TK_BlockSequenceStart:
getNext();
return new (NodeAllocator)
SequenceNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, SequenceNode::ST_Block);
case Token::TK_BlockMappingStart:
getNext();
return new (NodeAllocator)
MappingNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, MappingNode::MT_Block);
case Token::TK_FlowSequenceStart:
getNext();
return new (NodeAllocator)
SequenceNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, SequenceNode::ST_Flow);
case Token::TK_FlowMappingStart:
getNext();
return new (NodeAllocator)
MappingNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, MappingNode::MT_Flow);
case Token::TK_Scalar:
getNext();
return new (NodeAllocator)
ScalarNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, T.Range);
case Token::TK_Key:
// Don't eat the TK_Key, KeyValueNode expects it.
return new (NodeAllocator)
MappingNode( stream.CurrentDoc
, AnchorInfo.Range.substr(1)
+ , TagInfo.Range
, MappingNode::MT_Inline);
case Token::TK_DocumentStart:
case Token::TK_DocumentEnd:
@@ -2126,10 +2200,10 @@ bool Document::parseDirectives() {
while (true) {
Token T = peekNext();
if (T.Kind == Token::TK_TagDirective) {
- handleTagDirective(getNext());
+ parseTAGDirective();
isDirective = true;
} else if (T.Kind == Token::TK_VersionDirective) {
- stream.handleYAMLDirective(getNext());
+ parseYAMLDirective();
isDirective = true;
} else
break;
@@ -2137,6 +2211,21 @@ bool Document::parseDirectives() {
return isDirective;
}
+void Document::parseYAMLDirective() {
+ getNext(); // Eat %YAML <version>
+}
+
+void Document::parseTAGDirective() {
+ Token Tag = getNext(); // %TAG <handle> <prefix>
+ StringRef T = Tag.Range;
+ // Strip %TAG
+ T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
+ std::size_t HandleEnd = T.find_first_of(" \t");
+ StringRef TagHandle = T.substr(0, HandleEnd);
+ StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
+ TagMap[TagHandle] = TagPrefix;
+}
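Together with getVerbatimTag() above, a %TAG directive now lets shorthand handles expand to full tag URIs. A minimal sketch of how this could be exercised (a hypothetical test snippet using the llvm/Support/YAMLParser.h API, not part of the patch):

    SourceMgr SM;
    yaml::Stream YS("%TAG !e! tag:example.com,2014:\n"
                    "---\n"
                    "!e!widget {size: 3}\n", SM);
    for (yaml::document_iterator DI = YS.begin(), DE = YS.end(); DI != DE; ++DI)
      if (yaml::Node *Root = DI->getRoot())
        // Expected to print "tag:example.com,2014:widget": the "!e!" handle is
        // looked up in the document's tag map and prepended to the suffix.
        outs() << Root->getVerbatimTag() << "\n";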
+
bool Document::expectToken(int TK) {
Token T = getNext();
if (T.Kind != TK) {
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index b0cd415..42bff96 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
#include <cstring>
+#include <cctype>
using namespace llvm;
using namespace yaml;
@@ -40,32 +41,43 @@ void IO::setContext(void *Context) {
// Input
//===----------------------------------------------------------------------===//
-Input::Input(StringRef InputContent, void *Ctxt)
- : IO(Ctxt),
+Input::Input(StringRef InputContent,
+ void *Ctxt,
+ SourceMgr::DiagHandlerTy DiagHandler,
+ void *DiagHandlerCtxt)
+ : IO(Ctxt),
Strm(new Stream(InputContent, SrcMgr)),
CurrentNode(NULL) {
+ if (DiagHandler)
+ SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
DocIterator = Strm->begin();
}
Input::~Input() {
-
}
error_code Input::error() {
return EC;
}
-void Input::setDiagHandler(SourceMgr::DiagHandlerTy Handler, void *Ctxt) {
- SrcMgr.setDiagHandler(Handler, Ctxt);
-}
+// Pin the vtables to this file.
+void Input::HNode::anchor() {}
+void Input::EmptyHNode::anchor() {}
+void Input::ScalarHNode::anchor() {}
-bool Input::outputting() {
+bool Input::outputting() const {
return false;
}
bool Input::setCurrentDocument() {
if (DocIterator != Strm->end()) {
Node *N = DocIterator->getRoot();
+ if (!N) {
+ assert(Strm->failed() && "Root is NULL iff parsing failed");
+ EC = make_error_code(errc::invalid_argument);
+ return false;
+ }
+
if (isa<NullNode>(N)) {
// Empty files are allowed and ignored
++DocIterator;
@@ -82,10 +94,21 @@ void Input::nextDocument() {
++DocIterator;
}
+bool Input::mapTag(StringRef Tag, bool Default) {
+ std::string foundTag = CurrentNode->_node->getVerbatimTag();
+ if (foundTag.empty()) {
+ // If no tag found and 'Tag' is the default, say it was found.
+ return Default;
+ }
+ // Return true iff found tag matches supplied tag.
+ return Tag.equals(foundTag);
+}
+
void Input::beginMapping() {
if (EC)
return;
- MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+ // CurrentNode can be null if the document is empty.
+ MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
if (MN) {
MN->ValidKeys.clear();
}
@@ -96,6 +119,15 @@ bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
UseDefault = false;
if (EC)
return false;
+
+ // CurrentNode is null for empty documents, which is an error if required
+ // nodes are present.
+ if (!CurrentNode) {
+ if (Required)
+ EC = make_error_code(errc::invalid_argument);
+ return false;
+ }
+
MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
if (!MN) {
setError(CurrentNode, "not a mapping");
@@ -122,7 +154,8 @@ void Input::postflightKey(void *saveInfo) {
void Input::endMapping() {
if (EC)
return;
- MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+ // CurrentNode can be null if the document is empty.
+ MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
if (!MN)
return;
for (MapHNode::NameToNode::iterator i = MN->Mapping.begin(),
@@ -263,6 +296,7 @@ void Input::scalarString(StringRef &S) {
}
void Input::setError(HNode *hnode, const Twine &message) {
+ assert(hnode && "HNode must not be NULL");
this->setError(hnode->_node, message);
}
@@ -334,6 +368,10 @@ void Input::setError(const Twine &Message) {
this->setError(CurrentNode, Message);
}
+bool Input::canElideEmptySequence() {
+ return false;
+}
+
Input::MapHNode::~MapHNode() {
for (MapHNode::NameToNode::iterator i = Mapping.begin(), End = Mapping.end();
i != End; ++i) {
@@ -368,7 +406,7 @@ Output::Output(raw_ostream &yout, void *context)
Output::~Output() {
}
-bool Output::outputting() {
+bool Output::outputting() const {
return true;
}
@@ -377,6 +415,14 @@ void Output::beginMapping() {
NeedsNewLine = true;
}
+bool Output::mapTag(StringRef Tag, bool Use) {
+ if (Use) {
+ this->output(" ");
+ this->output(Tag);
+ }
+ return Use;
+}
+
void Output::endMapping() {
StateStack.pop_back();
}
@@ -505,9 +551,20 @@ void Output::endBitSetScalar() {
}
void Output::scalarString(StringRef &S) {
+ const char ScalarSafeChars[] = "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/^., \t";
+
this->newLineCheck();
- if (S.find('\n') == StringRef::npos) {
- // No embedded new-line chars, just print string.
+ if (S.empty()) {
+ // Print '' for the empty string because leaving the field empty is not
+ // allowed.
+ this->outputUpToEndOfLine("''");
+ return;
+ }
+ if (S.find_first_not_of(ScalarSafeChars) == StringRef::npos &&
+ !isspace(S.front()) && !isspace(S.back())) {
+ // If the string consists only of safe characters, print it out without
+ // quotes.
this->outputUpToEndOfLine(S);
return;
}
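In practice this means a value such as release/3.4 or x86_64-pc-win32 is still emitted bare, while an empty string becomes '', and anything containing characters outside the safe set (a colon, say) or with leading or trailing whitespace keeps taking the existing quoting path below.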
@@ -532,6 +589,19 @@ void Output::scalarString(StringRef &S) {
void Output::setError(const Twine &message) {
}
+bool Output::canElideEmptySequence() {
+ // Normally, an optional key/value whose value is an empty sequence can be
+ // left out entirely. But that produces invalid YAML if the key/value is the
+ // only entry in a map and that map is itself used inside a sequence. This
+ // detects whether this sequence is the first key/value in a map that is
+ // itself embedded in a sequence.
+ if (StateStack.size() < 2)
+ return true;
+ if (StateStack.back() != inMapFirstKey)
+ return true;
+ return (StateStack[StateStack.size()-2] != inSeq);
+}
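As an illustration: for a map with keys name and ports where ports is an empty sequence, the ports entry can simply be dropped at document level, but when that same map is an element of a sequence and ports is its only key, dropping it would leave a bare "-" with nothing after it, so the empty sequence must still be printed.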
+
void Output::output(StringRef s) {
Column += s.size();
Out << s;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 92fa8b5..cb96489 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -447,7 +447,8 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags);
if (EC) {
- ErrorInfo = "Error opening output file '" + std::string(Filename) + "'";
+ ErrorInfo = "Error opening output file '" + std::string(Filename) + "': " +
+ EC.message();
ShouldClose = false;
return;
}
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 9ad2053..431f4aa 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -557,9 +557,23 @@ Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
return const_cast<BitsInit *>(this);
}
+namespace {
+ template<typename T>
+ class Pool : public T {
+ public:
+ ~Pool();
+ };
+ template<typename T>
+ Pool<T>::~Pool() {
+ for (typename T::iterator I = this->begin(), E = this->end(); I != E; ++I) {
+ typename T::value_type &Item = *I;
+ delete Item.second;
+ }
+ }
+}
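The Pool wrapper changes nothing about how the maps are used; it only adds a destructor that deletes the uniqued objects when the static map itself is torn down at shutdown, so the interned Inits no longer register as leaks. Usage stays the same, e.g. (illustrative only):

    IntInit *A = IntInit::get(42);
    IntInit *B = IntInit::get(42);
    assert(A == B && "equal values are uniqued to the same object");
    // Both point at one heap allocation, freed once by Pool's destructor.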
+
IntInit *IntInit::get(int64_t V) {
- typedef DenseMap<int64_t, IntInit *> Pool;
- static Pool ThePool;
+ static Pool<DenseMap<int64_t, IntInit *> > ThePool;
IntInit *&I = ThePool[V];
if (!I) I = new IntInit(V);
@@ -586,8 +600,7 @@ IntInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
void StringInit::anchor() { }
StringInit *StringInit::get(StringRef V) {
- typedef StringMap<StringInit *> Pool;
- static Pool ThePool;
+ static Pool<StringMap<StringInit *> > ThePool;
StringInit *&I = ThePool[V];
if (!I) I = new StringInit(V);
@@ -726,9 +739,7 @@ Init *OpInit::getBit(unsigned Bit) const {
UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
-
- typedef DenseMap<Key, UnOpInit *> Pool;
- static Pool ThePool;
+ static Pool<DenseMap<Key, UnOpInit *> > ThePool;
Key TheKey(std::make_pair(std::make_pair(opc, lhs), Type));
@@ -873,8 +884,7 @@ BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
RecTy *
> Key;
- typedef DenseMap<Key, BinOpInit *> Pool;
- static Pool ThePool;
+ static Pool<DenseMap<Key, BinOpInit *> > ThePool;
Key TheKey(std::make_pair(std::make_pair(std::make_pair(opc, lhs), rhs),
Type));
@@ -1298,8 +1308,7 @@ VarInit *VarInit::get(const std::string &VN, RecTy *T) {
VarInit *VarInit::get(Init *VN, RecTy *T) {
typedef std::pair<RecTy *, Init *> Key;
- typedef DenseMap<Key, VarInit *> Pool;
- static Pool ThePool;
+ static Pool<DenseMap<Key, VarInit *> > ThePool;
Key TheKey(std::make_pair(T, VN));
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 965cd00..daac574 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -2496,6 +2496,9 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
if (Lex.getCode() != tgtok::comma) break;
Lex.Lex(); // eat ','.
+ if (Lex.getCode() != tgtok::Id)
+ return TokError("expected identifier");
+
SubClassLoc = Lex.getLoc();
// A defm can inherit from regular classes (non-multiclass) as
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e17052b..9c2c69a 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -21,8 +21,11 @@ include "llvm/Target/Target.td"
// AArch64 Subtarget features.
//
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
+ "Enable ARMv8 FP">;
+
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
- "Enable Advanced SIMD instructions">;
+ "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
"Enable cryptographic instructions">;
@@ -33,7 +36,7 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
include "AArch64Schedule.td"
-def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>;
+def : Processor<"generic", GenericItineraries, [FeatureFPARMv8]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 9498722..d59ca56 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -32,17 +32,18 @@ using namespace llvm;
/// argument to be printed as "bN".
static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
const TargetRegisterInfo *TRI,
- const TargetRegisterClass &RegClass,
- raw_ostream &O) {
+ char RegType, raw_ostream &O) {
if (!MO.isReg())
return true;
for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
- if (RegClass.contains(*AR)) {
- O << AArch64InstPrinter::getRegisterName(*AR);
+ if (AArch64::FPR8RegClass.contains(*AR)) {
+ O << RegType << TRI->getEncodingValue(MO.getReg());
return false;
}
}
+
+ // The register doesn't correspond to anything floating-point like.
return true;
}
@@ -81,9 +82,9 @@ bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
StringRef Modifier;
switch (MO.getType()) {
default:
- llvm_unreachable("Unexpected operand for symbolic address constraint");
+ return true;
case MachineOperand::MO_GlobalAddress:
- Name = Mang->getSymbol(MO.getGlobal())->getName();
+ Name = getSymbol(MO.getGlobal())->getName();
// Global variables may be accessed either via a GOT or in various fun and
// interesting TLS-model specific ways. Set the prefix modifier as
@@ -145,57 +146,29 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant,
const char *ExtraCode, raw_ostream &O) {
const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
- if (!ExtraCode || !ExtraCode[0]) {
- // There's actually no operand modifier, which leads to a slightly eclectic
- // set of behaviour which we have to handle here.
- const MachineOperand &MO = MI->getOperand(OpNum);
- switch (MO.getType()) {
- default:
- llvm_unreachable("Unexpected operand for inline assembly");
- case MachineOperand::MO_Register:
- // GCC prints the unmodified operand of a 'w' constraint as the vector
- // register. Technically, we could allocate the argument as a VPR128, but
- // that leads to extremely dodgy copies being generated to get the data
- // there.
- if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O))
- O << AArch64InstPrinter::getRegisterName(MO.getReg());
- break;
- case MachineOperand::MO_Immediate:
- O << '#' << MO.getImm();
- break;
- case MachineOperand::MO_FPImmediate:
- assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
- O << "#0.0";
- break;
- case MachineOperand::MO_BlockAddress:
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ExternalSymbol:
- return printSymbolicAddress(MO, false, "", O);
- }
- return false;
- }
- // We have a real modifier to handle.
+ if (!ExtraCode)
+ ExtraCode = "";
+
switch(ExtraCode[0]) {
default:
- // See if this is a generic operand
- return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
- case 'c': // Don't print "#" before an immediate operand.
- if (!MI->getOperand(OpNum).isImm())
- return true;
- O << MI->getOperand(OpNum).getImm();
- return false;
+ if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+ return false;
+ break;
case 'w':
// Output 32-bit general register operand, constant zero as wzr, or stack
// pointer as wsp. Ignored when used with other operand types.
- return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::GPR32RegClass, O);
+ if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+ AArch64::GPR32RegClass, O))
+ return false;
+ break;
case 'x':
// Output 64-bit general register operand, constant zero as xzr, or stack
// pointer as sp. Ignored when used with other operand types.
- return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::GPR64RegClass, O);
+ if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+ AArch64::GPR64RegClass, O))
+ return false;
+ break;
case 'H':
// Output higher numbered of a 64-bit general register pair
case 'Q':
@@ -211,40 +184,65 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
// copies ...).
llvm_unreachable("FIXME: Unimplemented register pairs");
case 'b':
- // Output 8-bit FP/SIMD scalar register operand, prefixed with b.
- return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::FPR8RegClass, O);
case 'h':
- // Output 16-bit FP/SIMD scalar register operand, prefixed with h.
- return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::FPR16RegClass, O);
case 's':
- // Output 32-bit FP/SIMD scalar register operand, prefixed with s.
- return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::FPR32RegClass, O);
case 'd':
- // Output 64-bit FP/SIMD scalar register operand, prefixed with d.
- return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::FPR64RegClass, O);
case 'q':
- // Output 128-bit FP/SIMD scalar register operand, prefixed with q.
- return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
- AArch64::FPR128RegClass, O);
+ if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+ ExtraCode[0], O))
+ return false;
+ break;
case 'A':
// Output symbolic address with appropriate relocation modifier (also
// suitable for ADRP).
- return printSymbolicAddress(MI->getOperand(OpNum), false, "", O);
+ if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O))
+ return false;
+ break;
case 'L':
// Output bits 11:0 of symbolic address with appropriate :lo12: relocation
// modifier.
- return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O);
+ if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O))
+ return false;
+ break;
case 'G':
// Output bits 23:12 of symbolic address with appropriate :hi12: relocation
// modifier (currently only for TLS local exec).
- return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O);
+ if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O))
+ return false;
+ break;
+ case 'a':
+ return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
}
+ // There's actually no operand modifier, which leads to a slightly eclectic
+ // set of behaviour which we have to handle here.
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("Unexpected operand for inline assembly");
+ case MachineOperand::MO_Register:
+ // GCC prints the unmodified operand of a 'w' constraint as the vector
+ // register. Technically, we could allocate the argument as a VPR128, but
+ // that leads to extremely dodgy copies being generated to get the data
+ // there.
+ if (printModifiedFPRAsmOperand(MO, TRI, 'v', O))
+ O << AArch64InstPrinter::getRegisterName(MO.getReg());
+ break;
+ case MachineOperand::MO_Immediate:
+ O << '#' << MO.getImm();
+ break;
+ case MachineOperand::MO_FPImmediate:
+ assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
+ O << "#0.0";
+ break;
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ return printSymbolicAddress(MO, false, "", O);
+ }
+ return false;
}
bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
index bff7eeb..a2a9f3f 100644
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ b/lib/Target/AArch64/AArch64CallingConv.td
@@ -59,9 +59,9 @@ def CC_A64_APCS : CallingConv<[
// Canonicalise the various types that live in different floating-point
// registers. This makes sense because the PCS does not distinguish Short
// Vectors and Floating-point types.
- CCIfType<[v2i8], CCBitConvertToType<f16>>,
- CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>,
- CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64], CCBitConvertToType<f64>>,
+ CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>,
+ CCIfType<[v1i32, v4i8, v2i16, v1f32], CCBitConvertToType<f32>>,
+ CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCBitConvertToType<f128>>,
@@ -70,7 +70,8 @@ def CC_A64_APCS : CallingConv<[
// argument is allocated to the least significant bits of register
// v[NSRN]. The NSRN is incremented by one. The argument has now been
// allocated."
- CCIfType<[f16], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+ CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+ CCIfType<[f16], CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index ee819e0..ef99541 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -109,6 +109,45 @@ public:
SDNode* Select(SDNode*);
private:
+ /// Get the opcode for the table lookup instruction.
+ unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
+
+ /// Select NEON table lookup intrinsics. NumVecs should be 1, 2, 3 or 4.
+ /// IsExt indicates whether the result will be extended with an argument.
+ SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
+
+ /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
+ SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcode);
+
+ /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4.
+ SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes);
+
+ /// Form sequences of consecutive 64/128-bit registers for use in NEON
+ /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+ /// between 1 and 4 elements. If it contains a single element, that element is
+ /// returned unchanged; otherwise a REG_SEQUENCE value is returned.
+ SDValue createDTuple(ArrayRef<SDValue> Vecs);
+ SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+ /// Generic helper for the createDTuple/createQTuple
+ /// functions. Those should almost always be called instead.
+ SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
+ unsigned SubRegs[]);
+
+ /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4.
+ /// The opcode array specifies the instructions used for load.
+ SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes);
+
+ /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4.
+ /// The opcode arrays specify the instructions used for load/store.
+ SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs, const uint16_t *Opcodes);
+
+ SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+ SDValue Operand);
};
}
@@ -390,12 +429,607 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
&Ops[0], Ops.size());
}
+SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+ static unsigned RegClassIDs[] = { AArch64::DPairRegClassID,
+ AArch64::DTripleRegClassID,
+ AArch64::DQuadRegClassID };
+ static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1,
+ AArch64::dsub_2, AArch64::dsub_3 };
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+ static unsigned RegClassIDs[] = { AArch64::QPairRegClassID,
+ AArch64::QTripleRegClassID,
+ AArch64::QQuadRegClassID };
+ static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1,
+ AArch64::qsub_2, AArch64::qsub_3 };
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+ unsigned RegClassIDs[],
+ unsigned SubRegs[]) {
+ // There's no special register-class for a vector-list of 1 element: it's just
+ // a vector.
+ if (Regs.size() == 1)
+ return Regs[0];
+
+ assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+ SDLoc DL(Regs[0].getNode());
+
+ SmallVector<SDValue, 4> Ops;
+
+ // First operand of REG_SEQUENCE is the desired RegClass.
+ Ops.push_back(
+ CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
+
+ // Then we get pairs of source & subregister-position for the components.
+ for (unsigned i = 0; i < Regs.size(); ++i) {
+ Ops.push_back(Regs[i]);
+ Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
+ }
+
+ SDNode *N =
+ CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+ return SDValue(N, 0);
+}
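For example, calling createDTuple with two 64-bit vector values produces a single REG_SEQUENCE node whose operands are the DPairRegClassID constant followed by each value paired with dsub_0 and dsub_1, which is the untyped register-tuple operand the ldN/tbl selection code below expects.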
+
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+ switch (Opc) {
+ default: break;
+ case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register;
+ case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register;
+ case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register;
+ case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register;
+ case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register;
+ case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register;
+ case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register;
+ case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register;
+
+ case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register;
+ case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register;
+ case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register;
+ case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register;
+ case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register;
+ case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register;
+ case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register;
+
+ case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register;
+ case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register;
+ case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register;
+ case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register;
+ case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register;
+ case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register;
+ case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register;
+
+ case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register;
+ case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register;
+ case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register;
+ case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register;
+ case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register;
+ case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register;
+ case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register;
+
+ case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register;
+ case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register;
+ case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register;
+ case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register;
+ case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register;
+ case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register;
+ case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register;
+ case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register;
+
+ case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register;
+ case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register;
+ case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register;
+ case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register;
+ case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register;
+ case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register;
+ case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register;
+ case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register;
+
+ case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register;
+ case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register;
+ case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register;
+ case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register;
+ case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register;
+ case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register;
+ case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register;
+ case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register;
+
+ case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register;
+ case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register;
+ case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register;
+ case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register;
+ case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register;
+ case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register;
+ case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register;
+ case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register;
+
+ case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register;
+ case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register;
+ case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register;
+ case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register;
+ case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register;
+ case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register;
+ case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register;
+
+ case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register;
+ case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register;
+ case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register;
+ case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register;
+ case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register;
+ case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register;
+ case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register;
+
+ case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register;
+ case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register;
+ case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register;
+ case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register;
+ case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register;
+ case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register;
+ case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register;
+
+ case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register;
+ case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register;
+ case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register;
+ case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register;
+ case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register;
+ case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register;
+ case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register;
+ case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register;
+
+ case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register;
+ case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register;
+ case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register;
+ case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register;
+ case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register;
+ case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register;
+ case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register;
+ case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register;
+
+ case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register;
+ case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register;
+ case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register;
+ case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register;
+ case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register;
+ case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register;
+ case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register;
+ case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register;
+
+ // Post-indexed duplicate loads
+ case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register;
+ case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register;
+ case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register;
+ case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register;
+ case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register;
+ case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register;
+ case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register;
+ case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register;
+
+ case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register;
+ case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register;
+ case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register;
+ case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register;
+ case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register;
+ case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register;
+ case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register;
+ case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register;
+
+ case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register;
+ case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register;
+ case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register;
+ case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register;
+ case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register;
+ case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register;
+ case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register;
+ case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register;
+
+ // Post-indexed lane loads
+ case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register;
+ case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register;
+ case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register;
+ case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register;
+
+ case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register;
+ case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register;
+ case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register;
+ case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register;
+
+ case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register;
+ case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register;
+ case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register;
+ case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register;
+
+ // Post-indexed lane stores
+ case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register;
+ case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register;
+ case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register;
+ case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register;
+
+ case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register;
+ case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register;
+ case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register;
+ case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register;
+
+ case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register;
+ case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register;
+ case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register;
+ case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register;
+ }
+ return Opc; // If not one we handle, return it unchanged.
+}
+
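+// Illustrative note (not part of the original patch): the Opcodes arrays
+// handed to SelectVLD/SelectVST/SelectVLDDup are laid out with the four
+// 64-bit-vector forms first (8B, 4H, 2S, 1D) followed by the four
+// 128-bit-vector forms (16B, 8H, 4S, 2D), which is exactly the indexing
+// computed by the switch on getScalarType().getSizeInBits() below.
+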
+SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *Opcodes) {
+ assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+
+ EVT VT = N->getValueType(0);
+ unsigned OpcodeIndex;
+ bool is64BitVector = VT.is64BitVector();
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+ case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+ case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+ case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+ default: llvm_unreachable("unhandled vector load type");
+ }
+ unsigned Opc = Opcodes[OpcodeIndex];
+
+ SmallVector<SDValue, 2> Ops;
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ }
+
+ Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+ SmallVector<EVT, 3> ResTys;
+ // Push back the type of return super register
+ if (NumVecs == 1)
+ ResTys.push_back(VT);
+ else if (NumVecs == 3)
+ ResTys.push_back(MVT::Untyped);
+ else {
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+ is64BitVector ? NumVecs : NumVecs * 2);
+ ResTys.push_back(ResTy);
+ }
+
+ if (isUpdating)
+ ResTys.push_back(MVT::i64); // Type of the updated register
+ ResTys.push_back(MVT::Other); // Type of the Chain
+ SDLoc dl(N);
+ SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+ if (NumVecs == 1)
+ return VLd;
+
+ // If NumVecs > 1, the return result is a super register containing 2-4
+ // consecutive vector registers.
+ SDValue SuperReg = SDValue(VLd, 0);
+
+ unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ // Update users of the Chain
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
+
+ return NULL;
+}
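+
+// Illustrative note (not part of the original patch): for a multi-vector
+// load such as vld2 of 2 x v8i8, the machine node above produces one wide
+// result; each original result i is rewritten to
+//   EXTRACT_SUBREG(SuperReg, dsub_0 + i)   // qsub_0 + i for 128-bit vectors
+// and the chain (plus the write-back result for post-indexed forms) is
+// rewired to the new node's remaining results.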
+
+SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *Opcodes) {
+ assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+ SDLoc dl(N);
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3;
+ EVT VT = N->getOperand(Vec0Idx).getValueType();
+ unsigned OpcodeIndex;
+ bool is64BitVector = VT.is64BitVector();
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+ case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+ case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+ case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+ default: llvm_unreachable("unhandled vector store type");
+ }
+ unsigned Opc = Opcodes[OpcodeIndex];
+
+ SmallVector<EVT, 2> ResTys;
+ if (isUpdating)
+ ResTys.push_back(MVT::i64);
+ ResTys.push_back(MVT::Other); // Type for the Chain
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ }
+
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+ N->op_begin() + Vec0Idx + NumVecs);
+ SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs);
+ Ops.push_back(SrcReg);
+
+ // Push back the Chain
+ Ops.push_back(N->getOperand(0));
+
+ // Transfer memoperands.
+ SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+ return VSt;
+}
+
+SDValue
+AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+ SDValue Operand) {
+ SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL,
+ VT, VTD, MVT::Other,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ Operand,
+ CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32));
+ return SDValue(Reg, 0);
+}
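+
+// Illustrative note (not part of the original patch): getTargetSubregToReg
+// widens a 64-bit vector into the low half of a 128-bit register via
+//   SUBREG_TO_REG 0, Operand, sub_64
+// which SelectVLDSTLane relies on because the lane load/store patterns are
+// only defined on 128-bit register tuples.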
+
+SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *Opcodes) {
+ assert(NumVecs >= 2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range");
+ SDLoc dl(N);
+
+ EVT VT = N->getValueType(0);
+ unsigned OpcodeIndex;
+ bool is64BitVector = VT.is64BitVector();
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+ case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+ case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+ case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+ default: llvm_unreachable("unhandled vector duplicate lane load type");
+ }
+ unsigned Opc = Opcodes[OpcodeIndex];
+
+ SDValue SuperReg;
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(1)); // Push back the Memory Address
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(2);
+ if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ }
+ Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+ SmallVector<EVT, 3> ResTys;
+ // Push back the type of return super register
+ if (NumVecs == 3)
+ ResTys.push_back(MVT::Untyped);
+ else {
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+ is64BitVector ? NumVecs : NumVecs * 2);
+ ResTys.push_back(ResTy);
+ }
+ if (isUpdating)
+ ResTys.push_back(MVT::i64); // Type of the updated register
+ ResTys.push_back(MVT::Other); // Type of the Chain
+ SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+
+ SuperReg = SDValue(VLdDup, 0);
+ unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+ // Update uses of each register in the super register
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ // Update uses of the Chain
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+ return NULL;
+}
+
+// Load/store lane instructions only exist for 128-bit vector types. If the
+// operand is a 64-bit vector we still select the 128-bit instruction:
+// SUBREG_TO_REG widens the input to a 128-bit vector, and EXTRACT_SUBREG
+// recovers the 64-bit vector from the 128-bit result.
+SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+ bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes) {
+ assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+ SDLoc dl(N);
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3;
+
+ SDValue Chain = N->getOperand(0);
+ unsigned Lane =
+ cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+ EVT VT = N->getOperand(Vec0Idx).getValueType();
+ bool is64BitVector = VT.is64BitVector();
+ EVT VT64; // 64-bit Vector Type
+
+ if (is64BitVector) {
+ VT64 = VT;
+ VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ }
+
+ unsigned OpcodeIndex;
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = 0; break;
+ case 16: OpcodeIndex = 1; break;
+ case 32: OpcodeIndex = 2; break;
+ case 64: OpcodeIndex = 3; break;
+ default: llvm_unreachable("unhandled vector lane load/store type");
+ }
+ unsigned Opc = Opcodes[OpcodeIndex];
+
+ SmallVector<EVT, 3> ResTys;
+ if (IsLoad) {
+ // Push back the type of return super register
+ if (NumVecs == 3)
+ ResTys.push_back(MVT::Untyped);
+ else {
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+ is64BitVector ? NumVecs : NumVecs * 2);
+ ResTys.push_back(ResTy);
+ }
+ }
+ if (isUpdating)
+ ResTys.push_back(MVT::i64); // Type of the updated register
+ ResTys.push_back(MVT::Other); // Type of Chain
+ SmallVector<SDValue, 5> Ops;
+ Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ }
+
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+ N->op_begin() + Vec0Idx + NumVecs);
+ if (is64BitVector)
+ for (unsigned i = 0; i < Regs.size(); i++)
+ Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]);
+ SDValue SuperReg = createQTuple(Regs);
+
+ Ops.push_back(SuperReg); // Source Reg
+ SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32);
+ Ops.push_back(LaneValue);
+ Ops.push_back(Chain); // Push back the Chain
+
+ SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+ if (!IsLoad)
+ return VLdLn;
+
+ // Extract the subregisters.
+ SuperReg = SDValue(VLdLn, 0);
+ unsigned Sub0 = AArch64::qsub_0;
+ // Update uses of each register in the super register
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg);
+ if (is64BitVector) {
+ SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0);
+ }
+ ReplaceUses(SDValue(N, Vec), SUB0);
+ }
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+ return NULL;
+}
+
+unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
+ unsigned NumOfVec) {
+ assert(NumOfVec >= 1 && NumOfVec <= 4 && "TBL/TBX NumOfVec out-of-range");
+
+ unsigned Opc = 0;
+ switch (NumOfVec) {
+ default:
+ break;
+ case 1:
+ if (IsExt)
+ Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b;
+ else
+ Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b;
+ break;
+ case 2:
+ if (IsExt)
+ Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b;
+ else
+ Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b;
+ break;
+ case 3:
+ if (IsExt)
+ Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b;
+ else
+ Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b;
+ break;
+ case 4:
+ if (IsExt)
+ Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b;
+ else
+ Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b;
+ break;
+ }
+
+ return Opc;
+}
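+
+// Illustrative note (not part of the original patch): TBL writes zero to
+// result elements whose index is out of range, while TBX leaves those
+// destination elements unchanged; the _8b/_16b suffix is chosen from the
+// width of the result vector, whereas the table registers themselves are
+// always 128-bit (see the assert in SelectVTBL below).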
+
+SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs,
+ bool IsExt) {
+ assert(NumVecs >= 1 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
+ SDLoc dl(N);
+
+ // The elements of the lookup table for vtbl/vtbx must be 128-bit vectors
+ unsigned Vec0Idx = IsExt ? 2 : 1;
+ assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() &&
+ "The element of lookup table for vtbl and vtbx must be 128-bit");
+
+ // Check whether the result type is a 64-bit vector
+ EVT ResVT = N->getValueType(0);
+ bool is64BitRes = ResVT.is64BitVector();
+
+ // Create new SDValue for vector list
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+ N->op_begin() + Vec0Idx + NumVecs);
+ SDValue TblReg = createQTuple(Regs);
+ unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs);
+
+ SmallVector<SDValue, 3> Ops;
+ if (IsExt)
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(TblReg);
+ Ops.push_back(N->getOperand(Vec0Idx + NumVecs));
+ return CurDAG->getMachineNode(Opc, dl, ResVT, Ops);
+}
+
SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
if (Node->isMachineOpcode()) {
DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+ Node->setNodeId(-1);
return NULL;
}
@@ -535,6 +1169,399 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
Node = ResNode;
break;
}
+ case AArch64ISD::NEON_LD1_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1WB_8B_fixed, AArch64::LD1WB_4H_fixed,
+ AArch64::LD1WB_2S_fixed, AArch64::LD1WB_1D_fixed,
+ AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed,
+ AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 1, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2WB_8B_fixed, AArch64::LD2WB_4H_fixed,
+ AArch64::LD2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed,
+ AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed,
+ AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3WB_8B_fixed, AArch64::LD3WB_4H_fixed,
+ AArch64::LD3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed,
+ AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed,
+ AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4WB_8B_fixed, AArch64::LD4WB_4H_fixed,
+ AArch64::LD4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed,
+ AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed,
+ AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD1x2_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1x2WB_8B_fixed, AArch64::LD1x2WB_4H_fixed,
+ AArch64::LD1x2WB_2S_fixed, AArch64::LD1x2WB_1D_fixed,
+ AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed,
+ AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD1x3_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1x3WB_8B_fixed, AArch64::LD1x3WB_4H_fixed,
+ AArch64::LD1x3WB_2S_fixed, AArch64::LD1x3WB_1D_fixed,
+ AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed,
+ AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD1x4_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1x4WB_8B_fixed, AArch64::LD1x4WB_4H_fixed,
+ AArch64::LD1x4WB_2S_fixed, AArch64::LD1x4WB_1D_fixed,
+ AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed,
+ AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed
+ };
+ return SelectVLD(Node, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_ST1_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1WB_8B_fixed, AArch64::ST1WB_4H_fixed,
+ AArch64::ST1WB_2S_fixed, AArch64::ST1WB_1D_fixed,
+ AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed,
+ AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed
+ };
+ return SelectVST(Node, true, 1, Opcodes);
+ }
+ case AArch64ISD::NEON_ST2_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2WB_8B_fixed, AArch64::ST2WB_4H_fixed,
+ AArch64::ST2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed,
+ AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed,
+ AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed
+ };
+ return SelectVST(Node, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_ST3_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3WB_8B_fixed, AArch64::ST3WB_4H_fixed,
+ AArch64::ST3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed,
+ AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed,
+ AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed
+ };
+ return SelectVST(Node, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_ST4_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4WB_8B_fixed, AArch64::ST4WB_4H_fixed,
+ AArch64::ST4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed,
+ AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed,
+ AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed
+ };
+ return SelectVST(Node, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2DUP: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S,
+ AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H,
+ AArch64::LD2R_4S, AArch64::LD2R_2D
+ };
+ return SelectVLDDup(Node, false, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3DUP: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S,
+ AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H,
+ AArch64::LD3R_4S, AArch64::LD3R_2D
+ };
+ return SelectVLDDup(Node, false, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4DUP: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S,
+ AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H,
+ AArch64::LD4R_4S, AArch64::LD4R_2D
+ };
+ return SelectVLDDup(Node, false, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2DUP_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed,
+ AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed,
+ AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed,
+ AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed
+ };
+ return SelectVLDDup(Node, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3DUP_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed,
+ AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed,
+ AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed,
+ AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed
+ };
+ return SelectVLDDup(Node, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4DUP_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed,
+ AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed,
+ AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed,
+ AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed
+ };
+ return SelectVLDDup(Node, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed,
+ AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, true, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed,
+ AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, true, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed,
+ AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, true, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_ST2LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed,
+ AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, false, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_ST3LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed,
+ AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, false, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_ST4LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed,
+ AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, false, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_ST1x2_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1x2WB_8B_fixed, AArch64::ST1x2WB_4H_fixed,
+ AArch64::ST1x2WB_2S_fixed, AArch64::ST1x2WB_1D_fixed,
+ AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed,
+ AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed
+ };
+ return SelectVST(Node, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_ST1x3_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1x3WB_8B_fixed, AArch64::ST1x3WB_4H_fixed,
+ AArch64::ST1x3WB_2S_fixed, AArch64::ST1x3WB_1D_fixed,
+ AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed,
+ AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed
+ };
+ return SelectVST(Node, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_ST1x4_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1x4WB_8B_fixed, AArch64::ST1x4WB_4H_fixed,
+ AArch64::ST1x4WB_2S_fixed, AArch64::ST1x4WB_1D_fixed,
+ AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed,
+ AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed
+ };
+ return SelectVST(Node, true, 4, Opcodes);
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ bool IsExt = false;
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_vtbx1:
+ IsExt = true;
+ case Intrinsic::aarch64_neon_vtbl1:
+ return SelectVTBL(Node, 1, IsExt);
+ case Intrinsic::aarch64_neon_vtbx2:
+ IsExt = true;
+ case Intrinsic::aarch64_neon_vtbl2:
+ return SelectVTBL(Node, 2, IsExt);
+ case Intrinsic::aarch64_neon_vtbx3:
+ IsExt = true;
+ case Intrinsic::aarch64_neon_vtbl3:
+ return SelectVTBL(Node, 3, IsExt);
+ case Intrinsic::aarch64_neon_vtbx4:
+ IsExt = true;
+ case Intrinsic::aarch64_neon_vtbl4:
+ return SelectVTBL(Node, 4, IsExt);
+ }
+ break;
+ }
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default:
+ break;
+ case Intrinsic::arm_neon_vld1: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D,
+ AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D
+ };
+ return SelectVLD(Node, false, 1, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld2: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D,
+ AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D
+ };
+ return SelectVLD(Node, false, 2, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld3: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D,
+ AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D
+ };
+ return SelectVLD(Node, false, 3, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld4: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D,
+ AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D
+ };
+ return SelectVLD(Node, false, 4, Opcodes);
+ }
+ case Intrinsic::aarch64_neon_vld1x2: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S,
+ AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
+ AArch64::LD1x2_4S, AArch64::LD1x2_2D
+ };
+ return SelectVLD(Node, false, 2, Opcodes);
+ }
+ case Intrinsic::aarch64_neon_vld1x3: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S,
+ AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
+ AArch64::LD1x3_4S, AArch64::LD1x3_2D
+ };
+ return SelectVLD(Node, false, 3, Opcodes);
+ }
+ case Intrinsic::aarch64_neon_vld1x4: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S,
+ AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
+ AArch64::LD1x4_4S, AArch64::LD1x4_2D
+ };
+ return SelectVLD(Node, false, 4, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst1: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D,
+ AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D
+ };
+ return SelectVST(Node, false, 1, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst2: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D,
+ AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D
+ };
+ return SelectVST(Node, false, 2, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst3: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
+ AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
+ };
+ return SelectVST(Node, false, 3, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst4: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
+ AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
+ };
+ return SelectVST(Node, false, 4, Opcodes);
+ }
+ case Intrinsic::aarch64_neon_vst1x2: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S,
+ AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
+ AArch64::ST1x2_4S, AArch64::ST1x2_2D
+ };
+ return SelectVST(Node, false, 2, Opcodes);
+ }
+ case Intrinsic::aarch64_neon_vst1x3: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S,
+ AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
+ AArch64::ST1x3_4S, AArch64::ST1x3_2D
+ };
+ return SelectVST(Node, false, 3, Opcodes);
+ }
+ case Intrinsic::aarch64_neon_vst1x4: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S,
+ AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
+ AArch64::ST1x4_4S, AArch64::ST1x4_2D
+ };
+ return SelectVST(Node, false, 4, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld2lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
+ };
+ return SelectVLDSTLane(Node, true, false, 2, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld3lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
+ };
+ return SelectVLDSTLane(Node, true, false, 3, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld4lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
+ };
+ return SelectVLDSTLane(Node, true, false, 4, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst2lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
+ };
+ return SelectVLDSTLane(Node, false, false, 2, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst3lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
+ };
+ return SelectVLDSTLane(Node, false, false, 3, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst4lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
+ };
+ return SelectVLDSTLane(Node, false, false, 4, Opcodes);
+ }
+ } // End of switch IntNo
+ break;
+ } // End of case ISD::INTRINSIC_VOID and ISD::INTRINSIC_W_CHAIN
default:
break; // Let generic code handle it
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 44b691b..4fdb667 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -50,24 +50,33 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
// Scalar register <-> type mapping
addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
- addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
- addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
- addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
- addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+
+ if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+ addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+ addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+ }
if (Subtarget->hasNEON()) {
// And the vectors
- addRegisterClass(MVT::v8i8, &AArch64::VPR64RegClass);
- addRegisterClass(MVT::v4i16, &AArch64::VPR64RegClass);
- addRegisterClass(MVT::v2i32, &AArch64::VPR64RegClass);
- addRegisterClass(MVT::v1i64, &AArch64::VPR64RegClass);
- addRegisterClass(MVT::v2f32, &AArch64::VPR64RegClass);
- addRegisterClass(MVT::v16i8, &AArch64::VPR128RegClass);
- addRegisterClass(MVT::v8i16, &AArch64::VPR128RegClass);
- addRegisterClass(MVT::v4i32, &AArch64::VPR128RegClass);
- addRegisterClass(MVT::v2i64, &AArch64::VPR128RegClass);
- addRegisterClass(MVT::v4f32, &AArch64::VPR128RegClass);
- addRegisterClass(MVT::v2f64, &AArch64::VPR128RegClass);
+ addRegisterClass(MVT::v1i8, &AArch64::FPR8RegClass);
+ addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
+ addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
+ addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass);
+ addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v8i8, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
+ addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
+ addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
+ addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
+ addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
+ addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
+ addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
}
computeRegisterProperties();
@@ -77,6 +86,12 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::SHL);
+
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
// AArch64 does not have i1 loads, or much of anything for i1 really.
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -270,28 +285,89 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setExceptionSelectorRegister(AArch64::X1);
if (Subtarget->hasNEON()) {
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
+
setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v1f32, Custom);
setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
}
}
@@ -333,6 +409,29 @@ static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
StrOpc = StoreOps[Log2_32(Size)];
}
+// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass have no value
+// type mapped to them; both are defined as MVT::Untyped. Without knowing the
+// MVT, MachineLICM::getRegisterClassIDAndCost cannot compute the register
+// pressure correctly.
+std::pair<const TargetRegisterClass*, uint8_t>
+AArch64TargetLowering::findRepresentativeClass(MVT VT) const {
+ const TargetRegisterClass *RRC = 0;
+ uint8_t Cost = 1;
+ switch (VT.SimpleTy) {
+ default:
+ return TargetLowering::findRepresentativeClass(VT);
+ case MVT::v4i64:
+ RRC = &AArch64::QPairRegClass;
+ Cost = 2;
+ break;
+ case MVT::v8i64:
+ RRC = &AArch64::QQuadRegClass;
+ Cost = 4;
+ break;
+ }
+ return std::make_pair(RRC, Cost);
+}
+
MachineBasicBlock *
AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned Size,
@@ -658,6 +757,12 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MBB->addSuccessor(TrueBB);
MBB->addSuccessor(EndBB);
+ if (!NZCVKilled) {
+ // NZCV is live-through TrueBB.
+ TrueBB->addLiveIn(AArch64::NZCV);
+ EndBB->addLiveIn(AArch64::NZCV);
+ }
+
// IfTrue:
// str qIFTRUE, [sp]
BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
@@ -672,8 +777,6 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
// Done:
// ldr qDEST, [sp]
// [... rest of incoming MBB ...]
- if (!NZCVKilled)
- EndBB->addLiveIn(AArch64::NZCV);
MachineInstr *StartOfEnd = EndBB->begin();
BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
.addFrameIndex(ScratchFI)
@@ -833,6 +936,86 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
return "AArch64ISD::NEON_CMPZ";
case AArch64ISD::NEON_TST:
return "AArch64ISD::NEON_TST";
+ case AArch64ISD::NEON_QSHLs:
+ return "AArch64ISD::NEON_QSHLs";
+ case AArch64ISD::NEON_QSHLu:
+ return "AArch64ISD::NEON_QSHLu";
+ case AArch64ISD::NEON_VDUP:
+ return "AArch64ISD::NEON_VDUP";
+ case AArch64ISD::NEON_VDUPLANE:
+ return "AArch64ISD::NEON_VDUPLANE";
+ case AArch64ISD::NEON_REV16:
+ return "AArch64ISD::NEON_REV16";
+ case AArch64ISD::NEON_REV32:
+ return "AArch64ISD::NEON_REV32";
+ case AArch64ISD::NEON_REV64:
+ return "AArch64ISD::NEON_REV64";
+ case AArch64ISD::NEON_UZP1:
+ return "AArch64ISD::NEON_UZP1";
+ case AArch64ISD::NEON_UZP2:
+ return "AArch64ISD::NEON_UZP2";
+ case AArch64ISD::NEON_ZIP1:
+ return "AArch64ISD::NEON_ZIP1";
+ case AArch64ISD::NEON_ZIP2:
+ return "AArch64ISD::NEON_ZIP2";
+ case AArch64ISD::NEON_TRN1:
+ return "AArch64ISD::NEON_TRN1";
+ case AArch64ISD::NEON_TRN2:
+ return "AArch64ISD::NEON_TRN2";
+ case AArch64ISD::NEON_LD1_UPD:
+ return "AArch64ISD::NEON_LD1_UPD";
+ case AArch64ISD::NEON_LD2_UPD:
+ return "AArch64ISD::NEON_LD2_UPD";
+ case AArch64ISD::NEON_LD3_UPD:
+ return "AArch64ISD::NEON_LD3_UPD";
+ case AArch64ISD::NEON_LD4_UPD:
+ return "AArch64ISD::NEON_LD4_UPD";
+ case AArch64ISD::NEON_ST1_UPD:
+ return "AArch64ISD::NEON_ST1_UPD";
+ case AArch64ISD::NEON_ST2_UPD:
+ return "AArch64ISD::NEON_ST2_UPD";
+ case AArch64ISD::NEON_ST3_UPD:
+ return "AArch64ISD::NEON_ST3_UPD";
+ case AArch64ISD::NEON_ST4_UPD:
+ return "AArch64ISD::NEON_ST4_UPD";
+ case AArch64ISD::NEON_LD1x2_UPD:
+ return "AArch64ISD::NEON_LD1x2_UPD";
+ case AArch64ISD::NEON_LD1x3_UPD:
+ return "AArch64ISD::NEON_LD1x3_UPD";
+ case AArch64ISD::NEON_LD1x4_UPD:
+ return "AArch64ISD::NEON_LD1x4_UPD";
+ case AArch64ISD::NEON_ST1x2_UPD:
+ return "AArch64ISD::NEON_ST1x2_UPD";
+ case AArch64ISD::NEON_ST1x3_UPD:
+ return "AArch64ISD::NEON_ST1x3_UPD";
+ case AArch64ISD::NEON_ST1x4_UPD:
+ return "AArch64ISD::NEON_ST1x4_UPD";
+ case AArch64ISD::NEON_LD2DUP:
+ return "AArch64ISD::NEON_LD2DUP";
+ case AArch64ISD::NEON_LD3DUP:
+ return "AArch64ISD::NEON_LD3DUP";
+ case AArch64ISD::NEON_LD4DUP:
+ return "AArch64ISD::NEON_LD4DUP";
+ case AArch64ISD::NEON_LD2DUP_UPD:
+ return "AArch64ISD::NEON_LD2DUP_UPD";
+ case AArch64ISD::NEON_LD3DUP_UPD:
+ return "AArch64ISD::NEON_LD3DUP_UPD";
+ case AArch64ISD::NEON_LD4DUP_UPD:
+ return "AArch64ISD::NEON_LD4DUP_UPD";
+ case AArch64ISD::NEON_LD2LN_UPD:
+ return "AArch64ISD::NEON_LD2LN_UPD";
+ case AArch64ISD::NEON_LD3LN_UPD:
+ return "AArch64ISD::NEON_LD3LN_UPD";
+ case AArch64ISD::NEON_LD4LN_UPD:
+ return "AArch64ISD::NEON_LD4LN_UPD";
+ case AArch64ISD::NEON_ST2LN_UPD:
+ return "AArch64ISD::NEON_ST2LN_UPD";
+ case AArch64ISD::NEON_ST3LN_UPD:
+ return "AArch64ISD::NEON_ST3LN_UPD";
+ case AArch64ISD::NEON_ST4LN_UPD:
+ return "AArch64ISD::NEON_ST4LN_UPD";
+ case AArch64ISD::NEON_VEXTRACT:
+ return "AArch64ISD::NEON_VEXTRACT";
default:
return NULL;
}
@@ -908,24 +1091,31 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
}
}
+ if (getSubtarget()->hasFPARMv8()) {
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
- if (FPRSaveSize != 0) {
- FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
-
- SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
-
- for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
- unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
- &AArch64::FPR128RegClass);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
- SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 16),
- false, false, 0);
- MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(16, getPointerTy()));
+ // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
+ // can omit a register save area if we know we'll never use registers of
+ // that class.
+ if (FPRSaveSize != 0) {
+ FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+ SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+ for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+ unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
+ &AArch64::FPR128RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+ SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(i * 16),
+ false, false, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+ DAG.getConstant(16, getPointerTy()));
+ }
}
+ FuncInfo->setVariadicFPRIdx(FPRIdx);
+ FuncInfo->setVariadicFPRSize(FPRSaveSize);
}
int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
@@ -933,8 +1123,6 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
FuncInfo->setVariadicStackIdx(StackIdx);
FuncInfo->setVariadicGPRIdx(GPRIdx);
FuncInfo->setVariadicGPRSize(GPRSaveSize);
- FuncInfo->setVariadicFPRIdx(FPRIdx);
- FuncInfo->setVariadicFPRSize(FPRSaveSize);
if (!MemOps.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
@@ -1875,7 +2063,7 @@ AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SrcVal = Op.getOperand(0);
return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
- /*isSigned*/ false, SDLoc(Op));
+ /*isSigned*/ false, SDLoc(Op)).first;
}
SDValue
@@ -1905,6 +2093,45 @@ AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
return LowerF128ToCall(Op, DAG, LC);
}
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setReturnAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset = DAG.getConstant(8, MVT::i64);
+ return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+ MachinePointerInfo(), false, false, false, 0);
+ }
+
+ // Return X30, which contains the return address. Mark it an implicit live-in.
+ unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
+}
+
+
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
+ const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned FrameReg = AArch64::X29;
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ return FrameAddr;
+}
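+
+// Illustrative note (not part of the original patch): together these lower
+// @llvm.returnaddress/@llvm.frameaddress. Depth 0 reads X30/X29 directly;
+// deeper requests walk the chain of saved frame pointers and load the
+// return address from [frame pointer, #8].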
+
SDValue
AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
SelectionDAG &DAG) const {
@@ -2650,6 +2877,8 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -2664,6 +2893,7 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
}
return SDValue();
@@ -3235,6 +3465,336 @@ static SDValue PerformSRACombine(SDNode *N,
DAG.getConstant(LSB + Width - 1, MVT::i64));
}
+/// Check if this is a valid build_vector for the immediate operand of
+/// a vector shift operation, where all the elements of the build_vector
+/// must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+ // Ignore bit_converts.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+ HasAnyUndefs, ElementBits) ||
+ SplatBitSize > ElementBits)
+ return false;
+ Cnt = SplatBits.getSExtValue();
+ return true;
+}
+
+/// Check if this is a valid build_vector for the immediate operand of
+/// a vector shift left operation. That value must be in the range:
+/// 0 <= Value < ElementBits
+static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 0 && Cnt < ElementBits);
+}
+
+/// Check if this is a valid build_vector for the immediate operand of a
+/// vector shift right operation. The value must be in the range:
+/// 1 <= Value <= ElementBits
+static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
+ assert(VT.isVector() && "vector shift count is not a vector type");
+ unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+ if (!getVShiftImm(Op, ElementBits, Cnt))
+ return false;
+ return (Cnt >= 1 && Cnt <= ElementBits);
+}
+
+/// Checks for immediate versions of vector shifts and lowers them.
+static SDValue PerformShiftCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *ST) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
+ return PerformSRACombine(N, DCI);
+
+ // Nothing to be done for scalar shifts.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!VT.isVector() || !TLI.isTypeLegal(VT))
+ return SDValue();
+
+ assert(ST->hasNEON() && "unexpected vector shift");
+ int64_t Cnt;
+
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("unexpected shift opcode");
+
+ case ISD::SHL:
+ if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
+ SDValue RHS =
+ DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
+ DAG.getConstant(Cnt, MVT::i32));
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
+ }
+ break;
+
+ case ISD::SRA:
+ case ISD::SRL:
+ if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
+ SDValue RHS =
+ DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
+ DAG.getConstant(Cnt, MVT::i32));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
+ }
+ break;
+ }
+
+ return SDValue();
+}
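+
+// Illustrative note (not part of the original patch): with the combine above,
+// a legal vector shift by a constant splat, e.g.
+//   shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+// has its shift amount rewritten as a NEON_VDUP of the immediate, which the
+// instruction patterns can then match as a shift-by-immediate.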
+
+/// AArch64-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+
+ switch (IntNo) {
+ default:
+ // Don't do anything for most intrinsics.
+ break;
+
+ case Intrinsic::arm_neon_vqshifts:
+ case Intrinsic::arm_neon_vqshiftu:
+ EVT VT = N->getOperand(1).getValueType();
+ int64_t Cnt;
+ if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
+ break;
+ unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
+ ? AArch64ISD::NEON_QSHLs
+ : AArch64ISD::NEON_QSHLu;
+ return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+ }
+
+ return SDValue();
+}
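+
+// Illustrative note (not part of the original patch): the combine above only
+// fires when the saturating-shift intrinsic's shift amount is an in-range
+// constant splat; the intrinsic is then re-expressed as NEON_QSHLs/NEON_QSHLu
+// with an i32 immediate so it can be selected as a shift-by-immediate form.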
+
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+ N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+ SDValue Addr = N->getOperand(AddrOpIdx);
+
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load/store. Otherwise, folding
+ // it would create a cycle.
+ if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+ continue;
+
+ // Find the new opcode for the updating load/store.
+ bool isLoad = true;
+ bool isLaneOp = false;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ if (isIntrinsic) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD;
+ NumVecs = 1; break;
+ case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD;
+ NumVecs = 4; break;
+ case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD;
+ NumVecs = 1; isLoad = false; break;
+ case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD;
+ NumVecs = 2; isLoad = false; break;
+ case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD;
+ NumVecs = 3; isLoad = false; break;
+ case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD;
+ NumVecs = 4; isLoad = false; break;
+ case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
+ NumVecs = 4; break;
+ case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
+ NumVecs = 2; isLoad = false; break;
+ case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
+ NumVecs = 3; isLoad = false; break;
+ case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
+ NumVecs = 4; isLoad = false; break;
+ case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD;
+ NumVecs = 2; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD;
+ NumVecs = 3; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD;
+ NumVecs = 4; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD;
+ NumVecs = 2; isLoad = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD;
+ NumVecs = 3; isLoad = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD;
+ NumVecs = 4; isLoad = false; isLaneOp = true; break;
+ }
+ } else {
+ isLaneOp = true;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unexpected opcode for Neon base update");
+ case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
+ NumVecs = 2; break;
+ case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
+ NumVecs = 3; break;
+ case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
+ NumVecs = 4; break;
+ }
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoad)
+ VecTy = N->getValueType(0);
+ else
+ VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (isLaneOp)
+ NumBytes /= VecTy.getVectorNumElements();
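+ // Illustrative example: a vld3 of v4i32 references 3 * 16 = 48 bytes,
+ // whereas a vld3lane of v4i32 only references 3 * 4 = 12 bytes.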
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+ uint32_t IncVal = CInc->getZExtValue();
+ if (IncVal != NumBytes)
+ continue;
+ Inc = DAG.getTargetConstant(IncVal, MVT::i32);
+ }
+
+ // Create the new updating load/store node.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = VecTy;
+ Tys[n++] = MVT::i64;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(AddrOpIdx));
+ Ops.push_back(Inc);
+ for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+ Ops.push_back(N->getOperand(i));
+ }
+ MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
+ Ops.data(), Ops.size(),
+ MemInt->getMemoryVT(),
+ MemInt->getMemOperand());
+
+ // Update the uses.
+ std::vector<SDValue> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i) {
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+ }
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+ break;
+ }
+ return SDValue();
+}
+
+/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
+/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
+/// If so, combine them to a vldN-dup operation and return the new node.
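+/// As an illustrative example, if every non-chain use of a vld2lane result is
+/// a NEON_VDUPLANE of the loaded lane, the load can be rewritten as a
+/// NEON_LD2DUP (LD2R), which loads one element per vector and replicates it
+/// to all lanes.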
+static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ // Check if the VDUPLANE operand is a vldN-lane intrinsic.
+ SDNode *VLD = N->getOperand(0).getNode();
+ if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return SDValue();
+ unsigned NumVecs = 0;
+ unsigned NewOpc = 0;
+ unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+ if (IntNo == Intrinsic::arm_neon_vld2lane) {
+ NumVecs = 2;
+ NewOpc = AArch64ISD::NEON_LD2DUP;
+ } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+ NumVecs = 3;
+ NewOpc = AArch64ISD::NEON_LD3DUP;
+ } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+ NumVecs = 4;
+ NewOpc = AArch64ISD::NEON_LD4DUP;
+ } else {
+ return SDValue();
+ }
+
+ // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+ // numbers match the load.
+ unsigned VLDLaneNo =
+ cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
+ for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+ UI != UE; ++UI) {
+ // Ignore uses of the chain result.
+ if (UI.getUse().getResNo() == NumVecs)
+ continue;
+ SDNode *User = *UI;
+ if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
+ VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+ return SDValue();
+ }
+
+ // Create the vldN-dup node.
+ EVT Tys[5];
+ unsigned n;
+ for (n = 0; n < NumVecs; ++n)
+ Tys[n] = VT;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
+ SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+ MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+ SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
+ VLDMemInt->getMemoryVT(),
+ VLDMemInt->getMemOperand());
+
+ // Update the uses.
+ for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+ UI != UE; ++UI) {
+ unsigned ResNo = UI.getUse().getResNo();
+ // Ignore uses of the chain result.
+ if (ResNo == NumVecs)
+ continue;
+ SDNode *User = *UI;
+ DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+ }
+
+ // Now the vldN-lane intrinsic is dead except for its chain result.
+ // Update uses of the chain.
+ std::vector<SDValue> VLDDupResults;
+ for (unsigned n = 0; n < NumVecs; ++n)
+ VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+ VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+ DCI.CombineTo(VLD, VLDDupResults);
+
+ return SDValue(N, 0);
+}
SDValue
AArch64TargetLowering::PerformDAGCombine(SDNode *N,
@@ -3243,7 +3803,45 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::AND: return PerformANDCombine(N, DCI);
case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
- case ISD::SRA: return PerformSRACombine(N, DCI);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ return PerformShiftCombine(N, DCI, getSubtarget());
+ case ISD::INTRINSIC_WO_CHAIN:
+ return PerformIntrinsicCombine(N, DCI.DAG);
+ case AArch64ISD::NEON_VDUPLANE:
+ return CombineVLDDUP(N, DCI);
+ case AArch64ISD::NEON_LD2DUP:
+ case AArch64ISD::NEON_LD3DUP:
+ case AArch64ISD::NEON_LD4DUP:
+ return CombineBaseUpdate(N, DCI);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::aarch64_neon_vld1x2:
+ case Intrinsic::aarch64_neon_vld1x3:
+ case Intrinsic::aarch64_neon_vld1x4:
+ case Intrinsic::aarch64_neon_vst1x2:
+ case Intrinsic::aarch64_neon_vst1x3:
+ case Intrinsic::aarch64_neon_vst1x4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane:
+ return CombineBaseUpdate(N, DCI);
+ default:
+ break;
+ }
}
return SDValue();
}
@@ -3269,6 +3867,59 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+// Check whether a BUILD_VECTOR can be represented as a VECTOR_SHUFFLE of two
+// vectors. If so, lower it by calling LowerVECTOR_SHUFFLE.
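+// As an illustrative example, a v4i32 BUILD_VECTOR of
+//   extract(A, 0), extract(A, 2), extract(B, 1), extract(B, 3)
+// becomes a vector_shuffle<0, 2, 5, 7> of (A, B), which the shuffle lowering
+// below can then match against the NEON permute/extract patterns.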
+bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
+ SDValue &Res) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned V0NumElts = 0;
+ int Mask[16];
+ SDValue V0, V1;
+
+ // Check that every element is extracted from one of at most two vectors.
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return false;
+
+ if (V0.getNode() == 0) {
+ V0 = Elt.getOperand(0);
+ V0NumElts = V0.getValueType().getVectorNumElements();
+ }
+ if (Elt.getOperand(0) == V0) {
+ Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
+ continue;
+ } else if (V1.getNode() == 0) {
+ V1 = Elt.getOperand(0);
+ }
+ if (Elt.getOperand(0) == V1) {
+ unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
+ Mask[i] = (Lane + V0NumElts);
+ continue;
+ } else {
+ return false;
+ }
+ }
+
+ if (!V1.getNode() && V0NumElts == NumElts * 2) {
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+ DAG.getConstant(NumElts, MVT::i64));
+ V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+ DAG.getConstant(0, MVT::i64));
+ V0NumElts = V0.getValueType().getVectorNumElements();
+ }
+
+ if (V1.getNode() && NumElts == V0NumElts &&
+ V0NumElts == V1.getValueType().getVectorNumElements()) {
+ SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
+ Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
+ return true;
+ } else
+ return false;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue
@@ -3283,12 +3934,15 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
unsigned SplatBitSize;
bool HasAnyUndefs;
+ bool UseNeonMov = VT.getSizeInBits() >= 64;
+
// Note we favor lowering MOVI over MVNI.
// This has implications on the definition of patterns in TableGen to select
// BIC immediate instructions but not ORR immediate instructions.
// If this lowering order is changed, TableGen patterns for BIC immediate and
// ORR immediate instructions have to be updated.
- if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ if (UseNeonMov &&
+ BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
if (SplatBitSize <= 64) {
// First attempt to use vector immediate-form MOVI
EVT NeonMovVT;
@@ -3336,9 +3990,390 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
}
}
+
+ unsigned NumElts = VT.getVectorNumElements();
+ bool isOnlyLowElement = true;
+ bool usesOnlyOneValue = true;
+ bool hasDominantValue = false;
+ bool isConstant = true;
+
+ // Map of the number of times a particular SDValue appears in the
+ // element list.
+ DenseMap<SDValue, unsigned> ValueCounts;
+ SDValue Value;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ ValueCounts.insert(std::make_pair(V, 0));
+ unsigned &Count = ValueCounts[V];
+
+ // Is this value dominant? (takes up more than half of the lanes)
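+ // E.g. (illustrative) in <a, a, a, b> the non-constant value 'a' fills
+ // three of the four lanes, so it is dominant and the vector is built below
+ // as a DUP of 'a' plus a single lane insert of 'b'.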
+ if (++Count > (NumElts / 2)) {
+ hasDominantValue = true;
+ Value = V;
+ }
+ }
+ if (ValueCounts.size() != 1)
+ usesOnlyOneValue = false;
+ if (!Value.getNode() && ValueCounts.size() > 0)
+ Value = ValueCounts.begin()->first;
+
+ if (ValueCounts.size() == 0)
+ return DAG.getUNDEF(VT);
+
+ // Loads are better lowered with insert_vector_elt, so keep going in that
+ // case; otherwise a single non-undef low element becomes SCALAR_TO_VECTOR.
+ if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (hasDominantValue && EltSize <= 64) {
+ // Use VDUP for non-constant splats.
+ if (!isConstant) {
+ SDValue N;
+
+ // If we are DUPing a value that comes directly from a vector, we could
+ // just use DUPLANE. We can only do this if the lane being extracted
+ // is at a constant index, as the DUP from lane instructions only have
+ // constant-index forms.
+ if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Value->getOperand(1))) {
+ N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+ Value->getOperand(0), Value->getOperand(1));
+ } else
+ N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+ if (!usesOnlyOneValue) {
+ // The dominant value was splatted as 'N', but we now have to insert
+ // all differing elements.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (Op.getOperand(I) == Value)
+ continue;
+ SmallVector<SDValue, 3> Ops;
+ Ops.push_back(N);
+ Ops.push_back(Op.getOperand(I));
+ Ops.push_back(DAG.getConstant(I, MVT::i64));
+ N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+ }
+ }
+ return N;
+ }
+ if (usesOnlyOneValue && isConstant) {
+ return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+ }
+ }
+ // If all elements are constants and the case above didn't get hit, fall back
+ // to the default expansion, which will generate a load from the constant
+ // pool.
+ if (isConstant)
+ return SDValue();
+
+ // Try to lower this as a VECTOR_SHUFFLE instead.
+ SDValue Shuf;
+ if (isKnownShuffleVector(Op, DAG, Shuf))
+ return Shuf;
+
+ // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+ // know the default expansion would otherwise fall back on something even
+ // worse. For a vector with one or two non-undef values, that's
+ // scalar_to_vector for the elements followed by a shuffle (provided the
+ // shuffle is valid for the target) and materialization element by element
+ // on the stack followed by a load for everything else.
+ if (!isConstant && !usesOnlyOneValue) {
+ SDValue Vec = DAG.getUNDEF(VT);
+ for (unsigned i = 0 ; i < NumElts; ++i) {
+ SDValue V = Op.getOperand(i);
+ if (V.getOpcode() == ISD::UNDEF)
+ continue;
+ SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+ }
+ return Vec;
+ }
return SDValue();
}
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize. (The order of the elements
+/// within each block of the vector is reversed.)
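+/// As an illustrative example, for v8i16 and BlockSize == 32 each 32-bit
+/// block holds two 16-bit elements, so the expected mask is
+/// <1, 0, 3, 2, 5, 4, 7, 6> (i.e. REV32 on a .8h arrangement).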
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
+
+ unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+ if (EltSz == 64)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BlockElts = M[0] + 1;
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSz;
+
+ if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+ return false;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if (M[i] < 0)
+ continue; // ignore UNDEF indices
+ if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
+ }
+
+ return true;
+}
+
+// isPermuteMask - Check whether the vector shuffle mask matches a UZP, ZIP or
+// TRN instruction, returning the corresponding AArch64ISD opcode (0 if none).
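+// Illustrative masks for a v4i32 shuffle of (V1, V2), using the checks below:
+//   UZP1 <0, 2, 4, 6>   ZIP1 <0, 4, 1, 5>   TRN1 <0, 4, 2, 6>
+//   UZP2 <1, 3, 5, 7>   ZIP2 <2, 6, 3, 7>   TRN2 <1, 5, 3, 7>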
+static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts < 4)
+ return 0;
+
+ bool ismatch = true;
+
+ // Check UZP1
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned)M[i] != i * 2) {
+ ismatch = false;
+ break;
+ }
+ }
+ if (ismatch)
+ return AArch64ISD::NEON_UZP1;
+
+ // Check UZP2
+ ismatch = true;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned)M[i] != i * 2 + 1) {
+ ismatch = false;
+ break;
+ }
+ }
+ if (ismatch)
+ return AArch64ISD::NEON_UZP2;
+
+ // Check ZIP1
+ ismatch = true;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) {
+ ismatch = false;
+ break;
+ }
+ }
+ if (ismatch)
+ return AArch64ISD::NEON_ZIP1;
+
+ // Check ZIP2
+ ismatch = true;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) {
+ ismatch = false;
+ break;
+ }
+ }
+ if (ismatch)
+ return AArch64ISD::NEON_ZIP2;
+
+ // Check TRN1
+ ismatch = true;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) {
+ ismatch = false;
+ break;
+ }
+ }
+ if (ismatch)
+ return AArch64ISD::NEON_TRN1;
+
+ // Check TRN2
+ ismatch = true;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) {
+ ismatch = false;
+ break;
+ }
+ }
+ if (ismatch)
+ return AArch64ISD::NEON_TRN2;
+
+ return 0;
+}
+
+SDValue
+AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+ // Convert shuffles that are directly supported on NEON to target-specific
+ // DAG nodes, instead of keeping them as shuffles and matching them again
+ // during code selection. This is more efficient and avoids the possibility
+ // of inconsistencies between legalization and selection.
+ ArrayRef<int> ShuffleMask = SVN->getMask();
+
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (EltSize > 64)
+ return SDValue();
+
+ if (isREVMask(ShuffleMask, VT, 64))
+ return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
+ if (isREVMask(ShuffleMask, VT, 32))
+ return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
+ if (isREVMask(ShuffleMask, VT, 16))
+ return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
+
+ unsigned ISDNo = isPermuteMask(ShuffleMask, VT);
+ if (ISDNo)
+ return DAG.getNode(ISDNo, dl, VT, V1, V2);
+
+ // If all elements of the shuffle mask are the same constant, we can
+ // transform this into either NEON_VDUP or NEON_VDUPLANE.
+ if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+ int Lane = SVN->getSplatIndex();
+ // If this is undef splat, generate it via "just" vdup, if possible.
+ if (Lane == -1) Lane = 0;
+
+ // Test if V1 is a SCALAR_TO_VECTOR.
+ if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+ }
+ // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
+ if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+ bool IsScalarToVector = true;
+ for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
+ if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
+ i != (unsigned)Lane) {
+ IsScalarToVector = false;
+ break;
+ }
+ if (IsScalarToVector)
+ return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
+ V1.getOperand(Lane));
+ }
+
+ // Test if V1 is an EXTRACT_SUBVECTOR.
+ if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+ return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
+ DAG.getConstant(Lane + ExtLane, MVT::i64));
+ }
+ // Test if V1 is a CONCAT_VECTORS.
+ if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+ V1.getOperand(1).getOpcode() == ISD::UNDEF) {
+ SDValue Op0 = V1.getOperand(0);
+ assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
+ "Invalid vector lane access");
+ return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
+ DAG.getConstant(Lane, MVT::i64));
+ }
+
+ return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
+ DAG.getConstant(Lane, MVT::i64));
+ }
+
+ int Length = ShuffleMask.size();
+ int V1EltNum = V1.getValueType().getVectorNumElements();
+
+ // If the number of V1 elements is the same as the number of shuffle mask
+ // elements and the mask values are sequential, we can transform this into
+ // NEON_VEXTRACT.
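+ // Illustrative example: for two v8i8 operands, the mask
+ // <3, 4, 5, 6, 7, 8, 9, 10> is sequential starting at 3, so it lowers to
+ // NEON_VEXTRACT (EXT) with a byte index of 3.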
+ if (V1EltNum == Length) {
+ // Check if the shuffle mask is sequential.
+ bool IsSequential = true;
+ int CurMask = ShuffleMask[0];
+ for (int I = 0; I < Length; ++I) {
+ if (ShuffleMask[I] != CurMask) {
+ IsSequential = false;
+ break;
+ }
+ CurMask++;
+ }
+ if (IsSequential) {
+ assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
+ unsigned VecSize = EltSize * V1EltNum;
+ unsigned Index = (EltSize/8) * ShuffleMask[0];
+ if (VecSize == 64 || VecSize == 128)
+ return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
+ DAG.getConstant(Index, MVT::i64));
+ }
+ }
+
+ // For a shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate an
+ // insert by element from V2 into V1.
+ // If the shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 is a
+ // better choice to insert into than V1, since fewer inserts are needed;
+ // so we count the elements that need inserting for both V1 and V2 and
+ // select the one that needs fewer as the insertion target.
+
+ // Collect the elements that need to be inserted and their indices.
+ SmallVector<int, 8> NV1Elt;
+ SmallVector<int, 8> N1Index;
+ SmallVector<int, 8> NV2Elt;
+ SmallVector<int, 8> N2Index;
+ for (int I = 0; I != Length; ++I) {
+ if (ShuffleMask[I] != I) {
+ NV1Elt.push_back(ShuffleMask[I]);
+ N1Index.push_back(I);
+ }
+ }
+ for (int I = 0; I != Length; ++I) {
+ if (ShuffleMask[I] != (I + V1EltNum)) {
+ NV2Elt.push_back(ShuffleMask[I]);
+ N2Index.push_back(I);
+ }
+ }
+
+ // Decide which vector to insert into. If all lanes mismatch for both, the
+ // insertion base is UNDEF rather than V1 or V2.
+ SDValue InsV = V1;
+ SmallVector<int, 8> InsMasks = NV1Elt;
+ SmallVector<int, 8> InsIndex = N1Index;
+ if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
+ if (NV1Elt.size() > NV2Elt.size()) {
+ InsV = V2;
+ InsMasks = NV2Elt;
+ InsIndex = N2Index;
+ }
+ } else {
+ InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+ }
+
+ for (int I = 0, E = InsMasks.size(); I != E; ++I) {
+ SDValue ExtV = V1;
+ int Mask = InsMasks[I];
+ if (Mask >= V1EltNum) {
+ ExtV = V2;
+ Mask -= V1EltNum;
+ }
+ // Any value type smaller than i32 is illegal in AArch64, and this
+ // lowering function runs after the legalize pass, so we need to
+ // legalize the result here.
+ EVT EltVT;
+ if (VT.getVectorElementType().isFloatingPoint())
+ EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
+ else
+ EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
+
+ if (Mask >= 0) {
+ ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+ DAG.getConstant(Mask, MVT::i64));
+ InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
+ DAG.getConstant(InsIndex[I], MVT::i64));
+ }
+ }
+ return InsV;
+}
+
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
if (Constraint.size() == 1) {
@@ -3484,14 +4519,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
return std::make_pair(0U, &AArch64::FPR16RegClass);
else if (VT == MVT::f32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
- else if (VT == MVT::f64)
- return std::make_pair(0U, &AArch64::FPR64RegClass);
else if (VT.getSizeInBits() == 64)
- return std::make_pair(0U, &AArch64::VPR64RegClass);
- else if (VT == MVT::f128)
- return std::make_pair(0U, &AArch64::FPR128RegClass);
+ return std::make_pair(0U, &AArch64::FPR64RegClass);
else if (VT.getSizeInBits() == 128)
- return std::make_pair(0U, &AArch64::VPR128RegClass);
+ return std::make_pair(0U, &AArch64::FPR128RegClass);
break;
}
}
@@ -3500,3 +4531,69 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
// constraint into a member of a register class.
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}
+
+/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
+/// The associated MachineMemOperands record the alignment specified
+/// in the intrinsic calls.
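+/// As an illustrative example, a vld2 returning two v4i32 vectors transfers
+/// 32 bytes, so memVT below is conservatively recorded as a vector of
+/// 4 x i64 (NumElts = 32 / 8 = 4).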
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &I,
+ unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::aarch64_neon_vld1x2:
+ case Intrinsic::aarch64_neon_vld1x3:
+ case Intrinsic::aarch64_neon_vld1x4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.vol = false; // volatile loads with NEON intrinsics not supported
+ Info.readMem = true;
+ Info.writeMem = false;
+ return true;
+ }
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::aarch64_neon_vst1x2:
+ case Intrinsic::aarch64_neon_vst1x3:
+ case Intrinsic::aarch64_neon_vst1x4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+ Info.vol = false; // volatile stores with NEON intrinsics not supported
+ Info.readMem = false;
+ Info.writeMem = true;
+ return true;
+ }
+ default:
+ break;
+ }
+
+ return false;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 67a908e..8ad5a79 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -19,7 +19,7 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
-
+#include "llvm/IR/Intrinsics.h"
namespace llvm {
namespace AArch64ISD {
@@ -125,6 +125,19 @@ namespace AArch64ISD {
// Vector FP move immediate
NEON_FMOVIMM,
+ // Vector permute
+ NEON_UZP1,
+ NEON_UZP2,
+ NEON_ZIP1,
+ NEON_ZIP2,
+ NEON_TRN1,
+ NEON_TRN2,
+
+ // Vector element reverse
+ NEON_REV64,
+ NEON_REV32,
+ NEON_REV16,
+
// Vector compare
NEON_CMP,
@@ -132,7 +145,58 @@ namespace AArch64ISD {
NEON_CMPZ,
// Vector compare bitwise test
- NEON_TST
+ NEON_TST,
+
+ // Vector saturating shift
+ NEON_QSHLs,
+ NEON_QSHLu,
+
+ // Vector dup
+ NEON_VDUP,
+
+ // Vector dup by lane
+ NEON_VDUPLANE,
+
+ // Vector extract
+ NEON_VEXTRACT,
+
+ // NEON duplicate lane loads
+ NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ NEON_LD3DUP,
+ NEON_LD4DUP,
+
+ // NEON loads with post-increment base updates:
+ NEON_LD1_UPD,
+ NEON_LD2_UPD,
+ NEON_LD3_UPD,
+ NEON_LD4_UPD,
+ NEON_LD1x2_UPD,
+ NEON_LD1x3_UPD,
+ NEON_LD1x4_UPD,
+
+ // NEON stores with post-increment base updates:
+ NEON_ST1_UPD,
+ NEON_ST2_UPD,
+ NEON_ST3_UPD,
+ NEON_ST4_UPD,
+ NEON_ST1x2_UPD,
+ NEON_ST1x3_UPD,
+ NEON_ST1x4_UPD,
+
+ // NEON duplicate lane loads with post-increment base updates:
+ NEON_LD2DUP_UPD,
+ NEON_LD3DUP_UPD,
+ NEON_LD4DUP_UPD,
+
+ // NEON lane loads with post-increment base updates:
+ NEON_LD2LN_UPD,
+ NEON_LD3LN_UPD,
+ NEON_LD4LN_UPD,
+
+ // NEON lane stores with post-increment base updates:
+ NEON_ST2LN_UPD,
+ NEON_ST3LN_UPD,
+ NEON_ST4LN_UPD
};
}
@@ -169,9 +233,13 @@ public:
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
+ bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const;
+
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const AArch64Subtarget *ST) const;
+ SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+
void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
SDValue &Chain) const;
@@ -234,6 +302,8 @@ public:
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
@@ -269,6 +339,14 @@ public:
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+
+ virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ unsigned Intrinsic) const LLVM_OVERRIDE;
+
+protected:
+ std::pair<const TargetRegisterClass*, uint8_t>
+ findRepresentativeClass(MVT VT) const;
+
private:
const InstrItineraryData *Itins;
@@ -280,6 +358,10 @@ enum NeonModImmType {
Neon_Mov_Imm,
Neon_Mvn_Imm
};
+
+extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
+ bool &usesOnlyOneValue, bool &hasDominantValue,
+ bool &isConstant, bool &isUNDEF);
} // namespace llvm
#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 09451fd..34f917c 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -120,6 +120,14 @@ class A64InstRdnm<dag outs, dag ins, string asmstr,
let Inst{20-16} = Rm;
}
+class A64InstRtnm<dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+ bits<5> Rm;
+
+ let Inst{20-16} = Rm;
+}
+
//===----------------------------------------------------------------------===//
//
// Actual A64 Instruction Formats
@@ -383,6 +391,8 @@ class A64I_extract<bit sf, bits<3> op, bit n,
// Inherits Rd in 4-0
}
+let Predicates = [HasFPARMv8] in {
+
// Format for floating-point compare instructions.
class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2,
dag outs, dag ins, string asmstr,
@@ -562,6 +572,8 @@ class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5,
// Inherit Rd in 4-0
}
+}
+
// Format for load-register (literal) instructions.
class A64I_LDRlit<bits<2> opc, bit v,
dag outs, dag ins, string asmstr,
@@ -971,31 +983,123 @@ class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit> {
}
+// Format AdvSIMD bitwise extract
+class NeonI_BitExtract<bit q, bits<2> op2,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-24} = 0b101110;
+ let Inst{23-22} = op2;
+ let Inst{21} = 0b0;
+ // Inherit Rm in 20-16
+ let Inst{15} = 0b0;
+ // imm4 in 14-11
+ let Inst{10} = 0b0;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD perm
+class NeonI_Perm<bit q, bits<2> size, bits<3> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-24} = 0b001110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b0;
+ // Inherit Rm in 20-16
+ let Inst{15} = 0b0;
+ let Inst{14-12} = opcode;
+ let Inst{11-10} = 0b10;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD table lookup
+class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-24} = 0b001110;
+ let Inst{23-22} = op2;
+ let Inst{21} = 0b0;
+ // Inherit Rm in 20-16
+ let Inst{15} = 0b0;
+ let Inst{14-13} = len;
+ let Inst{12} = op;
+ let Inst{11-10} = 0b00;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
// Format AdvSIMD 3 vector registers with same vector type
class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
dag outs, dag ins, string asmstr,
list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin>
-{
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
let Inst{31} = 0b0;
let Inst{30} = q;
let Inst{29} = u;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
- // Inherit Rm in 20-16
+ // Inherit Rm in 20-16
let Inst{15-11} = opcode;
let Inst{10} = 0b1;
// Inherit Rn in 9-5
// Inherit Rd in 4-0
}
+// Format AdvSIMD 3 vector registers with different vector type
+class NeonI_3VDiff<bit q, bit u, bits<2> size, bits<4> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ // Inherit Rm in 20-16
+ let Inst{15-12} = opcode;
+ let Inst{11} = 0b0;
+ let Inst{10} = 0b0;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD two registers and an element
+class NeonI_2VElem<bit q, bit u, bits<2> size, bits<4> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b01111;
+ let Inst{23-22} = size;
+ // l in Inst{21}
+ // m in Inst{20}
+ // Inherit Rm in 19-16
+ let Inst{15-12} = opcode;
+ // h in Inst{11}
+ let Inst{10} = 0b0;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
// Format AdvSIMD 1 vector register with modified immediate
class NeonI_1VModImm<bit q, bit op,
dag outs, dag ins, string asmstr,
list<dag> patterns, InstrItinClass itin>
- : A64InstRd<outs,ins, asmstr, patterns, itin>
-{
+ : A64InstRd<outs,ins, asmstr, patterns, itin> {
bits<8> Imm;
bits<4> cmode;
let Inst{31} = 0b0;
@@ -1015,15 +1119,14 @@ class NeonI_1VModImm<bit q, bit op,
class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
dag outs, dag ins, string asmstr,
list<dag> patterns, InstrItinClass itin>
- : A64InstRdnm<outs, ins, asmstr, patterns, itin>
-{
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
let Inst{31} = 0b0;
let Inst{30} = 0b1;
let Inst{29} = u;
let Inst{28-24} = 0b11110;
let Inst{23-22} = size;
let Inst{21} = 0b1;
- // Inherit Rm in 20-16
+ // Inherit Rm in 20-16
let Inst{15-11} = opcode;
let Inst{10} = 0b1;
// Inherit Rn in 9-5
@@ -1035,6 +1138,98 @@ class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
dag outs, dag ins, string asmstr,
list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 2 vector 1 immediate shift
+class NeonI_2VShiftImm<bit q, bit u, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ bits<7> Imm;
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = u;
+ let Inst{28-23} = 0b011110;
+ let Inst{22-16} = Imm;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0b1;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD duplicate and insert
+class NeonI_copy<bit q, bit op, bits<4> imm4,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ bits<5> Imm5;
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = op;
+ let Inst{28-21} = 0b01110000;
+ let Inst{20-16} = Imm5;
+ let Inst{15} = 0b0;
+ let Inst{14-11} = imm4;
+ let Inst{10} = 0b1;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+// Format AdvSIMD insert from element to vector
+class NeonI_insert<bit q, bit op,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ bits<5> Imm5;
+ bits<4> Imm4;
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = op;
+ let Inst{28-21} = 0b01110000;
+ let Inst{20-16} = Imm5;
+ let Inst{15} = 0b0;
+ let Inst{14-11} = Imm4;
+ let Inst{10} = 0b1;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar pairwise
+class NeonI_ScalarPair<bit u, bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = 0b1;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 2 vector across lanes
+class NeonI_2VAcross<bit q, bit u, bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
: A64InstRdn<outs, ins, asmstr, patterns, itin>
{
let Inst{31} = 0b0;
@@ -1042,13 +1237,253 @@ class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
let Inst{29} = u;
let Inst{28-24} = 0b01110;
let Inst{23-22} = size;
+ let Inst{21-17} = 0b11000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar two registers miscellaneous
+class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag ins,
+ string asmstr, list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ let Inst{31} = 0b0;
+ let Inst{30} = 0b1;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
let Inst{21-17} = 0b10000;
let Inst{16-12} = opcode;
let Inst{11-10} = 0b10;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD vector load/store multiple N-element structure
+class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-23} = 0b0011000;
+ let Inst{22} = l;
+ let Inst{21-16} = 0b000000;
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = size;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store multiple N-element structure (post-index)
+class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-23} = 0b0011001;
+ let Inst{22} = l;
+ let Inst{21} = 0b0;
+ // Inherit Rm in 20-16
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = size;
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+ dag ins, string asmstr, list<dag> patterns,
+ InstrItinClass itin>
+ : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-23} = 0b0011010;
+ let Inst{22} = 0b1;
+ let Inst{21} = r;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-13} = opcode;
+ let Inst{12} = 0b0;
+ let Inst{11-10} = size;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store Single N-element structure to/from one lane
+class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+ dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+ bits<4> lane;
+ let Inst{31} = 0b0;
+ let Inst{29-23} = 0b0011010;
+ let Inst{22} = l;
+ let Inst{21} = r;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-14} = op2_1;
+ let Inst{13} = op0;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+ dag ins, string asmstr, list<dag> patterns,
+ InstrItinClass itin>
+ : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-23} = 0b0011011;
+ let Inst{22} = 0b1;
+ let Inst{21} = r;
+ // Inherit Rm in 20-16
+ let Inst{15-13} = opcode;
+ let Inst{12} = 0b0;
+ let Inst{11-10} = size;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load/store Single N-element structure
+// to/from one lane
+class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+ dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+ bits<4> lane;
+ let Inst{31} = 0b0;
+ let Inst{29-23} = 0b0011011;
+ let Inst{22} = l;
+ let Inst{21} = r;
+ // Inherit Rm in 20-16
+ let Inst{15-14} = op2_1;
+ let Inst{13} = op0;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD 3 scalar registers with different type
+
+class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31-30} = 0b01;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ // Inherit Rm in 20-16
+ let Inst{15-12} = opcode;
+ let Inst{11-10} = 0b00;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar shift by immediate
+
+class NeonI_ScalarShiftImm<bit u, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ bits<4> Imm4;
+ bits<3> Imm3;
+ let Inst{31-30} = 0b01;
+ let Inst{29} = u;
+ let Inst{28-23} = 0b111110;
+ let Inst{22-19} = Imm4;
+ let Inst{18-16} = Imm3;
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0b1;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD crypto AES
+class NeonI_Crypto_AES<bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ let Inst{31-24} = 0b01001110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10100;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+// Format AdvSIMD crypto SHA
+class NeonI_Crypto_SHA<bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+ let Inst{31-24} = 0b01011110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10100;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
// Inherit Rn in 9-5
// Inherit Rd in 4-0
}
+// Format AdvSIMD crypto 3V SHA
+class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+ let Inst{31-24} = 0b01011110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b0;
+ // Inherit Rm in 20-16
+ let Inst{15} = 0b0;
+ let Inst{14-12} = opcode;
+ let Inst{11-10} = 0b00;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar x indexed element
+class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo,
+ bits<4> opcode, dag outs, dag ins,
+ string asmstr, list<dag> patterns,
+ InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = 0b1;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b11111;
+ let Inst{23} = szhi;
+ let Inst{22} = szlo;
+ // l in Inst{21}
+ // m in Inst{20}
+ // Inherit Rm in 19-16
+ let Inst{15-12} = opcode;
+ // h in Inst{11}
+ let Inst{10} = 0b0;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+// Format AdvSIMD scalar copy - insert from element to scalar
+class NeonI_ScalarCopy<dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> {
+ let Inst{28} = 0b1;
+}
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index d8f45eb..180110a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -29,7 +29,7 @@
#include <algorithm>
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
using namespace llvm;
@@ -68,43 +68,71 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg)
.addImm(A64SysReg::NZCV);
} else if (AArch64::GPR64RegClass.contains(DestReg)) {
- assert(AArch64::GPR64RegClass.contains(SrcReg));
- Opc = AArch64::ORRxxx_lsl;
- ZeroReg = AArch64::XZR;
+ if (AArch64::GPR64RegClass.contains(SrcReg)) {
+ Opc = AArch64::ORRxxx_lsl;
+ ZeroReg = AArch64::XZR;
+ } else {
+ assert(AArch64::FPR64RegClass.contains(SrcReg));
+ BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg)
+ .addReg(SrcReg);
+ return;
+ }
} else if (AArch64::GPR32RegClass.contains(DestReg)) {
- assert(AArch64::GPR32RegClass.contains(SrcReg));
- Opc = AArch64::ORRwww_lsl;
- ZeroReg = AArch64::WZR;
+ if (AArch64::GPR32RegClass.contains(SrcReg)) {
+ Opc = AArch64::ORRwww_lsl;
+ ZeroReg = AArch64::WZR;
+ } else {
+ assert(AArch64::FPR32RegClass.contains(SrcReg));
+ BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg)
+ .addReg(SrcReg);
+ return;
+ }
} else if (AArch64::FPR32RegClass.contains(DestReg)) {
- assert(AArch64::FPR32RegClass.contains(SrcReg));
- BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
- .addReg(SrcReg);
- return;
+ if (AArch64::FPR32RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
+ .addReg(SrcReg);
+ return;
+ }
+ else {
+ assert(AArch64::GPR32RegClass.contains(SrcReg));
+ BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg)
+ .addReg(SrcReg);
+ return;
+ }
} else if (AArch64::FPR64RegClass.contains(DestReg)) {
- assert(AArch64::FPR64RegClass.contains(SrcReg));
- BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
- .addReg(SrcReg);
- return;
+ if (AArch64::FPR64RegClass.contains(SrcReg)) {
+ BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
+ .addReg(SrcReg);
+ return;
+ }
+ else {
+ assert(AArch64::GPR64RegClass.contains(SrcReg));
+ BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg)
+ .addReg(SrcReg);
+ return;
+ }
} else if (AArch64::FPR128RegClass.contains(DestReg)) {
assert(AArch64::FPR128RegClass.contains(SrcReg));
- // FIXME: there's no good way to do this, at least without NEON:
- // + There's no single move instruction for q-registers
- // + We can't create a spill slot and use normal STR/LDR because stack
- // allocation has already happened
- // + We can't go via X-registers with FMOV because register allocation has
- // already happened.
- // This may not be efficient, but at least it works.
- BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
- .addReg(SrcReg)
- .addReg(AArch64::XSP)
- .addImm(0x1ff & -16);
-
- BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
- .addReg(AArch64::XSP, RegState::Define)
- .addReg(AArch64::XSP)
- .addImm(16);
- return;
+ // If NEON is enabled, use ORR to implement this copy.
+ // If NEON isn't available, emit an STR/LDR pair through the stack instead.
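+ // (ORR Vd.16B, Vn.16B, Vn.16B with both sources equal is the canonical
+ // full-register vector move, also written as "mov Vd.16B, Vn.16B".)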
+ if (getSubTarget().hasNEON()) {
+ BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg)
+ .addReg(SrcReg)
+ .addReg(SrcReg);
+ return;
+ } else {
+ BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
+ .addReg(SrcReg)
+ .addReg(AArch64::XSP)
+ .addImm(0x1ff & -16);
+
+ BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
+ .addReg(AArch64::XSP, RegState::Define)
+ .addReg(AArch64::XSP)
+ .addImm(16);
+ return;
+ }
} else {
llvm_unreachable("Unknown register class in copyPhysReg");
}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 07289b0..23d81fc 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -14,6 +14,8 @@
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
//
+def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
+ AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
@@ -125,6 +127,8 @@ def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+
//===----------------------------------------------------------------------===//
// Call sequence pseudo-instructions
//===----------------------------------------------------------------------===//
@@ -1274,7 +1278,7 @@ def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
// use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
+def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
sub_32)>;
//===-------------------------------
@@ -1978,6 +1982,13 @@ def fpz64 : Operand<f64>,
let DecoderMethod = "DecodeFPZeroOperand";
}
+def fpz64movi : Operand<i64>,
+ ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
+ let ParserMatchClass = fpzero_asmoperand;
+ let PrintMethod = "printFPZeroOperand";
+ let DecoderMethod = "DecodeFPZeroOperand";
+}
+
multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
(outs), ins, "fcmp\t$Rn, $Rm", [pattern],
@@ -2186,23 +2197,23 @@ def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
// Extra patterns for when we're allowed to optimise separate multiplication and
// addition.
-let Predicates = [UseFusedMAC] in {
-def : Pat<(fadd FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)),
+let Predicates = [HasFPARMv8, UseFusedMAC] in {
+def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
(FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fsub FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)),
+def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
(FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fsub (fmul FPR32:$Rn, FPR32:$Rm), FPR32:$Ra),
+def : Pat<(f32 (fsub (f32 (fmul FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
(FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fsub (fneg FPR32:$Ra), (fmul FPR32:$Rn, FPR32:$Rm)),
+def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
(FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(fadd FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)),
+def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
(FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(fsub FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)),
+def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
(FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(fsub (fmul FPR64:$Rn, FPR64:$Rm), FPR64:$Ra),
+def : Pat<(f64 (fsub (f64 (fmul FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
(FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(fsub (fneg FPR64:$Ra), (fmul FPR64:$Rn, FPR64:$Rm)),
+def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
(FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
}
@@ -2342,6 +2353,7 @@ defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
+let Predicates = [HasFPARMv8] in {
def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
@@ -2350,6 +2362,7 @@ def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
+}
multiclass A64I_inttofp<bit o0, string asmop> {
def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
@@ -2361,6 +2374,7 @@ multiclass A64I_inttofp<bit o0, string asmop> {
defm S : A64I_inttofp<0b0, "scvtf">;
defm U : A64I_inttofp<0b1, "ucvtf">;
+let Predicates = [HasFPARMv8] in {
def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
@@ -2369,16 +2383,19 @@ def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
+}
def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
+let Predicates = [HasFPARMv8] in {
def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
+}
def lane1_asmoperand : AsmOperandClass {
let Name = "Lane1";
@@ -2401,11 +2418,13 @@ let DecoderMethod = "DecodeFMOVLaneInstruction" in {
"fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>;
}
+let Predicates = [HasFPARMv8] in {
def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
(FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
(FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
+}
//===----------------------------------------------------------------------===//
// Floating-point immediate instructions
@@ -2499,11 +2518,15 @@ let mayLoad = 1 in {
def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
}
+let Predicates = [HasFPARMv8] in {
def LDRs_lit : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
def LDRd_lit : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+}
let mayLoad = 1 in {
+ let Predicates = [HasFPARMv8] in {
def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+ }
def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
@@ -3097,6 +3120,7 @@ defm LS32
defm LS64
: A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>;
+let Predicates = [HasFPARMv8] in {
// STR/LDR to/from a B register
defm LSFP8
: A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>;
@@ -3115,6 +3139,7 @@ defm LSFP64
defm LSFP128
: A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128,
qword_addrparams>;
+}
//===------------------------------
// 2.3 Signed loads
@@ -3570,10 +3595,13 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">;
defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">;
+
+let Predicates = [HasFPARMv8] in {
defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">;
defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64, dword_simm7, "LSFPPair64">;
defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
"LSFPPair128">;
+}
def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
@@ -5162,4 +5190,4 @@ defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
// Advanced SIMD (NEON) Support
//
-include "AArch64InstrNEON.td" \ No newline at end of file
+include "AArch64InstrNEON.td"
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 98b9e3e..d71749d 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -41,6 +41,37 @@ def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
[SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
+def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
+
+def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>;
+def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>;
+def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>;
+def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>;
+def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>;
+def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>;
+def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>;
+
+def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>;
+def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>;
+def Neon_rev16 : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>;
+def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
+ [SDTCisVec<0>]>>;
+def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
+def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3,
+ [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>;
+
+def SDT_assertext : SDTypeProfile<1, 1,
+ [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
+def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
+def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
+
//===----------------------------------------------------------------------===//
// Multiclasses
//===----------------------------------------------------------------------===//
@@ -48,8 +79,7 @@ def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size, bits<5> opcode,
string asmop, SDPatternOperator opnode8B,
SDPatternOperator opnode16B,
- bit Commutable = 0>
-{
+ bit Commutable = 0> {
let isCommutable = Commutable in {
def _8B : NeonI_3VSame<0b0, u, size, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -70,8 +100,7 @@ multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size, bits<5> opcode,
multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
string asmop, SDPatternOperator opnode,
- bit Commutable = 0>
-{
+ bit Commutable = 0> {
let isCommutable = Commutable in {
def _4H : NeonI_3VSame<0b0, u, 0b01, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -105,8 +134,7 @@ multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
string asmop, SDPatternOperator opnode,
bit Commutable = 0>
- : NeonI_3VSame_HS_sizes<u, opcode, asmop, opnode, Commutable>
-{
+ : NeonI_3VSame_HS_sizes<u, opcode, asmop, opnode, Commutable> {
let isCommutable = Commutable in {
def _8B : NeonI_3VSame<0b0, u, 0b00, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -127,8 +155,7 @@ multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
string asmop, SDPatternOperator opnode,
bit Commutable = 0>
- : NeonI_3VSame_BHS_sizes<u, opcode, asmop, opnode, Commutable>
-{
+ : NeonI_3VSame_BHS_sizes<u, opcode, asmop, opnode, Commutable> {
let isCommutable = Commutable in {
def _2D : NeonI_3VSame<0b1, u, 0b11, opcode,
(outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
@@ -146,8 +173,7 @@ multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
SDPatternOperator opnode4S,
SDPatternOperator opnode2D,
ValueType ResTy2S, ValueType ResTy4S,
- ValueType ResTy2D, bit Commutable = 0>
-{
+ ValueType ResTy2D, bit Commutable = 0> {
let isCommutable = Commutable in {
def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode,
(outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
@@ -206,8 +232,8 @@ defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and
// two operands constraints.
class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
- RegisterClass VPRC, ValueType OpTy, bit q, bit u, bits<2> size, bits<5> opcode,
- SDPatternOperator opnode>
+ RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size,
+ bits<5> opcode, SDPatternOperator opnode>
: NeonI_3VSame<q, u, size, opcode,
(outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm),
asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
@@ -312,26 +338,24 @@ defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>;
// ORR disassembled as MOV if Vn==Vm
// Vector Move - register
-// Alias for ORR if Vn=Vm and it is the preferred syntax
+// Alias for ORR if Vn=Vm.
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
def : NeonInstAlias<"mov $Rd.8b, $Rn.8b",
- (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn)>;
+ (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>;
def : NeonInstAlias<"mov $Rd.16b, $Rn.16b",
- (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn)>;
-
-def Neon_immAllOnes: PatLeaf<(Neon_movi (i32 timm), (i32 imm)), [{
- ConstantSDNode *ImmConstVal = cast<ConstantSDNode>(N->getOperand(0));
- ConstantSDNode *OpCmodeConstVal = cast<ConstantSDNode>(N->getOperand(1));
- unsigned EltBits;
- uint64_t EltVal = A64Imms::decodeNeonModImm(ImmConstVal->getZExtValue(),
- OpCmodeConstVal->getZExtValue(), EltBits);
- return (EltBits == 8 && EltVal == 0xff);
-}]>;
+ (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>;
+// The MOVI instruction takes two immediate operands. The first is the
+// immediate encoding, while the second is the cmode. A cmode of 14, or
+// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC.
+def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>;
+def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>;
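+// For example, with cmode = 14 the 8-bit immediate 255 is replicated into
+// every byte lane, so Neon_AllOne matches an all-ones vector and Neon_AllZero
+// (immediate 0) an all-zero vector.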
def Neon_not8B : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v8i8 Neon_immAllOnes)))>;
+ (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>;
def Neon_not16B : PatFrag<(ops node:$in),
- (xor node:$in, (bitconvert (v16i8 Neon_immAllOnes)))>;
+ (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>;
def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm),
(or node:$Rn, (Neon_not8B node:$Rm))>;
@@ -447,6 +471,9 @@ multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src),
(v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src),
+ (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src),
(v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
(INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
@@ -562,7 +589,7 @@ def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs),
// NeonI_compare_aliases class: swaps register operands to implement
// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed.
class NeonI_compare_aliases<string asmop, string asmlane,
- Instruction inst, RegisterClass VPRC>
+ Instruction inst, RegisterOperand VPRC>
: NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane #
", $Rm" # asmlane,
(inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>;
@@ -1023,6 +1050,20 @@ defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{
return (HasShift && !ShiftOnesIn);
}]>;
+def neon_uimm1_asmoperand : AsmOperandClass
+{
+ let Name = "UImm1";
+ let PredicateMethod = "isUImm<1>";
+ let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm2_asmoperand : AsmOperandClass
+{
+ let Name = "UImm2";
+ let PredicateMethod = "isUImm<2>";
+ let RenderMethod = "addImmOperands";
+}
+
def neon_uimm8_asmoperand : AsmOperandClass
{
let Name = "UImm8";
@@ -1032,7 +1073,7 @@ def neon_uimm8_asmoperand : AsmOperandClass
def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
let ParserMatchClass = neon_uimm8_asmoperand;
- let PrintMethod = "printNeonUImm8Operand";
+ let PrintMethod = "printUImmHexOperand";
}
def neon_uimm64_mask_asmoperand : AsmOperandClass
@@ -1057,7 +1098,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
(outs VPR64:$Rd),
(ins neon_uimm8:$Imm,
neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (timm:$Imm),
(neon_mov_imm_LSL_operand:$Simm))))],
@@ -1070,7 +1111,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
(outs VPR128:$Rd),
(ins neon_uimm8:$Imm,
neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (timm:$Imm),
(neon_mov_imm_LSL_operand:$Simm))))],
@@ -1084,7 +1125,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
(outs VPR64:$Rd),
(ins neon_uimm8:$Imm,
neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, " $Rd.4h, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
[(set (v4i16 VPR64:$Rd),
(v4i16 (opnode (timm:$Imm),
(neon_mov_imm_LSLH_operand:$Simm))))],
@@ -1097,7 +1138,7 @@ multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
(outs VPR128:$Rd),
(ins neon_uimm8:$Imm,
neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, " $Rd.8h, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
[(set (v8i16 VPR128:$Rd),
(v8i16 (opnode (timm:$Imm),
(neon_mov_imm_LSLH_operand:$Simm))))],
@@ -1117,7 +1158,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
(outs VPR64:$Rd),
(ins VPR64:$src, neon_uimm8:$Imm,
neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (v2i32 VPR64:$src),
(v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm,
@@ -1131,7 +1172,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
(outs VPR128:$Rd),
(ins VPR128:$src, neon_uimm8:$Imm,
neon_mov_imm_LSL_operand:$Simm),
- !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (v4i32 VPR128:$src),
(v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm,
@@ -1146,7 +1187,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
(outs VPR64:$Rd),
(ins VPR64:$src, neon_uimm8:$Imm,
neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, " $Rd.4h, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
[(set (v4i16 VPR64:$Rd),
(v4i16 (opnode (v4i16 VPR64:$src),
(v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm,
@@ -1160,7 +1201,7 @@ multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
(outs VPR128:$Rd),
(ins VPR128:$src, neon_uimm8:$Imm,
neon_mov_imm_LSLH_operand:$Simm),
- !strconcat(asmop, " $Rd.8h, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
[(set (v8i16 VPR128:$Rd),
(v8i16 (opnode (v8i16 VPR128:$src),
(v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm,
@@ -1180,7 +1221,7 @@ multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
(outs VPR64:$Rd),
(ins neon_uimm8:$Imm,
neon_mov_imm_MSL_operand:$Simm),
- !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
[(set (v2i32 VPR64:$Rd),
(v2i32 (opnode (timm:$Imm),
(neon_mov_imm_MSL_operand:$Simm))))],
@@ -1193,7 +1234,7 @@ multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
(outs VPR128:$Rd),
(ins neon_uimm8:$Imm,
neon_mov_imm_MSL_operand:$Simm),
- !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+ !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
[(set (v4i32 VPR128:$Rd),
(v4i32 (opnode (timm:$Imm),
(neon_mov_imm_MSL_operand:$Simm))))],
@@ -1315,8 +1356,8 @@ defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>;
}
class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane,
- Instruction inst, RegisterClass VPRC>
- : NeonInstAlias<!strconcat(asmop, " $Rd," # asmlane # ", $Imm"),
+ Instruction inst, RegisterOperand VPRC>
+ : NeonInstAlias<!strconcat(asmop, "\t$Rd," # asmlane # ", $Imm"),
(inst VPRC:$Rd, neon_uimm8:$Imm, 0), 0b0>;
// Aliases for Vector Move Immediate Shifted
@@ -1382,9 +1423,8 @@ let isReMaterializable = 1 in {
def MOVIdi : NeonI_1VModImm<0b0, 0b1,
(outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm),
"movi\t $Rd, $Imm",
- [(set (f64 FPR64:$Rd),
- (f64 (bitconvert
- (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))))],
+ [(set (v1i64 FPR64:$Rd),
+ (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))],
NoItinerary> {
let cmode = 0b1110;
}
@@ -1392,7 +1432,7 @@ def MOVIdi : NeonI_1VModImm<0b0, 0b1,
// Vector Floating Point Move Immediate
-class NeonI_FMOV_impl<string asmlane, RegisterClass VPRC, ValueType OpTy,
+class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy,
Operand immOpType, bit q, bit op>
: NeonI_1VModImm<q, op,
(outs VPRC:$Rd), (ins immOpType:$Imm),
@@ -1409,49 +1449,3339 @@ def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>;
def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
}
-// Scalar Arithmetic
+// Vector Shift (Immediate)
+// Immediate in [0, 63]
+def imm0_63 : Operand<i32> {
+ let ParserMatchClass = uimm6_asmoperand;
+}
-class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
- : NeonI_Scalar3Same<u, 0b11, opcode,
- (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm),
- !strconcat(asmop, " $Rd, $Rn, $Rm"),
- [],
+// Shift Right/Left Immediate - The immh:immb field of these shifts is encoded
+// as follows:
+//
+// Offset Encoding
+// 8 immh:immb<6:3> = '0001xxx', <imm> is encoded in immh:immb<2:0>
+// 16 immh:immb<6:4> = '001xxxx', <imm> is encoded in immh:immb<3:0>
+// 32 immh:immb<6:5> = '01xxxxx', <imm> is encoded in immh:immb<4:0>
+// 64 immh:immb<6> = '1xxxxxx', <imm> is encoded in immh:immb<5:0>
+//
+// The shift right immediate amount, in the range 1 to element bits, is computed
+// as 2 * Offset - UInt(immh:immb). The shift left immediate amount, in the
+// range 0 to element bits - 1, is computed as UInt(immh:immb) - Offset.
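+//
+// A worked example (illustrative only, consistent with the formulas above):
+// for 16-bit elements Offset is 16, so a right shift by 3 is encoded as
+// immh:immb = 2 * 16 - 3 = 0b0011101 and decoded back as 32 - 29 = 3, while a
+// left shift by 3 is encoded as immh:immb = 16 + 3 = 0b0010011 and decoded as
+// 19 - 16 = 3.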
+
+class shr_imm_asmoperands<string OFFSET> : AsmOperandClass {
+ let Name = "ShrImm" # OFFSET;
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "ShrImm" # OFFSET;
+}
+
+class shr_imm<string OFFSET> : Operand<i32> {
+ let EncoderMethod = "getShiftRightImm" # OFFSET;
+ let DecoderMethod = "DecodeShiftRightImm" # OFFSET;
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("shr_imm" # OFFSET # "_asmoperand");
+}
+
+def shr_imm8_asmoperand : shr_imm_asmoperands<"8">;
+def shr_imm16_asmoperand : shr_imm_asmoperands<"16">;
+def shr_imm32_asmoperand : shr_imm_asmoperands<"32">;
+def shr_imm64_asmoperand : shr_imm_asmoperands<"64">;
+
+def shr_imm8 : shr_imm<"8">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 8;}]>;
+def shr_imm16 : shr_imm<"16">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 16;}]>;
+def shr_imm32 : shr_imm<"32">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 32;}]>;
+def shr_imm64 : shr_imm<"64">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 64;}]>;
+
+class shl_imm_asmoperands<string OFFSET> : AsmOperandClass {
+ let Name = "ShlImm" # OFFSET;
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "ShlImm" # OFFSET;
+}
+
+class shl_imm<string OFFSET> : Operand<i32> {
+ let EncoderMethod = "getShiftLeftImm" # OFFSET;
+ let DecoderMethod = "DecodeShiftLeftImm" # OFFSET;
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("shl_imm" # OFFSET # "_asmoperand");
+}
+
+def shl_imm8_asmoperand : shl_imm_asmoperands<"8">;
+def shl_imm16_asmoperand : shl_imm_asmoperands<"16">;
+def shl_imm32_asmoperand : shl_imm_asmoperands<"32">;
+def shl_imm64_asmoperand : shl_imm_asmoperands<"64">;
+
+def shl_imm8 : shl_imm<"8">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 8;}]>;
+def shl_imm16 : shl_imm<"16">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 16;}]>;
+def shl_imm32 : shl_imm<"32">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 32;}]>;
+def shl_imm64 : shl_imm<"64">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 64;}]>;
+
+class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
+ RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+ [(set (Ty VPRC:$Rd),
+ (Ty (OpNode (Ty VPRC:$Rn),
+ (Ty (Neon_vdup (i32 ImmTy:$Imm))))))],
+ NoItinerary>;
+
+multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
+ // 64-bit vector types.
+ def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> {
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ }
+
+ def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> {
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ }
+
+ def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> {
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ }
+
+ // 128-bit vector types.
+ def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> {
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ }
+
+ def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> {
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ }
+
+ def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> {
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ }
+
+ def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> {
+ let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+ }
+}
+
+multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
+ def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+ OpNode> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// Shift left
+defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">;
+
+// Shift right
+defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>;
+defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>;
+
+def Neon_High16B : PatFrag<(ops node:$in),
+ (extract_subvector (v16i8 node:$in), (iPTR 8))>;
+def Neon_High8H : PatFrag<(ops node:$in),
+ (extract_subvector (v8i16 node:$in), (iPTR 4))>;
+def Neon_High4S : PatFrag<(ops node:$in),
+ (extract_subvector (v4i32 node:$in), (iPTR 2))>;
+def Neon_High2D : PatFrag<(ops node:$in),
+ (extract_subvector (v2i64 node:$in), (iPTR 1))>;
+def Neon_High4float : PatFrag<(ops node:$in),
+ (extract_subvector (v4f32 node:$in), (iPTR 2))>;
+def Neon_High2double : PatFrag<(ops node:$in),
+ (extract_subvector (v2f64 node:$in), (iPTR 1))>;
+
+def Neon_Low16B : PatFrag<(ops node:$in),
+ (v8i8 (extract_subvector (v16i8 node:$in),
+ (iPTR 0)))>;
+def Neon_Low8H : PatFrag<(ops node:$in),
+ (v4i16 (extract_subvector (v8i16 node:$in),
+ (iPTR 0)))>;
+def Neon_Low4S : PatFrag<(ops node:$in),
+ (v2i32 (extract_subvector (v4i32 node:$in),
+ (iPTR 0)))>;
+def Neon_Low2D : PatFrag<(ops node:$in),
+ (v1i64 (extract_subvector (v2i64 node:$in),
+ (iPTR 0)))>;
+def Neon_Low4float : PatFrag<(ops node:$in),
+ (v2f32 (extract_subvector (v4f32 node:$in),
+ (iPTR 0)))>;
+def Neon_Low2double : PatFrag<(ops node:$in),
+ (v1f64 (extract_subvector (v2f64 node:$in),
+ (iPTR 0)))>;
+
+class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+ string SrcT, ValueType DestTy, ValueType SrcTy,
+ Operand ImmTy, SDPatternOperator ExtOp>
+ : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+ (ins VPR64:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+ [(set (DestTy VPR128:$Rd),
+ (DestTy (shl
+ (DestTy (ExtOp (SrcTy VPR64:$Rn))),
+ (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
+ NoItinerary>;
+
+class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+ string SrcT, ValueType DestTy, ValueType SrcTy,
+ int StartIndex, Operand ImmTy,
+ SDPatternOperator ExtOp, PatFrag getTop>
+ : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+ (ins VPR128:$Rn, ImmTy:$Imm),
+ asmop # "2\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+ [(set (DestTy VPR128:$Rd),
+ (DestTy (shl
+ (DestTy (ExtOp
+ (SrcTy (getTop VPR128:$Rn)))),
+ (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
+ NoItinerary>;
+
+multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
+ SDNode ExtOp> {
+ // 64-bit vector types.
+ def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8,
+ shl_imm8, ExtOp> {
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ }
+
+ def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16,
+ shl_imm16, ExtOp> {
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ }
+
+ def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32,
+ shl_imm32, ExtOp> {
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ }
+
+ // 128-bit vector types
+ def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8,
+ 8, shl_imm8, ExtOp, Neon_High16B> {
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ }
+
+ def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16,
+ 4, shl_imm16, ExtOp, Neon_High8H> {
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ }
+
+ def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32,
+ 2, shl_imm32, ExtOp, Neon_High4S> {
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ }
+
+ // Use other patterns to match when the immediate is 0.
+ def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))),
+ (!cast<Instruction>(prefix # "_8B") VPR64:$Rn, 0)>;
+
+ def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))),
+ (!cast<Instruction>(prefix # "_4H") VPR64:$Rn, 0)>;
+
+ def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))),
+ (!cast<Instruction>(prefix # "_2S") VPR64:$Rn, 0)>;
+
+ def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))),
+ (!cast<Instruction>(prefix # "_16B") VPR128:$Rn, 0)>;
+
+ def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))),
+ (!cast<Instruction>(prefix # "_8H") VPR128:$Rn, 0)>;
+
+ def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))),
+ (!cast<Instruction>(prefix # "_4S") VPR128:$Rn, 0)>;
+}
+
+// Shift left long
+defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>;
+defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>;
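+
+// Note: a plain sext/zext of the low half is matched by the immediate-0
+// patterns in NeonI_N2VShLL above, which corresponds to the SXTL/UXTL aliases
+// of SSHLL/USHLL with a shift of #0.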
+
+// Rounding/Saturating shift
+class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
+ RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+ SDPatternOperator OpNode>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+ [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
+ (i32 ImmTy:$Imm))))],
+ NoItinerary>;
+
+// shift right (vector by immediate)
+multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop,
+ SDPatternOperator OpNode> {
+ def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+ OpNode> {
+ let Inst{22} = 0b1;
+ }
+}
+
+multiclass NeonI_N2VShL_Q<bit u, bits<5> opcode, string asmop,
+ SDPatternOperator OpNode> {
+ // 64-bit vector types.
+ def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ // 128-bit vector types.
+ def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
+ OpNode> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// Rounding shift right
+defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr",
+ int_aarch64_neon_vsrshr>;
+defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr",
+ int_aarch64_neon_vurshr>;
+
+// Saturating shift left unsigned
+defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>;
+
+// Saturating shift left
+defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>;
+defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>;
+
+class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
+ RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+ SDNode OpNode>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+ [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
+ (Ty (OpNode (Ty VPRC:$Rn),
+ (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+// Shift Right accumulate
+multiclass NeonI_N2VShRAdd<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
+ def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+ OpNode> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// Shift right and accumulate
+defm SSRAvvi : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>;
+defm USRAvvi : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>;
+
+// Rounding shift accumulate
+class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T,
+ RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+ SDPatternOperator OpNode>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+ [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
+ (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_N2VShRAdd_R<bit u, bits<5> opcode, string asmop,
+ SDPatternOperator OpNode> {
+ def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+ OpNode> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+ OpNode> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+ OpNode> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+ OpNode> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// Rounding shift right and accumulate
+defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>;
+defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>;
+
+// Shift insert by immediate
+class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T,
+ RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+ SDPatternOperator OpNode>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+ [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn),
+ (i32 ImmTy:$Imm))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+// shift left insert (vector by immediate)
+multiclass NeonI_N2VShLIns<bit u, bits<5> opcode, string asmop> {
+ def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
+ int_aarch64_neon_vsli> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
+ int_aarch64_neon_vsli> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
+ int_aarch64_neon_vsli> {
+ let Inst{22-21} = 0b01;
+ }
+
+ // 128-bit vector types
+ def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
+ int_aarch64_neon_vsli> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
+ int_aarch64_neon_vsli> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
+ int_aarch64_neon_vsli> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
+ int_aarch64_neon_vsli> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// shift right insert (vector by immediate)
+multiclass NeonI_N2VShRIns<bit u, bits<5> opcode, string asmop> {
+ // 64-bit vector types.
+ def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+ int_aarch64_neon_vsri> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+ int_aarch64_neon_vsri> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+ int_aarch64_neon_vsri> {
+ let Inst{22-21} = 0b01;
+ }
+
+ // 128-bit vector types
+ def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+ int_aarch64_neon_vsri> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+ int_aarch64_neon_vsri> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+ int_aarch64_neon_vsri> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+ int_aarch64_neon_vsri> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// Shift left and insert
+defm SLIvvi : NeonI_N2VShLIns<0b1, 0b01010, "sli">;
+
+// Shift right and insert
+defm SRIvvi : NeonI_N2VShRIns<0b1, 0b01000, "sri">;
+
+class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+ string SrcT, Operand ImmTy>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+ [], NoItinerary>;
+
+class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+ string SrcT, Operand ImmTy>
+ : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+ (ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+ [], NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+// shift right narrow (vector by immediate)
+multiclass NeonI_N2VShR_Narrow<bit u, bits<5> opcode, string asmop> {
+ def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> {
+ let Inst{22-21} = 0b01;
+ }
+
+ // Shift Narrow High
+ def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h",
+ shr_imm8> {
+ let Inst{22-19} = 0b0001;
+ }
+
+ def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s",
+ shr_imm16> {
+ let Inst{22-20} = 0b001;
+ }
+
+ def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d",
+ shr_imm32> {
+ let Inst{22-21} = 0b01;
+ }
+}
+
+// Shift right narrow
+defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">;
+
+// Shift right narrow (prefix Q is saturating, prefix R is rounding)
+defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">;
+defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">;
+defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">;
+defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">;
+defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">;
+defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">;
+defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">;
+
+def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn),
+ (v2i64 (concat_vectors (v1i64 node:$Rm),
+ (v1i64 node:$Rn)))>;
+def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn),
+ (v8i16 (concat_vectors (v4i16 node:$Rm),
+ (v4i16 node:$Rn)))>;
+def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn),
+ (v4i32 (concat_vectors (v2i32 node:$Rm),
+ (v2i32 node:$Rn)))>;
+def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn),
+ (v4f32 (concat_vectors (v2f32 node:$Rm),
+ (v2f32 node:$Rn)))>;
+def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn),
+ (v2f64 (concat_vectors (v1f64 node:$Rm),
+ (v1f64 node:$Rn)))>;
+
+def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
+ (v8i16 (srl (v8i16 node:$lhs),
+ (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
+ (v4i32 (srl (v4i32 node:$lhs),
+ (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
+ (v2i64 (srl (v2i64 node:$lhs),
+ (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
+ (v8i16 (sra (v8i16 node:$lhs),
+ (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
+ (v4i32 (sra (v4i32 node:$lhs),
+ (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
+ (v2i64 (sra (v2i64 node:$lhs),
+ (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
+
+// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
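+// For instance (a sketch of what the first pattern below matches): a v8i16
+// lshr by a splatted immediate followed by a trunc to v8i8, i.e.
+//   (v8i8 (trunc (srl (v8i16 $Rn), (Neon_vdup 3)))),
+// is selected to (SHRNvvi_8B $Rn, 3).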
+multiclass Neon_shiftNarrow_patterns<string shr> {
+ def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
+ (i32 shr_imm8:$Imm)))),
+ (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
+ (i32 shr_imm16:$Imm)))),
+ (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
+ (i32 shr_imm32:$Imm)))),
+ (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;
+
+ def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+ (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
+ VPR128:$Rn, (i32 shr_imm8:$Imm))))))),
+ (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+ VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+ (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
+ VPR128:$Rn, (i32 shr_imm16:$Imm))))))),
+ (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+ (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
+ VPR128:$Rn, (i32 shr_imm32:$Imm))))))),
+ (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ VPR128:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_shiftNarrow_QR_patterns<SDPatternOperator op, string prefix> {
+ def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)),
+ (!cast<Instruction>(prefix # "_8B") VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)),
+ (!cast<Instruction>(prefix # "_4H") VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)),
+ (!cast<Instruction>(prefix # "_2S") VPR128:$Rn, imm:$Imm)>;
+
+ def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+ (v1i64 (bitconvert (v8i8
+ (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))),
+ (!cast<Instruction>(prefix # "_16B")
+ (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+ (v1i64 (bitconvert (v4i16
+ (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))),
+ (!cast<Instruction>(prefix # "_8H")
+ (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ VPR128:$Rn, imm:$Imm)>;
+ def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+ (v1i64 (bitconvert (v2i32
+ (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))),
+ (!cast<Instruction>(prefix # "_4S")
+ (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ VPR128:$Rn, imm:$Imm)>;
+}
+
+defm : Neon_shiftNarrow_patterns<"lshr">;
+defm : Neon_shiftNarrow_patterns<"ashr">;
+
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrun, "QSHRUNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vrshrn, "RSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrun, "QRSHRUNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrn, "SQSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqshrn, "UQSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrn, "SQRSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqrshrn, "UQRSHRNvvi">;
+
+// Convert between fixed-point and floating-point
+class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T,
+ RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy,
+ Operand ImmTy, SDPatternOperator IntOp>
+ : NeonI_2VShiftImm<q, u, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+ asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+ [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn),
+ (i32 ImmTy:$Imm))))],
+ NoItinerary>;
+
+multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop,
+ SDPatternOperator IntOp> {
+ def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32,
+ shr_imm32, IntOp> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32,
+ shr_imm32, IntOp> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64,
+ shr_imm64, IntOp> {
+ let Inst{22} = 0b1;
+ }
+}
+
+multiclass NeonI_N2VCvt_Fp2fx<bit u, bits<5> opcode, string asmop,
+ SDPatternOperator IntOp> {
+ def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32,
+ shr_imm32, IntOp> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32,
+ shr_imm32, IntOp> {
+ let Inst{22-21} = 0b01;
+ }
+
+ def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64,
+ shr_imm64, IntOp> {
+ let Inst{22} = 0b1;
+ }
+}
+
+// Convert fixed-point to floating-point
+defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf",
+ int_arm_neon_vcvtfxs2fp>;
+defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf",
+ int_arm_neon_vcvtfxu2fp>;
+
+// Convert floating-point to fixed-point
+defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs",
+ int_arm_neon_vcvtfp2fxs>;
+defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu",
+ int_arm_neon_vcvtfp2fxu>;
+
+multiclass Neon_sshll2_0<SDNode ext>
+{
+ def _v8i8 : PatFrag<(ops node:$Rn),
+ (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>;
+ def _v4i16 : PatFrag<(ops node:$Rn),
+ (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>;
+ def _v2i32 : PatFrag<(ops node:$Rn),
+ (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>;
+}
+
+defm NI_sext_high : Neon_sshll2_0<sext>;
+defm NI_zext_high : Neon_sshll2_0<zext>;
+
+
+//===----------------------------------------------------------------------===//
+// Multiclasses for NeonI_Across
+//===----------------------------------------------------------------------===//
+
+// Variant 1
+
+multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
+ string asmop, SDPatternOperator opnode>
+{
+ def _1h8b: NeonI_2VAcross<0b0, u, 0b00, opcode,
+ (outs FPR16:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd, $Rn.8b",
+ [(set (v1i16 FPR16:$Rd),
+ (v1i16 (opnode (v8i8 VPR64:$Rn))))],
NoItinerary>;
-multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
- string asmop, bit Commutable = 0>
+ def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
+ (outs FPR16:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.16b",
+ [(set (v1i16 FPR16:$Rd),
+ (v1i16 (opnode (v16i8 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def _1s4h: NeonI_2VAcross<0b0, u, 0b01, opcode,
+ (outs FPR32:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd, $Rn.4h",
+ [(set (v1i32 FPR32:$Rd),
+ (v1i32 (opnode (v4i16 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def _1s8h: NeonI_2VAcross<0b1, u, 0b01, opcode,
+ (outs FPR32:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.8h",
+ [(set (v1i32 FPR32:$Rd),
+ (v1i32 (opnode (v8i16 VPR128:$Rn))))],
+ NoItinerary>;
+
+ // _1d2s doesn't exist!
+
+ def _1d4s: NeonI_2VAcross<0b1, u, 0b10, opcode,
+ (outs FPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.4s",
+ [(set (v1i64 FPR64:$Rd),
+ (v1i64 (opnode (v4i32 VPR128:$Rn))))],
+ NoItinerary>;
+}
+
+defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>;
+defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>;
+
+// Variant 2
+
+multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
+ string asmop, SDPatternOperator opnode>
{
+ def _1b8b: NeonI_2VAcross<0b0, u, 0b00, opcode,
+ (outs FPR8:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd, $Rn.8b",
+ [(set (v1i8 FPR8:$Rd),
+ (v1i8 (opnode (v8i8 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
+ (outs FPR8:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.16b",
+ [(set (v1i8 FPR8:$Rd),
+ (v1i8 (opnode (v16i8 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def _1h4h: NeonI_2VAcross<0b0, u, 0b01, opcode,
+ (outs FPR16:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd, $Rn.4h",
+ [(set (v1i16 FPR16:$Rd),
+ (v1i16 (opnode (v4i16 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def _1h8h: NeonI_2VAcross<0b1, u, 0b01, opcode,
+ (outs FPR16:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.8h",
+ [(set (v1i16 FPR16:$Rd),
+ (v1i16 (opnode (v8i16 VPR128:$Rn))))],
+ NoItinerary>;
+
+ // _1s2s doesn't exist!
+
+ def _1s4s: NeonI_2VAcross<0b1, u, 0b10, opcode,
+ (outs FPR32:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.4s",
+ [(set (v1i32 FPR32:$Rd),
+ (v1i32 (opnode (v4i32 VPR128:$Rn))))],
+ NoItinerary>;
+}
+
+defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>;
+defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>;
+
+defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>;
+defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>;
+
+defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>;
+
+// Variant 3
+
+multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size,
+ string asmop, SDPatternOperator opnode> {
+ def _1s4s: NeonI_2VAcross<0b1, u, size, opcode,
+ (outs FPR32:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd, $Rn.4s",
+ [(set (v1f32 FPR32:$Rd),
+ (v1f32 (opnode (v4f32 VPR128:$Rn))))],
+ NoItinerary>;
+}
+
+defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv",
+ int_aarch64_neon_vmaxnmv>;
+defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv",
+ int_aarch64_neon_vminnmv>;
+
+defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv",
+ int_aarch64_neon_vmaxv>;
+defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv",
+ int_aarch64_neon_vminv>;
+
+// The following definitions are for instruction class (Perm)
+
+class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
+ string asmop, RegisterOperand OpVPR, string OpS,
+ SDPatternOperator opnode, ValueType Ty>
+ : NeonI_Perm<q, size, opcode,
+ (outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (Ty OpVPR:$Rd),
+ (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
+ NoItinerary>;
+
+multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop,
+ VPR64, "8b", opnode, v8i8>;
+ def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop,
+ VPR128, "16b",opnode, v16i8>;
+ def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop,
+ VPR64, "4h", opnode, v4i16>;
+ def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop,
+ VPR128, "8h", opnode, v8i16>;
+ def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop,
+ VPR64, "2s", opnode, v2i32>;
+ def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop,
+ VPR128, "4s", opnode, v4i32>;
+ def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop,
+ VPR128, "2d", opnode, v2i64>;
+}
+
+defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>;
+defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>;
+defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>;
+defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>;
+defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>;
+defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>;
+
+multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> {
+ def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
+ (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>;
+
+ def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
+ (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>;
+
+ def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
+ (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>;
+}
+
+defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>;
+defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>;
+defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>;
+defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>;
+defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>;
+defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>;
+
+// The following definitions are for instruction class (3V Diff)
+
+// normal long/long2 pattern
+class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode, SDPatternOperator ext,
+ RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))),
+ (ResTy (ext (OpTy OpVPR:$Rm))))))],
+ NoItinerary>;
+
+multiclass NeonI_3VDL_s<bit u, bits<4> opcode,
+ string asmop, SDPatternOperator opnode,
+ bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, sext, VPR64, v8i16, v8i8>;
+ def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, sext, VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, sext, VPR64, v2i64, v2i32>;
+ }
+}
+
+multiclass NeonI_3VDL2_s<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
+ def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
+ def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
+ }
+}
+
+multiclass NeonI_3VDL_u<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, zext, VPR64, v8i16, v8i8>;
+ def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, zext, VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, zext, VPR64, v2i64, v2i32>;
+ }
+}
+
+multiclass NeonI_3VDL2_u<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
+ def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
+ def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
+ }
+}
+
+defm SADDLvvv : NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>;
+defm UADDLvvv : NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>;
+
+defm SADDL2vvv : NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>;
+defm UADDL2vvv : NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>;
+
+defm SSUBLvvv : NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>;
+defm USUBLvvv : NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>;
+
+defm SSUBL2vvv : NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>;
+defm USUBL2vvv : NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>;
+
+// normal wide/wide2 pattern
+class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode, SDPatternOperator ext,
+ RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # ResS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (opnode (ResTy VPR128:$Rn),
+ (ResTy (ext (OpTy OpVPR:$Rm))))))],
+ NoItinerary>;
+
+multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, sext, VPR64, v8i16, v8i8>;
+ def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, sext, VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, sext, VPR64, v2i64, v2i32>;
+}
+
+defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>;
+defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>;
+
+multiclass NeonI_3VDW2_s<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
+ def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
+ def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
+}
+
+defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>;
+defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>;
+
+multiclass NeonI_3VDW_u<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, zext, VPR64, v8i16, v8i8>;
+ def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, zext, VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, zext, VPR64, v2i64, v2i32>;
+}
+
+defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>;
+defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>;
+
+multiclass NeonI_3VDW2_u<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
+ def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
+ def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
+}
+
+defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>;
+defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>;
+
+// Get the high half of each vector element.
+multiclass NeonI_get_high {
+ def _8h : PatFrag<(ops node:$Rn),
+ (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
+ (v8i16 (Neon_vdup (i32 8)))))))>;
+ def _4s : PatFrag<(ops node:$Rn),
+ (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
+ (v4i32 (Neon_vdup (i32 16)))))))>;
+ def _2d : PatFrag<(ops node:$Rn),
+ (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
+ (v2i64 (Neon_vdup (i32 32)))))))>;
+}
+
+defm NI_get_hi : NeonI_get_high;
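+
+// For example, NI_get_hi_8h shifts each 16-bit element right by 8 and then
+// truncates it to 8 bits, so an element holding 0xABCD yields 0xAB, the high
+// half of the original element.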
+
+// pattern for addhn/subhn with 2 operands
+class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode, SDPatternOperator get_hi,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR64:$Rd),
+ (ResTy (get_hi
+ (OpTy (opnode (OpTy VPR128:$Rn),
+ (OpTy VPR128:$Rm))))))],
+ NoItinerary>;
+
+multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
+ opnode, NI_get_hi_8h, v8i8, v8i16>;
+ def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
+ opnode, NI_get_hi_4s, v4i16, v4i32>;
+ def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
+ opnode, NI_get_hi_2d, v2i32, v2i64>;
+ }
+}
+
+defm ADDHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>;
+defm SUBHNvvv : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>;
+
+// pattern for operation with 2 operands
+class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode,
+ RegisterOperand ResVPR, RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs ResVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy ResVPR:$Rd),
+ (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))],
+ NoItinerary>;
+
+// normal narrow pattern
+multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
+ opnode, VPR64, VPR128, v8i8, v8i16>;
+ def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
+ opnode, VPR64, VPR128, v4i16, v4i32>;
+ def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
+ opnode, VPR64, VPR128, v2i32, v2i64>;
+ }
+}
+
+defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>;
+defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>;
+
+// pattern for ACLE intrinsic with 3 operands
+class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [], NoItinerary> {
+ let Constraints = "$src = $Rd";
+ let neverHasSideEffects = 1;
+}
+
+multiclass NeonI_3VDN_3Op_v1<bit u, bits<4> opcode, string asmop> {
+ def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">;
+ def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">;
+ def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">;
+}
+
+defm ADDHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">;
+defm SUBHN2vvv : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">;
+
+defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">;
+defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">;
+
+// Patterns have to be separate because there's a SUBREG_TO_REG in the output
+// part.
+class NarrowHighHalfPat<Instruction INST, ValueType DstTy, ValueType SrcTy,
+ SDPatternOperator coreop>
+ : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+ (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn),
+ (SrcTy VPR128:$Rm)))))),
+ (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ VPR128:$Rn, VPR128:$Rm)>;
+
+// addhn2 patterns
+def : NarrowHighHalfPat<ADDHN2vvv_16b8h, v8i8, v8i16,
+ BinOpFrag<(NI_get_hi_8h (add node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<ADDHN2vvv_8h4s, v4i16, v4i32,
+ BinOpFrag<(NI_get_hi_4s (add node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<ADDHN2vvv_4s2d, v2i32, v2i64,
+ BinOpFrag<(NI_get_hi_2d (add node:$LHS, node:$RHS))>>;
+
+// subhn2 patterns
+def : NarrowHighHalfPat<SUBHN2vvv_16b8h, v8i8, v8i16,
+ BinOpFrag<(NI_get_hi_8h (sub node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<SUBHN2vvv_8h4s, v4i16, v4i32,
+ BinOpFrag<(NI_get_hi_4s (sub node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<SUBHN2vvv_4s2d, v2i32, v2i64,
+ BinOpFrag<(NI_get_hi_2d (sub node:$LHS, node:$RHS))>>;
+
+// raddhn2 patterns
+def : NarrowHighHalfPat<RADDHN2vvv_16b8h, v8i8, v8i16, int_arm_neon_vraddhn>;
+def : NarrowHighHalfPat<RADDHN2vvv_8h4s, v4i16, v4i32, int_arm_neon_vraddhn>;
+def : NarrowHighHalfPat<RADDHN2vvv_4s2d, v2i32, v2i64, int_arm_neon_vraddhn>;
+
+// rsubhn2 patterns
+def : NarrowHighHalfPat<RSUBHN2vvv_16b8h, v8i8, v8i16, int_arm_neon_vrsubhn>;
+def : NarrowHighHalfPat<RSUBHN2vvv_8h4s, v4i16, v4i32, int_arm_neon_vrsubhn>;
+def : NarrowHighHalfPat<RSUBHN2vvv_4s2d, v2i32, v2i64, int_arm_neon_vrsubhn>;
+
+// patterns that need to extend the result
+class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode,
+ RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy, ValueType OpSTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn),
+ (OpTy OpVPR:$Rm))))))],
+ NoItinerary>;
+
+multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, VPR64, v8i16, v8i8, v8i8>;
+ def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, VPR64, v4i32, v4i16, v4i16>;
+ def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, VPR64, v2i64, v2i32, v2i32>;
+ }
+}
+
+defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>;
+defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>;
+
+multiclass NeonI_Op_High<SDPatternOperator op> {
+ def _16B : PatFrag<(ops node:$Rn, node:$Rm),
+ (op (v8i8 (Neon_High16B node:$Rn)),
+ (v8i8 (Neon_High16B node:$Rm)))>;
+ def _8H : PatFrag<(ops node:$Rn, node:$Rm),
+ (op (v4i16 (Neon_High8H node:$Rn)),
+ (v4i16 (Neon_High8H node:$Rm)))>;
+ def _4S : PatFrag<(ops node:$Rn, node:$Rm),
+ (op (v2i32 (Neon_High4S node:$Rn)),
+ (v2i32 (Neon_High4S node:$Rm)))>;
+}
+
+defm NI_sabdl_hi : NeonI_Op_High<int_arm_neon_vabds>;
+defm NI_uabdl_hi : NeonI_Op_High<int_arm_neon_vabdu>;
+defm NI_smull_hi : NeonI_Op_High<int_arm_neon_vmulls>;
+defm NI_umull_hi : NeonI_Op_High<int_arm_neon_vmullu>;
+defm NI_qdmull_hi : NeonI_Op_High<int_arm_neon_vqdmull>;
+defm NI_pmull_hi : NeonI_Op_High<int_arm_neon_vmullp>;
+
+multiclass NeonI_3VDL_Abd_u<bit u, bits<4> opcode, string asmop, string opnode,
+ bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h8b : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ !cast<PatFrag>(opnode # "_16B"),
+ VPR128, v8i16, v16i8, v8i8>;
+ def _4s4h : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ !cast<PatFrag>(opnode # "_8H"),
+ VPR128, v4i32, v8i16, v4i16>;
+ def _2d2s : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ !cast<PatFrag>(opnode # "_4S"),
+ VPR128, v2i64, v4i32, v2i32>;
+ }
+}
+
+defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>;
+defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>;
+
+// For patterns that need two operators chained together.
+class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode, SDPatternOperator subop,
+ RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy, ValueType OpSTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (opnode
+ (ResTy VPR128:$src),
+ (ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn),
+ (OpTy OpVPR:$Rm))))))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL_Aba_v1<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, SDPatternOperator subop>{
+ def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, subop, VPR64, v8i16, v8i8, v8i8>;
+ def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, subop, VPR64, v4i32, v4i16, v4i16>;
+ def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, subop, VPR64, v2i64, v2i32, v2i32>;
+}
+
+defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal",
+ add, int_arm_neon_vabds>;
+defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal",
+ add, int_arm_neon_vabdu>;
+
+multiclass NeonI_3VDL2_Aba_v1<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, string subop> {
+ def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ opnode, !cast<PatFrag>(subop # "_16B"),
+ VPR128, v8i16, v16i8, v8i8>;
+ def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ opnode, !cast<PatFrag>(subop # "_8H"),
+ VPR128, v4i32, v8i16, v4i16>;
+ def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ opnode, !cast<PatFrag>(subop # "_4S"),
+ VPR128, v2i64, v4i32, v2i32>;
+}
+
+defm SABAL2vvv : NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add,
+ "NI_sabdl_hi">;
+defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add,
+ "NI_uabdl_hi">;
+
+// Long pattern with 2 operands
+multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, VPR128, VPR64, v8i16, v8i8>;
+ def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, VPR128, VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, VPR128, VPR64, v2i64, v2i32>;
+ }
+}
+
+defm SMULLvvv : NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>;
+defm UMULLvvv : NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>;
+
+class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))],
+ NoItinerary>;
+
+multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop,
+ string opnode, bit Commutable = 0> {
let isCommutable = Commutable in {
- def bbb : NeonI_Scalar3Same<u, 0b00, opcode,
- (outs FPR8:$Rd), (ins FPR8:$Rn, FPR8:$Rm),
- !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ !cast<PatFrag>(opnode # "_16B"),
+ v8i16, v16i8>;
+ def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ !cast<PatFrag>(opnode # "_8H"),
+ v4i32, v8i16>;
+ def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ !cast<PatFrag>(opnode # "_4S"),
+ v2i64, v4i32>;
+ }
+}
+
+defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2",
+ "NI_smull_hi", 1>;
+defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2",
+ "NI_umull_hi", 1>;
+
+// Long pattern with 3 operands
+class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator opnode,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (opnode
+ (ResTy VPR128:$src),
+ (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL_3Op_v1<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, v8i16, v8i8>;
+ def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, v4i32, v4i16>;
+ def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, v2i64, v2i32>;
+}
+
+def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+ (add node:$Rd,
+ (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
+
+def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+ (add node:$Rd,
+ (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
+
+def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+ (sub node:$Rd,
+ (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
+
+def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+ (sub node:$Rd,
+ (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
+
+defm SMLALvvv : NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>;
+defm UMLALvvv : NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>;
+
+defm SMLSLvvv : NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>;
+defm UMLSLvvv : NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>;
+
+class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS,
+ SDPatternOperator subop, SDPatternOperator opnode,
+ RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy>
+ : NeonI_3VDiff<q, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (subop
+ (ResTy VPR128:$src),
+ (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL2_3Op_mlas_v1<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator subop, string opnode> {
+ def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ subop, !cast<PatFrag>(opnode # "_16B"),
+ VPR128, v8i16, v16i8>;
+ def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ subop, !cast<PatFrag>(opnode # "_8H"),
+ VPR128, v4i32, v8i16>;
+ def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ subop, !cast<PatFrag>(opnode # "_4S"),
+ VPR128, v2i64, v4i32>;
+}
+
+defm SMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2",
+ add, "NI_smull_hi">;
+defm UMLAL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2",
+ add, "NI_umull_hi">;
+
+defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2",
+ sub, "NI_smull_hi">;
+defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2",
+ sub, "NI_umull_hi">;
+
+multiclass NeonI_3VDL_qdmlal_3Op_v2<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, int_arm_neon_vqdmull,
+ VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, int_arm_neon_vqdmull,
+ VPR64, v2i64, v2i32>;
+}
+
+defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal",
+ int_arm_neon_vqadds>;
+defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl",
+ int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+ opnode, VPR128, VPR64, v4i32, v4i16>;
+ def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+ opnode, VPR128, VPR64, v2i64, v2i32>;
+ }
+}
+
+defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull",
+ int_arm_neon_vqdmull, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop,
+ string opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ !cast<PatFrag>(opnode # "_8H"),
+ v4i32, v8i16>;
+ def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ !cast<PatFrag>(opnode # "_4S"),
+ v2i64, v4i32>;
+ }
+}
+
+defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2",
+ "NI_qdmull_hi", 1>;
+
+multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+ opnode, NI_qdmull_hi_8H,
+ VPR128, v4i32, v8i16>;
+ def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+ opnode, NI_qdmull_hi_4S,
+ VPR128, v2i64, v4i32>;
+}
+
+defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2",
+ int_arm_neon_vqadds>;
+defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
+ int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
+ SDPatternOperator opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+ opnode, VPR128, VPR64, v8i16, v8i8>;
+
+ def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
+ [], NoItinerary>;
+ }
+}
+
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
+ string opnode, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+ !cast<PatFrag>(opnode # "_16B"),
+ v8i16, v16i8>;
+
+ def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+ [], NoItinerary>;
+ }
+}
+
+defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
+ 1>;
+
+// End of implementation for instruction class (3V Diff)
+
+// The following are the vector load/store multiple N-element structure
+// instructions (class SIMD lselem).
+
+// ld1: load multiple 1-element structures to 1/2/3/4 registers.
+// ld2/ld3/ld4: load multiple N-element structures to N registers (N = 2, 3, 4).
+// The structure consists of a sequence of sets of N values.
+// The first element of the structure is placed in the first lane
+// of the first vector, the second element in the first lane
+// of the second vector, and so on.
+// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
+// the list of three 64-bit vectors {BA, DC, FE}.
+// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the list of
+// three 64-bit vectors {DA, EB, FC}.
+// The store instructions write multiple structures from N registers to memory,
+// mirroring the loads.
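+//
+// Illustrative sketch (not part of this patch): at the ACLE level the LD3_2S
+// example above corresponds roughly to
+//   int32x2x3_t v = vld3_s32(p);  // v.val[0] = {A, D}, v.val[1] = {B, E}, v.val[2] = {C, F}
+// i.e. consecutive memory elements are de-interleaved across the N registers.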
+
+
+class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
+ RegisterOperand VecList, string asmop>
+ : NeonI_LdStMult<q, 1, opcode, size,
+ (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+ asmop # "\t$Rt, [$Rn]",
+ [],
+ NoItinerary> {
+ let mayLoad = 1;
+ let neverHasSideEffects = 1;
+}
+
+multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> {
+ def _8B : NeonI_LDVList<0, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+ def _4H : NeonI_LDVList<0, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+ def _2S : NeonI_LDVList<0, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+ def _16B : NeonI_LDVList<1, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+ def _8H : NeonI_LDVList<1, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+ def _4S : NeonI_LDVList<1, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+ def _2D : NeonI_LDVList<1, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
+defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
+def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
+
+defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
+
+defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+
+defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+
+// Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
+defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
+def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
+
+defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">;
+def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">;
+
+defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">;
+def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">;
+
+class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
+ RegisterOperand VecList, string asmop>
+ : NeonI_LdStMult<q, 0, opcode, size,
+ (outs), (ins GPR64xsp:$Rn, VecList:$Rt),
+ asmop # "\t$Rt, [$Rn]",
+ [],
+ NoItinerary> {
+ let mayStore = 1;
+ let neverHasSideEffects = 1;
+}
+
+multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> {
+ def _8B : NeonI_STVList<0, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+ def _4H : NeonI_STVList<0, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+ def _2S : NeonI_STVList<0, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+ def _16B : NeonI_STVList<1, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+ def _8H : NeonI_STVList<1, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+ def _4S : NeonI_STVList<1, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+ def _2D : NeonI_STVList<1, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Store multiple N-element structures from N registers (N = 1,2,3,4)
+defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
+def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
+
+defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
+
+defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
+
+defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+
+// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
+defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
+def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
+
+defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
+def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
+
+defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
+def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
+
+def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+
+def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+
+def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
+def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
+
+def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+
+def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+
+def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
+def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
+
+def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
+ (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
+ (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
+ (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
+ (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
+ (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
+ (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
+ (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
+ (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+
+def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
+ (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
+ (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+
+def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
+ (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
+ (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+
+// End of vector load/store multiple N-element structure(class SIMD lselem)
+
+// The following are the post-index vector load/store multiple N-element
+// structure instructions (class SIMD lselem-post).
+// For the immediate (fixed-offset) forms, the post-increment must equal the
+// total number of bytes transferred; the uimm_exactN operands below encode
+// exactly those values.
+def exact1_asmoperand : AsmOperandClass {
+ let Name = "Exact1";
+ let PredicateMethod = "isExactImm<1>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> {
+ let ParserMatchClass = exact1_asmoperand;
+}
+
+def exact2_asmoperand : AsmOperandClass {
+ let Name = "Exact2";
+ let PredicateMethod = "isExactImm<2>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> {
+ let ParserMatchClass = exact2_asmoperand;
+}
+
+def exact3_asmoperand : AsmOperandClass {
+ let Name = "Exact3";
+ let PredicateMethod = "isExactImm<3>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> {
+ let ParserMatchClass = exact3_asmoperand;
+}
+
+def exact4_asmoperand : AsmOperandClass {
+ let Name = "Exact4";
+ let PredicateMethod = "isExactImm<4>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> {
+ let ParserMatchClass = exact4_asmoperand;
+}
+
+def exact6_asmoperand : AsmOperandClass {
+ let Name = "Exact6";
+ let PredicateMethod = "isExactImm<6>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> {
+ let ParserMatchClass = exact6_asmoperand;
+}
+
+def exact8_asmoperand : AsmOperandClass {
+ let Name = "Exact8";
+ let PredicateMethod = "isExactImm<8>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> {
+ let ParserMatchClass = exact8_asmoperand;
+}
+
+def exact12_asmoperand : AsmOperandClass {
+ let Name = "Exact12";
+ let PredicateMethod = "isExactImm<12>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> {
+ let ParserMatchClass = exact12_asmoperand;
+}
+
+def exact16_asmoperand : AsmOperandClass {
+ let Name = "Exact16";
+ let PredicateMethod = "isExactImm<16>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact16 : Operand<i32>, ImmLeaf<i32, [{return Imm == 16;}]> {
+ let ParserMatchClass = exact16_asmoperand;
+}
+
+def exact24_asmoperand : AsmOperandClass {
+ let Name = "Exact24";
+ let PredicateMethod = "isExactImm<24>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact24 : Operand<i32>, ImmLeaf<i32, [{return Imm == 24;}]> {
+ let ParserMatchClass = exact24_asmoperand;
+}
+
+def exact32_asmoperand : AsmOperandClass {
+ let Name = "Exact32";
+ let PredicateMethod = "isExactImm<32>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact32 : Operand<i32>, ImmLeaf<i32, [{return Imm == 32;}]> {
+ let ParserMatchClass = exact32_asmoperand;
+}
+
+def exact48_asmoperand : AsmOperandClass {
+ let Name = "Exact48";
+ let PredicateMethod = "isExactImm<48>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact48 : Operand<i32>, ImmLeaf<i32, [{return Imm == 48;}]> {
+ let ParserMatchClass = exact48_asmoperand;
+}
+
+def exact64_asmoperand : AsmOperandClass {
+ let Name = "Exact64";
+ let PredicateMethod = "isExactImm<64>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact64 : Operand<i32>, ImmLeaf<i32, [{return Imm == 64;}]> {
+ let ParserMatchClass = exact64_asmoperand;
+}
+
+multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
+ RegisterOperand VecList, Operand ImmTy,
+ string asmop> {
+ let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1,
+ DecoderMethod = "DecodeVLDSTPostInstruction" in {
+ def _fixed : NeonI_LdStMult_Post<q, 1, opcode, size,
+ (outs VecList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt),
+ asmop # "\t$Rt, [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ def _register : NeonI_LdStMult_Post<q, 1, opcode, size,
+ (outs VecList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+ asmop # "\t$Rt, [$Rn], $Rm",
+ [],
+ NoItinerary>;
+ }
+}
+
+multiclass LDWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
+ Operand ImmTy2, string asmop> {
+ defm _8B : NeonI_LDWB_VList<0, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"),
+ ImmTy, asmop>;
+
+ defm _4H : NeonI_LDWB_VList<0, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"),
+ ImmTy, asmop>;
+
+ defm _2S : NeonI_LDWB_VList<0, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"),
+ ImmTy, asmop>;
+
+ defm _16B : NeonI_LDWB_VList<1, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"),
+ ImmTy2, asmop>;
+
+ defm _8H : NeonI_LDWB_VList<1, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"),
+ ImmTy2, asmop>;
+
+ defm _4S : NeonI_LDWB_VList<1, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"),
+ ImmTy2, asmop>;
+
+ defm _2D : NeonI_LDWB_VList<1, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"),
+ ImmTy2, asmop>;
+}
+
+// Post-index load multiple N-element structures to N consecutive registers (N = 1,2,3,4)
+defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
+defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+ "ld1">;
+
+defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
+
+defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+ "ld3">;
+
+defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
+
+// Post-index load multiple 1-element structures to N consecutive registers
+// (N = 2,3,4)
+defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+ "ld1">;
+defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+ uimm_exact16, "ld1">;
+
+defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+ "ld1">;
+defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+ uimm_exact24, "ld1">;
+
+defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+ "ld1">;
+defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+ uimm_exact32, "ld1">;
+
+multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
+ RegisterOperand VecList, Operand ImmTy,
+ string asmop> {
+ let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1,
+ DecoderMethod = "DecodeVLDSTPostInstruction" in {
+ def _fixed : NeonI_LdStMult_Post<q, 0, opcode, size,
+ (outs GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt),
+ asmop # "\t$Rt, [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ def _register : NeonI_LdStMult_Post<q, 0, opcode, size,
+ (outs GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt),
+ asmop # "\t$Rt, [$Rn], $Rm",
+ [],
+ NoItinerary>;
+ }
+}
+
+multiclass STWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
+ Operand ImmTy2, string asmop> {
+ defm _8B : NeonI_STWB_VList<0, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"), ImmTy, asmop>;
+
+ defm _4H : NeonI_STWB_VList<0, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"),
+ ImmTy, asmop>;
+
+ defm _2S : NeonI_STWB_VList<0, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"),
+ ImmTy, asmop>;
+
+ defm _16B : NeonI_STWB_VList<1, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"),
+ ImmTy2, asmop>;
+
+ defm _8H : NeonI_STWB_VList<1, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"),
+ ImmTy2, asmop>;
+
+ defm _4S : NeonI_STWB_VList<1, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"),
+ ImmTy2, asmop>;
+
+ defm _2D : NeonI_STWB_VList<1, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"),
+ ImmTy2, asmop>;
+}
+
+// Post-index store multiple N-element structures from N registers (N = 1,2,3,4)
+defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
+defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+ "st1">;
+
+defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
+
+defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+ "st3">;
+
+defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
+
+// Post-index store multiple 1-element structures from N consecutive registers
+// (N = 2,3,4)
+defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+ "st1">;
+defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+ uimm_exact16, "st1">;
+
+defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+ "st1">;
+defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+ uimm_exact24, "st1">;
+
+defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+ "st1">;
+defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+ uimm_exact32, "st1">;
+
+// End of post-index vector load/store multiple N-element structure
+// (class SIMD lselem-post)
+
+// The following are the vector load/store single N-element structure
+// instructions (class SIMD lsone).
+def neon_uimm0_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm == 0;}]> {
+ let ParserMatchClass = neon_uimm0_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm1_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 2;}]> {
+ let ParserMatchClass = neon_uimm1_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm2_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 4;}]> {
+ let ParserMatchClass = neon_uimm2_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm3_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 8;}]> {
+ let ParserMatchClass = uimm3_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm4_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 16;}]> {
+ let ParserMatchClass = uimm4_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+ RegisterOperand VecList, string asmop>
+ : NeonI_LdOne_Dup<q, r, opcode, size,
+ (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+ asmop # "\t$Rt, [$Rn]",
+ [],
+ NoItinerary> {
+ let mayLoad = 1;
+ let neverHasSideEffects = 1;
+}
+
+multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> {
+ def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+ def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+ def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+ def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "1D_operand"), asmop>;
+
+ def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+ def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+ def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+ def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load single 1-element structure to all lanes of 1 register
+defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
+
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+
+
+class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
+ Instruction INST>
+ : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
+ (VTy (INST GPR64xsp:$Rn))>;
+
+// Match all LD1R instructions
+def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
+
+def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
+
+def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
+
+def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
+
+def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
+def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
+
+def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
+def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
+
+def : LD1R_pattern<v1i64, i64, load, LD1R_1D>;
+def : LD1R_pattern<v1f64, f64, load, LD1R_1D>;
+
+def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
+def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
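+
+// Illustrative sketch (not part of this patch): the LD1R patterns above target
+// the "duplicate a loaded scalar to all lanes" idiom, roughly
+//   float32x4_t v = vdupq_n_f32(*p);  // intended to select as: ld1r {v0.4s}, [x0]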
+
+
+multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
+ RegisterClass RegList> {
+ defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
+ defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
+ defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
+ defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
+}
+
+// Special vector list operands of 128-bit vectors with a bare layout,
+// i.e. printed only as ".b", ".h", ".s", ".d".
+defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
+defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
+defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
+defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
+
+class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane<1, r, op2_1, op0,
+ (outs VList:$Rt),
+ (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn]",
+ [],
+ NoItinerary> {
+ let mayLoad = 1;
+ let neverHasSideEffects = 1;
+ let hasExtraDefRegAllocReq = 1;
+ let Constraints = "$src = $Rt";
+}
+
+multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+ def _B : NeonI_LDN_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H : NeonI_LDN_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S : NeonI_LDN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D : NeonI_LDN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Load single 1-element structure to one lane of 1 register.
+defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
+
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
+defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
+defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+
+multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+ Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
+ Instruction INST> {
+ def : Pat<(VTy (vector_insert (VTy VPR64:$src),
+ (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
+ (VTy (EXTRACT_SUBREG
+ (INST GPR64xsp:$Rn,
+ (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ ImmOp:$lane),
+ sub_64))>;
+
+ def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
+ (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
+ (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
+}
+
+// Match all LD1LN instructions
+defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+ extloadi8, LD1LN_B>;
+
+defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+ extloadi16, LD1LN_H>;
+
+defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+ load, LD1LN_S>;
+defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+ load, LD1LN_S>;
+
+defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+ load, LD1LN_D>;
+defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+ load, LD1LN_D>;
+
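+// Illustrative sketch (not part of this patch): the LD1LN patterns above cover
+// a lane insert fed by a load, roughly
+//   float32x4_t v2 = vld1q_lane_f32(p, v, 1);  // intended to select as: ld1 {v0.s}[1], [x0]
+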
+class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane<0, r, op2_1, op0,
+ (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn]",
+ [],
+ NoItinerary> {
+ let mayStore = 1;
+ let neverHasSideEffects = 1;
+ let hasExtraDefRegAllocReq = 1;
+}
+
+multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+ def _B : NeonI_STN_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H : NeonI_STN_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S : NeonI_STN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D : NeonI_STN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ neon_uimm1_bare, asmop>{
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Store single 1-element structure from one lane of 1 register.
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
+
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
+defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
+defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+
+multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+ Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
+ Instruction INST> {
+ def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)),
+ GPR64xsp:$Rn),
+ (INST GPR64xsp:$Rn,
+ (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
+ ImmOp:$lane)>;
+
+ def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)),
+ GPR64xsp:$Rn),
+ (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
+}
+
+// Match all ST1LN instructions
+defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+ truncstorei8, ST1LN_B>;
+
+defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+ truncstorei16, ST1LN_H>;
+
+defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+ store, ST1LN_S>;
+defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+ store, ST1LN_S>;
+
+defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+ store, ST1LN_D>;
+defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+ store, ST1LN_D>;
+
+// End of vector load/store single N-element structure (class SIMD lsone).
+
+
+// The following are post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+ RegisterOperand VecList, Operand ImmTy,
+ string asmop> {
+ let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn",
+ DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+ def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+ (outs VecList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt),
+ asmop # "\t$Rt, [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+ (outs VecList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+ asmop # "\t$Rt, [$Rn], $Rm",
+ [],
+ NoItinerary>;
+ }
+}
+
+multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop,
+ Operand uimm_b, Operand uimm_h,
+ Operand uimm_s, Operand uimm_d> {
+ defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"),
+ uimm_b, asmop>;
+
+ defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"),
+ uimm_h, asmop>;
+
+ defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"),
+ uimm_s, asmop>;
+
+ defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "1D_operand"),
+ uimm_d, asmop>;
+
+ defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"),
+ uimm_b, asmop>;
+
+ defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"),
+ uimm_h, asmop>;
+
+ defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"),
+ uimm_s, asmop>;
+
+ defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"),
+ uimm_d, asmop>;
+}
+
+// Post-index load single 1-element structure to all lanes of 1 register
+defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
+ uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
+ uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
+ uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
+ uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
+ Constraints = "$Rn = $wb, $Rt = $src",
+ DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+ class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+ (outs VList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt,
+ VList:$src, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $amt",
[],
- NoItinerary>;
- def hhh : NeonI_Scalar3Same<u, 0b01, opcode,
- (outs FPR16:$Rd), (ins FPR16:$Rn, FPR16:$Rm),
- !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+ (outs VList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
+ VList:$src, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $Rm",
[],
NoItinerary>;
- def sss : NeonI_Scalar3Same<u, 0b10, opcode,
- (outs FPR32:$Rd), (ins FPR32:$Rn, FPR32:$Rm),
- !strconcat(asmop, " $Rd, $Rn, $Rm"),
+}
+
+multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+ Operand uimm_b, Operand uimm_h,
+ Operand uimm_s, Operand uimm_d> {
+ def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+
+ def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Post-index load single 1-element structure to one lane of 1 register.
+defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
+ uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to one lane of N consecutive
+// registers (N = 2,3,4)
+defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+ uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+ uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+ uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayStore = 1, neverHasSideEffects = 1,
+ hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
+ DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+ class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+ (outs GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt,
+ VList:$Rt, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+ (outs GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
+ ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $Rm",
[],
NoItinerary>;
- def ddd : NeonI_Scalar3Same<u, 0b11, opcode,
- (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm),
- !strconcat(asmop, " $Rd, $Rn, $Rm"),
- [],
- NoItinerary>;
+}
+
+multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+ Operand uimm_b, Operand uimm_h,
+ Operand uimm_s, Operand uimm_d> {
+ def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _B_register : STN_WBReg_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _H_register : STN_WBReg_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _S_register : STN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+
+ def _D_register : STN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
}
}
-class Neon_Scalar_D_size_patterns<SDPatternOperator opnode, Instruction INSTD>
- : Pat<(v1i64 (opnode (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
- (SUBREG_TO_REG (i64 0),
- (INSTD (EXTRACT_SUBREG VPR64:$Rn, sub_64),
- (EXTRACT_SUBREG VPR64:$Rm, sub_64)),
- sub_64)>;
+// Post-index store single 1-element structure from one lane of 1 register.
+defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
+ uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+ uimm_exact4, uimm_exact8, uimm_exact16>;
+defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+ uimm_exact6, uimm_exact12, uimm_exact24>;
+defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+ uimm_exact8, uimm_exact16, uimm_exact32>;
+
+// End of post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+// Neon Scalar instructions implementation
+// Scalar Three Same
+
+class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop,
+ RegisterClass FPRC>
+ : NeonI_Scalar3Same<u, size, opcode,
+ (outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+
+class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
+
+multiclass NeonI_Scalar3Same_HS_sizes<bit u, bits<5> opcode, string asmop,
+ bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
+ def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
+ }
+}
+
+multiclass NeonI_Scalar3Same_SD_sizes<bit u, bit size_high, bits<5> opcode,
+ string asmop, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def sss : NeonI_Scalar3Same_size<u, {size_high, 0b0}, opcode, asmop, FPR32>;
+ def ddd : NeonI_Scalar3Same_size<u, {size_high, 0b1}, opcode, asmop, FPR64>;
+ }
+}
+
+multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
+ string asmop, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def bbb : NeonI_Scalar3Same_size<u, 0b00, opcode, asmop, FPR8>;
+ def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
+ def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
+ def ddd : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
+ }
+}
+
+multiclass Neon_Scalar3Same_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD> {
+ def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTB,
+ Instruction INSTH,
+ Instruction INSTS,
+ Instruction INSTD>
+ : Neon_Scalar3Same_D_size_patterns<opnode, INSTD> {
+ def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
+ (INSTB FPR8:$Rn, FPR8:$Rm)>;
+
+ def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (INSTH FPR16:$Rn, FPR16:$Rm)>;
+ def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+class Neon_Scalar3Same_cmp_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
+ Instruction INSTH,
+ Instruction INSTS> {
+ def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (INSTH FPR16:$Rn, FPR16:$Rm)>;
+ def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+ (INSTS FPR32:$Rn, FPR32:$Rm)>;
+ def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_cmp_SD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+ (INSTS FPR32:$Rn, FPR32:$Rm)>;
+ def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class Neon_Scalar3Same_cmp_V1_D_size_patterns<CondCode CC,
+ Instruction INSTD>
+ : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+// Scalar Three Different
+
+class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop,
+ RegisterClass FPRCD, RegisterClass FPRCS>
+ : NeonI_Scalar3Diff<u, size, opcode,
+ (outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+
+multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> {
+ def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>;
+ def dss : NeonI_Scalar3Diff_size<u, 0b10, opcode, asmop, FPR64, FPR32>;
+}
+
+multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> {
+ let Constraints = "$Src = $Rd" in {
+ def shh : NeonI_Scalar3Diff<u, 0b01, opcode,
+ (outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+ def dss : NeonI_Scalar3Diff<u, 0b10, opcode,
+ (outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+ }
+}
+
+multiclass Neon_Scalar3Diff_HS_size_patterns<SDPatternOperator opnode,
+ Instruction INSTH,
+ Instruction INSTS> {
+ def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (INSTH FPR16:$Rn, FPR16:$Rm)>;
+ def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass Neon_Scalar3Diff_ml_HS_size_patterns<SDPatternOperator opnode,
+ Instruction INSTH,
+ Instruction INSTS> {
+ def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+ (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>;
+ def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+ (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>;
+}
+
+// Scalar Two Registers Miscellaneous
+
+class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asmop,
+ RegisterClass FPRCD, RegisterClass FPRCS>
+ : NeonI_Scalar2SameMisc<u, size, opcode,
+ (outs FPRCD:$Rd), (ins FPRCS:$Rn),
+ !strconcat(asmop, "\t$Rd, $Rn"),
+ [],
+ NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode,
+ string asmop> {
+ def ss : NeonI_Scalar2SameMisc_size<u, {size_high, 0b0}, opcode, asmop, FPR32,
+ FPR32>;
+ def dd : NeonI_Scalar2SameMisc_size<u, {size_high, 0b1}, opcode, asmop, FPR64,
+ FPR64>;
+}
+
+multiclass NeonI_Scalar2SameMisc_D_size<bit u, bits<5> opcode, string asmop> {
+ def dd : NeonI_Scalar2SameMisc_size<u, 0b11, opcode, asmop, FPR64, FPR64>;
+}
+
+multiclass NeonI_Scalar2SameMisc_BHSD_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_Scalar2SameMisc_D_size<u, opcode, asmop> {
+ def bb : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR8>;
+ def hh : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR16>;
+ def ss : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR32>;
+}
+
+class NeonI_Scalar2SameMisc_fcvtxn_D_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR32, FPR64>;
+
+multiclass NeonI_Scalar2SameMisc_narrow_HSD_size<bit u, bits<5> opcode,
+ string asmop> {
+ def bh : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR16>;
+ def hs : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR32>;
+ def sd : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR64>;
+}
+
+class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode,
+ string asmop, RegisterClass FPRC>
+ : NeonI_Scalar2SameMisc<u, size, opcode,
+ (outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn),
+ !strconcat(asmop, "\t$Rd, $Rn"),
+ [],
+ NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
+ string asmop> {
+
+ let Constraints = "$Src = $Rd" in {
+ def bb : NeonI_Scalar2SameMisc_accum_size<u, 0b00, opcode, asmop, FPR8>;
+ def hh : NeonI_Scalar2SameMisc_accum_size<u, 0b01, opcode, asmop, FPR16>;
+ def ss : NeonI_Scalar2SameMisc_accum_size<u, 0b10, opcode, asmop, FPR32>;
+ def dd : NeonI_Scalar2SameMisc_accum_size<u, 0b11, opcode, asmop, FPR64>;
+ }
+}
+
+class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+
+multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))),
+ (INSTS FPR32:$Rn)>;
+ def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator Sopnode,
+ SDPatternOperator Dopnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))),
+ (INSTS FPR32:$Rn)>;
+ def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))),
+ (INSTS FPR32:$Rn)>;
+ def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+}
+
+class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_Scalar2SameMisc<u, 0b11, opcode,
+ (outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+ [],
+ NoItinerary>;
+
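+// The floating-point compare-against-zero forms below take an fpz32 zero
+// operand, matching the "compare ... to zero" variants of the instructions.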
+multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode,
+ string asmop> {
+ def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode,
+ (outs FPR32:$Rd), (ins FPR32:$Rn, fpz32:$FPImm),
+ !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
+ [],
+ NoItinerary>;
+ def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode,
+ (outs FPR64:$Rd), (ins FPR64:$Rn, fpz32:$FPImm),
+ !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
+ [],
+ NoItinerary>;
+}
+
+class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+ (v1i64 (bitconvert (v8i8 Neon_AllZero))))),
+ (INSTD FPR64:$Rn, 0)>;
+
+class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC,
+ Instruction INSTD>
+ : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn),
+ (i32 neon_uimm0:$Imm), CC)),
+ (INSTD FPR64:$Rn, neon_uimm0:$Imm)>;
+
+multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn),
+ (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
+ (INSTS FPR32:$Rn, fpz32:$FPImm)>;
+ def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn),
+ (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
+ (INSTD FPR64:$Rn, fpz32:$FPImm)>;
+}
+
+multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD> {
+ def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_BHSD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTB,
+ Instruction INSTH,
+ Instruction INSTS,
+ Instruction INSTD>
+ : Neon_Scalar2SameMisc_D_size_patterns<opnode, INSTD> {
+ def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))),
+ (INSTB FPR8:$Rn)>;
+ def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))),
+ (INSTH FPR16:$Rn)>;
+ def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))),
+ (INSTS FPR32:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns<
+ SDPatternOperator opnode,
+ Instruction INSTH,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))),
+ (INSTH FPR16:$Rn)>;
+ def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))),
+ (INSTS FPR32:$Rn)>;
+ def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))),
+ (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns<
+ SDPatternOperator opnode,
+ Instruction INSTB,
+ Instruction INSTH,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))),
+ (INSTB FPR8:$Src, FPR8:$Rn)>;
+ def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))),
+ (INSTH FPR16:$Src, FPR16:$Rn)>;
+ def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))),
+ (INSTS FPR32:$Src, FPR32:$Rn)>;
+ def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))),
+ (INSTD FPR64:$Src, FPR64:$Rn)>;
+}
+
+// Scalar Shift By Immediate
+
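+// In the shift-by-immediate classes below, the element size is selected by the
+// position of the leading one in the immh field (Inst{22-19}); the remaining
+// immh:immb bits carry the encoded shift amount, and the shr_imm*/shl_imm*
+// operands restrict the accepted immediate range for each element size.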
+class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop,
+ RegisterClass FPRC, Operand ImmTy>
+ : NeonI_ScalarShiftImm<u, opcode,
+ (outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+ [], NoItinerary>;
+
+multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode,
+ string asmop> {
+ def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
+ bits<6> Imm;
+ let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+ let Inst{21-16} = Imm;
+ }
+}
+
+multiclass NeonI_ScalarShiftRightImm_BHSD_size<bit u, bits<5> opcode,
+ string asmop>
+ : NeonI_ScalarShiftRightImm_D_size<u, opcode, asmop> {
+ def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shr_imm8> {
+ bits<3> Imm;
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ let Inst{18-16} = Imm;
+ }
+ def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shr_imm16> {
+ bits<4> Imm;
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ let Inst{19-16} = Imm;
+ }
+ def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
+ bits<5> Imm;
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ let Inst{20-16} = Imm;
+ }
+}
+
+multiclass NeonI_ScalarShiftLeftImm_D_size<bit u, bits<5> opcode,
+ string asmop> {
+ def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shl_imm64> {
+ bits<6> Imm;
+ let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+ let Inst{21-16} = Imm;
+ }
+}
+
+multiclass NeonI_ScalarShiftLeftImm_BHSD_size<bit u, bits<5> opcode,
+ string asmop>
+ : NeonI_ScalarShiftLeftImm_D_size<u, opcode, asmop> {
+ def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shl_imm8> {
+ bits<3> Imm;
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ let Inst{18-16} = Imm;
+ }
+ def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shl_imm16> {
+ bits<4> Imm;
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ let Inst{19-16} = Imm;
+ }
+ def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shl_imm32> {
+ bits<5> Imm;
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ let Inst{20-16} = Imm;
+ }
+}
+
+class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_ScalarShiftImm<u, opcode,
+ (outs FPR64:$Rd),
+ (ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+ [], NoItinerary> {
+ bits<6> Imm;
+ let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+ let Inst{21-16} = Imm;
+ let Constraints = "$Src = $Rd";
+}
+
+class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_ScalarShiftImm<u, opcode,
+ (outs FPR64:$Rd),
+ (ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+ [], NoItinerary> {
+ bits<6> Imm;
+ let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+ let Inst{21-16} = Imm;
+ let Constraints = "$Src = $Rd";
+}
+
+class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop,
+ RegisterClass FPRCD, RegisterClass FPRCS,
+ Operand ImmTy>
+ : NeonI_ScalarShiftImm<u, opcode,
+ (outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm),
+ !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+ [], NoItinerary>;
+
+multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode,
+ string asmop> {
+ def bhi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR8, FPR16,
+ shr_imm8> {
+ bits<3> Imm;
+ let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+ let Inst{18-16} = Imm;
+ }
+ def hsi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR16, FPR32,
+ shr_imm16> {
+ bits<4> Imm;
+ let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+ let Inst{19-16} = Imm;
+ }
+ def sdi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR32, FPR64,
+ shr_imm32> {
+ bits<5> Imm;
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ let Inst{20-16} = Imm;
+ }
+}
+
+multiclass NeonI_ScalarShiftImm_cvt_SD_size<bit u, bits<5> opcode, string asmop> {
+ def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
+ bits<5> Imm;
+ let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+ let Inst{20-16} = Imm;
+ }
+ def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
+ bits<6> Imm;
+ let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+ let Inst{21-16} = Imm;
+ }
+}
+
+multiclass Neon_ScalarShiftRImm_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD> {
+ def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD> {
+ def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+class Neon_ScalarShiftImm_arm_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+ (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+
+multiclass Neon_ScalarShiftLImm_BHSD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTB,
+ Instruction INSTH,
+ Instruction INSTS,
+ Instruction INSTD>
+ : Neon_ScalarShiftLImm_D_size_patterns<opnode, INSTD> {
+ def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))),
+ (INSTB FPR8:$Rn, imm:$Imm)>;
+ def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))),
+ (INSTH FPR16:$Rn, imm:$Imm)>;
+ def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))),
+ (INSTS FPR32:$Rn, imm:$Imm)>;
+}
+
+class Neon_ScalarShiftLImm_accum_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
+ (i32 shl_imm64:$Imm))),
+ (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftRImm_accum_D_size_patterns<SDPatternOperator opnode,
+ Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
+ (i32 shr_imm64:$Imm))),
+ (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
+
+multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns<
+ SDPatternOperator opnode,
+ Instruction INSTH,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))),
+ (INSTH FPR16:$Rn, imm:$Imm)>;
+ def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+ (INSTS FPR32:$Rn, imm:$Imm)>;
+ def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator Sopnode,
+ SDPatternOperator Dopnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+ (INSTS FPR32:$Rn, imm:$Imm)>;
+ def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator Sopnode,
+ SDPatternOperator Dopnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+ (INSTS FPR32:$Rn, imm:$Imm)>;
+ def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+// Scalar Signed Shift Right (Immediate)
+defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<sra, SSHRddi>;
+
+// Scalar Unsigned Shift Right (Immediate)
+defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<srl, USHRddi>;
+
+// Scalar Signed Rounding Shift Right (Immediate)
+defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vsrshr, SRSHRddi>;
+
+// Scalar Unsigned Rounding Shift Right (Immediate)
+defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vurshr, URSHRddi>;
+
+// Scalar Signed Shift Right and Accumulate (Immediate)
+def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+ <int_aarch64_neon_vsrads_n, SSRA>;
+
+// Scalar Unsigned Shift Right and Accumulate (Immediate)
+def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+ <int_aarch64_neon_vsradu_n, USRA>;
+
+// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+ <int_aarch64_neon_vrsrads_n, SRSRA>;
+
+// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+ <int_aarch64_neon_vrsradu_n, URSRA>;
+
+// Scalar Shift Left (Immediate)
+defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
+defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<shl, SHLddi>;
+
+// Signed Saturating Shift Left (Immediate)
+defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshls_n,
+ SQSHLbbi, SQSHLhhi,
+ SQSHLssi, SQSHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_sqrshlImm, SQSHLddi>;
+
+// Unsigned Saturating Shift Left (Immediate)
+defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshlu_n,
+ UQSHLbbi, UQSHLhhi,
+ UQSHLssi, UQSHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_uqrshlImm, UQSHLddi>;
+
+// Signed Saturating Shift Left Unsigned (Immediate)
+defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vsqshlu,
+ SQSHLUbbi, SQSHLUhhi,
+ SQSHLUssi, SQSHLUddi>;
+
+// Shift Right And Insert (Immediate)
+def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+ <int_aarch64_neon_vsri, SRI>;
+
+// Shift Left And Insert (Immediate)
+def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">;
+def : Neon_ScalarShiftLImm_accum_D_size_patterns
+ <int_aarch64_neon_vsli, SLI>;
+
+// Signed Saturating Shift Right Narrow (Immediate)
+defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrn,
+ SQSHRNbhi, SQSHRNhsi,
+ SQSHRNsdi>;
+
+// Unsigned Saturating Shift Right Narrow (Immediate)
+defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqshrn,
+ UQSHRNbhi, UQSHRNhsi,
+ UQSHRNsdi>;
+
+// Signed Saturating Rounded Shift Right Narrow (Immediate)
+defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrn,
+ SQRSHRNbhi, SQRSHRNhsi,
+ SQRSHRNsdi>;
+
+// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
+defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqrshrn,
+ UQRSHRNbhi, UQRSHRNhsi,
+ UQRSHRNsdi>;
+
+// Signed Saturating Shift Right Unsigned Narrow (Immediate)
+defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrun,
+ SQSHRUNbhi, SQSHRUNhsi,
+ SQSHRUNsdi>;
+
+// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
+defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun,
+ SQRSHRUNbhi, SQRSHRUNhsi,
+ SQRSHRUNsdi>;
+
+// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">;
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_s32,
+ int_aarch64_neon_vcvtf64_n_s64,
+ SCVTF_Nssi, SCVTF_Nddi>;
+
+// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">;
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_u32,
+ int_aarch64_neon_vcvtf64_n_u64,
+ UCVTF_Nssi, UCVTF_Nddi>;
+
+// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">;
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_s32_f32,
+ int_aarch64_neon_vcvtd_n_s64_f64,
+ FCVTZS_Nssi, FCVTZS_Nddi>;
+
+// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">;
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_u32_f32,
+ int_aarch64_neon_vcvtd_n_u64_f64,
+ FCVTZU_Nssi, FCVTZU_Nddi>;
+
+// Patterns For Convert Instructions Between v1f64 and v1i64
+class Neon_ScalarShiftImm_cvtf_v1f64_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ (INST FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftImm_fcvt_v1f64_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+ (INST FPR64:$Rn, imm:$Imm)>;
+
+def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxs2fp,
+ SCVTF_Nddi>;
+
+def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxu2fp,
+ UCVTF_Nddi>;
+
+def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxs,
+ FCVTZS_Nddi>;
+
+def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxu,
+ FCVTZU_Nddi>;
// Scalar Integer Add
let isCommutable = 1 in {
@@ -1461,9 +4791,15 @@ def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">;
// Scalar Integer Sub
def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">;
-// Pattern for Scalar Integer Add and Sub with D register
-def : Neon_Scalar_D_size_patterns<add, ADDddd>;
-def : Neon_Scalar_D_size_patterns<sub, SUBddd>;
+// Pattern for Scalar Integer Add and Sub with D register only
+defm : Neon_Scalar3Same_D_size_patterns<add, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<sub, SUBddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vaddds, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vadddu, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubds, SUBddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubdu, SUBddd>;
// Scalar Integer Saturating Add (Signed, Unsigned)
defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>;
@@ -1473,41 +4809,1212 @@ defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>;
defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>;
defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>;
-// Patterns for Scalar Integer Saturating Add, Sub with D register only
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqadds, SQADDddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqaddu, UQADDddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubs, SQSUBddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubu, UQSUBddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Add, Sub (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqadds, SQADDbbb,
+ SQADDhhh, SQADDsss, SQADDddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqaddu, UQADDbbb,
+ UQADDhhh, UQADDsss, UQADDddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubs, SQSUBbbb,
+ SQSUBhhh, SQSUBsss, SQSUBddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb,
+ UQSUBhhh, UQSUBsss, UQSUBddd>;
+
+// Scalar Integer Saturating Doubling Multiply Half High
+defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>;
+
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Doubling Multiply Half High and
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh,
+ SQDMULHsss>;
+defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh,
+ SQRDMULHsss>;
+
+// Scalar Floating-point Multiply Extended
+defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>;
+
+// Scalar Floating-point Reciprocal Step
+defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
+
+// Scalar Floating-point Reciprocal Square Root Step
+defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Floating-point Reciprocal Step and
+// Scalar Floating-point Reciprocal Square Root Step
+defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrecps, FRECPSsss,
+ FRECPSddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrsqrts, FRSQRTSsss,
+ FRSQRTSddd>;
+
+def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Floating-point Multiply Extended
+multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+ (INSTS FPR32:$Rn, FPR32:$Rm)>;
+ def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx,
+                                              FMULXsss, FMULXddd>;
// Scalar Integer Shift Left (Signed, Unsigned)
def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshlds, SSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshldu, USHLddd>;
+
// Scalar Integer Saturating Shift Left (Signed, Unsigned)
defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
-// Scalar Integer Rouding Shift Left (Signed, Unsigned)
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshls, SQSHLbbb,
+ SQSHLhhh, SQSHLsss, SQSHLddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshlu, UQSHLbbb,
+ UQSHLhhh, UQSHLsss, UQSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
+
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshlds, SRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshldu, URSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
+
// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
-// Patterns for Scalar Integer Shift Lef, Saturating Shift Left,
-// Rounding Shift Left, Rounding Saturating Shift Left with D register only
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
-def : Neon_Scalar_D_size_patterns<shl, SSHLddd>;
-def : Neon_Scalar_D_size_patterns<shl, USHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
-def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshls, SQRSHLbbb,
+ SQRSHLhhh, SQRSHLsss, SQRSHLddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb,
+ UQRSHLhhh, UQRSHLsss, UQRSHLddd>;
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+
+// Signed Saturating Doubling Multiply-Add Long
+defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
+defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal,
+ SQDMLALshh, SQDMLALdss>;
+
+// Signed Saturating Doubling Multiply-Subtract Long
+defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
+defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl,
+ SQDMLSLshh, SQDMLSLdss>;
+
+// Signed Saturating Doubling Multiply Long
+defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
+defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull,
+ SQDMULLshh, SQDMULLdss>;
+
+// Scalar Signed Integer Convert To Floating-point
+defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_s32,
+ int_aarch64_neon_vcvtf64_s64,
+ SCVTFss, SCVTFdd>;
+
+// Scalar Unsigned Integer Convert To Floating-point
+defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_u32,
+ int_aarch64_neon_vcvtf64_u64,
+ UCVTFss, UCVTFdd>;
+
+// Scalar Floating-point Converts
+def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">;
+def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn,
+ FCVTXN>;
+
+defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns,
+ FCVTNSss, FCVTNSdd>;
+
+defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu,
+ FCVTNUss, FCVTNUdd>;
+
+defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms,
+ FCVTMSss, FCVTMSdd>;
+
+defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu,
+ FCVTMUss, FCVTMUdd>;
+
+defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas,
+ FCVTASss, FCVTASdd>;
+
+defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau,
+ FCVTAUss, FCVTAUdd>;
+
+defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps,
+ FCVTPSss, FCVTPSdd>;
+
+defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu,
+ FCVTPUss, FCVTPUdd>;
+
+defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs,
+ FCVTZSss, FCVTZSdd>;
+
+defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu,
+ FCVTZUss, FCVTZUdd>;
+
+// Patterns For Convert Instructions Between v1f64 and v1i64
+class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<sint_to_fp, SCVTFdd>;
+def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<uint_to_fp, UCVTFdd>;
+
+def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_sint, FCVTZSdd>;
+def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>;
+
+// Scalar Floating-point Reciprocal Estimate
+defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrecpe,
+ FRECPEss, FRECPEdd>;
+
+// Scalar Floating-point Reciprocal Exponent
+defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx,
+ FRECPXss, FRECPXdd>;
+
+// Scalar Floating-point Reciprocal Square Root Estimate
+defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrsqrte,
+ FRSQRTEss, FRSQRTEdd>;
+
+// Scalar Floating-point Round
+class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_ScalarFloatRound_pattern<fceil, FRINTPdd>;
+def : Neon_ScalarFloatRound_pattern<ffloor, FRINTMdd>;
+def : Neon_ScalarFloatRound_pattern<ftrunc, FRINTZdd>;
+def : Neon_ScalarFloatRound_pattern<frint, FRINTXdd>;
+def : Neon_ScalarFloatRound_pattern<fnearbyint, FRINTIdd>;
+def : Neon_ScalarFloatRound_pattern<frnd, FRINTAdd>;
+def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>;
+
+// Scalar Integer Compare
+
+// Scalar Compare Bitwise Equal
+def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
+
+class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
+ Instruction INSTD,
+ CondCode CC>
+ : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
+ (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>;
+
+// Scalar Compare Signed Greater Than Or Equal
+def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>;
+
+// Scalar Compare Unsigned Higher Or Same
+def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>;
+
+// Scalar Compare Unsigned Higher
+def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>;
+
+// Scalar Compare Signed Greater Than
+def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>;
+
+// Scalar Compare Bitwise Test Bits
+def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
+def : Neon_Scalar3Same_cmp_D_size_patterns<Neon_tst, CMTSTddd>;
+
+// Scalar Compare Bitwise Equal To Zero
+def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vceq,
+ CMEQddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETEQ, CMEQddi>;
+
+// Scalar Compare Signed Greater Than Or Equal To Zero
+def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcge,
+ CMGEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGE, CMGEddi>;
+
+// Scalar Compare Signed Greater Than Zero
+def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcgt,
+ CMGTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGT, CMGTddi>;
+
+// Scalar Compare Signed Less Than Or Equal To Zero
+def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vclez,
+ CMLEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLE, CMLEddi>;
+
+// Scalar Compare Less Than Zero
+def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcltz,
+ CMLTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
+
+// Scalar Floating-point Compare
+
+// Scalar Floating-point Compare Mask Equal
+defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vceq,
+ FCMEQsss, FCMEQddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
+
+// Scalar Floating-point Compare Mask Equal To Zero
+defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vceq,
+ FCMEQZssi, FCMEQZddi>;
+def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpz32:$FPImm), SETEQ)),
+ (FCMEQZddi FPR64:$Rn, fpz32:$FPImm)>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcge,
+ FCMGEsss, FCMGEddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcge,
+ FCMGEZssi, FCMGEZddi>;
+
+// Scalar Floating-point Compare Mask Greater Than
+defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcgt,
+ FCMGTsss, FCMGTddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
+
+// Scalar Floating-point Compare Mask Greater Than Zero
+defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcgt,
+ FCMGTZssi, FCMGTZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vclez,
+ FCMLEZssi, FCMLEZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Zero
+defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcltz,
+ FCMLTZssi, FCMLTZddi>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcage,
+ FACGEsss, FACGEddd>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than
+defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcagt,
+ FACGTsss, FACGTddd>;
+
+// Scalar Floating-point Absolute Difference
+defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd,
+ FABDsss, FABDddd>;
+
+// Scalar Absolute Value
+defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
+defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vabs, ABSdd>;
+
+// Scalar Signed Saturating Absolute Value
+defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqabs,
+ SQABSbb, SQABShh, SQABSss, SQABSdd>;
+
+// Scalar Negate
+defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">;
+defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vneg, NEGdd>;
+
+// Scalar Signed Saturating Negate
+defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqneg,
+ SQNEGbb, SQNEGhh, SQNEGss, SQNEGdd>;
+
+// Scalar Signed Saturating Accumulated of Unsigned Value
+defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vuqadd,
+ SUQADDbb, SUQADDhh,
+ SUQADDss, SUQADDdd>;
+
+// Scalar Unsigned Saturating Accumulated of Signed Value
+defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vsqadd,
+ USQADDbb, USQADDhh,
+ USQADDss, USQADDdd>;
+
+def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src),
+ (v1i64 FPR64:$Rn))),
+ (SUQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src),
+ (v1i64 FPR64:$Rn))),
+ (USQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))),
+ (ABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))),
+ (SQABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))),
+ (SQNEGdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))),
+ (v1i64 FPR64:$Rn))),
+ (NEGdd FPR64:$Rn)>;
+
+// Scalar Signed Saturating Extract Unsigned Narrow
+defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnsu,
+ SQXTUNbh, SQXTUNhs,
+ SQXTUNsd>;
+
+// Scalar Signed Saturating Extract Narrow
+defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovns,
+ SQXTNbh, SQXTNhs,
+ SQXTNsd>;
+
+// Scalar Unsigned Saturating Extract Narrow
+defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnu,
+ UQXTNbh, UQXTNhs,
+ UQXTNsd>;
+
+// Scalar Reduce Pairwise
+
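+// The pairwise reductions below consume a whole vector register (VPR64 or
+// VPR128) and produce a single scalar result in an FPR (e.g. the .2d form
+// reduces a 128-bit vector into a D register).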
+multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode,
+ string asmop, bit Commutable = 0> {
+ let isCommutable = Commutable in {
+ def _D_2D : NeonI_ScalarPair<u, {size, 0b1}, opcode,
+ (outs FPR64:$Rd), (ins VPR128:$Rn),
+ !strconcat(asmop, "\t$Rd, $Rn.2d"),
+ [],
+ NoItinerary>;
+ }
+}
+
+multiclass NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode,
+ string asmop, bit Commutable = 0>
+ : NeonI_ScalarPair_D_sizes<u, size, opcode, asmop, Commutable> {
+ let isCommutable = Commutable in {
+ def _S_2S : NeonI_ScalarPair<u, {size, 0b0}, opcode,
+ (outs FPR32:$Rd), (ins VPR64:$Rn),
+ !strconcat(asmop, "\t$Rd, $Rn.2s"),
+ [],
+ NoItinerary>;
+ }
+}
+
+// Scalar Reduce Addition Pairwise (Integer) with
+// Pattern to match llvm.arm.* intrinsic
+defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>;
+
+// Pattern to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Addition Pairwise (Integer)
+def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))),
+ (ADDPvv_D_2D VPR128:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))),
+ (ADDPvv_D_2D VPR128:$Rn)>;
+
+// Scalar Reduce Addition Pairwise (Floating Point)
+defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>;
+
+// Scalar Reduce Maximum Pairwise (Floating Point)
+defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>;
+
+// Scalar Reduce Minimum Pairwise (Floating Point)
+defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
+
+// Scalar Reduce maxNum Pairwise (Floating Point)
+defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
+
+// Scalar Reduce minNum Pairwise (Floating Point)
+defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
+
+multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnodeS,
+ SDPatternOperator opnodeD,
+ Instruction INSTS,
+ Instruction INSTD> {
+ def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))),
+ (INSTS VPR64:$Rn)>;
+ def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))),
+ (INSTD VPR128:$Rn)>;
+}
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Add, Max, Min, MaxNum, MinNum Pairwise (Floating Point)
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd,
+ int_aarch64_neon_vpfaddq, FADDPvv_S_2S, FADDPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax,
+ int_aarch64_neon_vpmaxq, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin,
+ int_aarch64_neon_vpminq, FMINPvv_S_2S, FMINPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
+ int_aarch64_neon_vpfmaxnmq, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
+ int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vaddv,
+ int_aarch64_neon_vaddv, FADDPvv_S_2S, FADDPvv_D_2D>;
+
+def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))),
+ (FADDPvv_S_2S (v2f32
+ (EXTRACT_SUBREG
+ (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
+ sub_64)))>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxv,
+ int_aarch64_neon_vmaxv, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminv,
+ int_aarch64_neon_vminv, FMINPvv_S_2S, FMINPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxnmv,
+ int_aarch64_neon_vmaxnmv, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminnmv,
+ int_aarch64_neon_vminnmv, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+
+// Scalar by element Arithmetic
+
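+// The by-element forms encode the lane index in the h (Inst{11}), l (Inst{21})
+// and m (Inst{20}) bits; each definition below fills in only the bits its
+// element size needs and places the Rm register in the remaining low bits.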
+class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
+ string rmlane, bit u, bit szhi, bit szlo,
+ RegisterClass ResFPR, RegisterClass OpFPR,
+ RegisterOperand OpVPR, Operand OpImm>
+ : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+ (outs ResFPR:$Rd),
+ (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+ asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+ [],
+ NoItinerary> {
+ bits<3> Imm;
+ bits<5> MRm;
+}
+
+class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode,
+ string rmlane,
+ bit u, bit szhi, bit szlo,
+ RegisterClass ResFPR,
+ RegisterClass OpFPR,
+ RegisterOperand OpVPR,
+ Operand OpImm>
+ : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+ (outs ResFPR:$Rd),
+ (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+ asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+ [],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+ bits<3> Imm;
+ bits<5> MRm;
+}
+
+// Scalar Floating Point multiply (scalar, by element)
+def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
+ 0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
+ 0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+ let Inst{11} = Imm{0}; // h
+ let Inst{21} = 0b0; // l
+ let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point multiply extended (scalar, by element)
+def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
+ 0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
+ 0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+ let Inst{11} = Imm{0}; // h
+ let Inst{21} = 0b0; // l
+ let Inst{20-16} = MRm;
+}
+
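+// Besides the 128-bit vector case, the patterns below also handle a 64-bit
+// vector operand by widening it with SUBREG_TO_REG before using the
+// 128-bit-indexed instruction, and they accept both operand orders.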
+multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
+ SDPatternOperator opnode,
+ Instruction INST,
+ ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+ ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+
+ def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+ (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
+ (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+ (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
+ (ResTy (INST (ResTy FPRC:$Rn),
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+ OpNImm:$Imm))>;
+
+ // swapped operands
+ def : Pat<(ResTy (opnode
+ (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+ (ResTy FPRC:$Rn))),
+ (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (opnode
+ (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+ (ResTy FPRC:$Rn))),
+ (ResTy (INST (ResTy FPRC:$Rn),
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+ OpNImm:$Imm))>;
+}
+
+// Patterns for Scalar Floating Point multiply (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULssv_4S,
+ f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULddv_2D,
+ f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Patterns for Scalar Floating Point multiply extended (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
+ FMULXssv_4S, f32, FPR32, v4f32, neon_uimm2_bare,
+ v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
+ FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare,
+ v1f64, v2f64, neon_uimm0_bare>;
+
+// Scalar Floating Point fused multiply-add (scalar, by element)
+def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+ 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+ 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+ let Inst{11} = Imm{0}; // h
+ let Inst{21} = 0b0; // l
+ let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point fused multiply-subtract (scalar, by element)
+def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+ 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+ 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+ let Inst{11} = Imm{0}; // h
+ let Inst{21} = 0b0; // l
+ let Inst{20-16} = MRm;
+}
+// We are allowed to match the fma instruction regardless of compile options.
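+// The same multiclass also provides the fmls patterns by matching fma with a
+// negated (fneg) multiplicand, in either operand order.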
+multiclass Neon_ScalarXIndexedElem_FMA_Patterns<
+ Instruction FMLAI, Instruction FMLSI,
+ ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+ ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+ // fmla
+ def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+ (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLAI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+ (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLAI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn),
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+ OpNImm:$Imm))>;
+
+ // swapped fmla operands
+ def : Pat<(ResTy (fma
+ (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+ (ResTy FPRC:$Rn),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLAI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (fma
+ (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+ (ResTy FPRC:$Rn),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLAI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn),
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+ OpNImm:$Imm))>;
+
+ // fmls
+ def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+ (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLSI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+ (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLSI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn),
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+ OpNImm:$Imm))>;
+
+ // swapped fmls operands
+ def : Pat<(ResTy (fma
+ (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
+ (ResTy FPRC:$Rn),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLSI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (fma
+ (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
+ (ResTy FPRC:$Rn),
+ (ResTy FPRC:$Ra))),
+ (ResTy (FMLSI (ResTy FPRC:$Ra),
+ (ResTy FPRC:$Rn),
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+ OpNImm:$Imm))>;
+}
+
+// Scalar Floating Point fused multiply-add and
+// multiply-subtract (scalar, by element)
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAssv_4S, FMLSssv_4S,
+ f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
+ f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Scalar Signed saturating doubling multiply long (scalar, by element)
+def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+ 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+ 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+ let Inst{11} = Imm{2}; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+ 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+ 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+
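+// sqdmull takes its second operand as a lane extracted from a vector and
+// rebuilt with scalar_to_vector, so the patterns below match that shape in
+// either operand order and fold it into the by-element instruction.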
+multiclass Neon_ScalarXIndexedElem_MUL_Patterns<
+ SDPatternOperator opnode,
+ Instruction INST,
+ ValueType ResTy, RegisterClass FPRC,
+ ValueType OpVTy, ValueType OpTy,
+ ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
+
+ def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
+ (OpVTy (scalar_to_vector
+ (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))),
+ (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+
+ //swapped operands
+ def : Pat<(ResTy (opnode
+ (OpVTy (scalar_to_vector
+ (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
+ (OpVTy FPRC:$Rn))),
+ (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+}
+
+// Patterns for Scalar Signed saturating doubling
+// multiply long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+ SQDMULLshv_4H, v1i32, FPR16, v1i16, i16, v4i16,
+ i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+ SQDMULLshv_8H, v1i32, FPR16, v1i16, i16, v8i16,
+ i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+ SQDMULLdsv_2S, v1i64, FPR32, v1i32, i32, v2i32,
+ i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+ SQDMULLdsv_4S, v1i64, FPR32, v1i32, i32, v4i32,
+ i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Signed saturating doubling multiply-add long (scalar, by element)
+def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+ 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+ 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+ let Inst{11} = Imm{2}; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+ 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+ 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling
+// multiply-subtract long (scalar, by element)
+def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+ 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+ 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+ let Inst{11} = Imm{2}; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+ 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+ 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+
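+// The multiply-accumulate-long patterns match a saturating add/subtract
+// (opnode) wrapped around the core doubling multiply (coreopnode) and fold
+// the pair into a single by-element sqdmlal/sqdmlsl instruction.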
+multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
+ SDPatternOperator opnode,
+ SDPatternOperator coreopnode,
+ Instruction INST,
+ ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC,
+ ValueType OpTy,
+ ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
+
+ def : Pat<(ResTy (opnode
+ (ResTy ResFPRC:$Ra),
+ (ResTy (coreopnode (OpTy FPRC:$Rn),
+ (OpTy (scalar_to_vector
+ (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))),
+ (ResTy (INST (ResTy ResFPRC:$Ra),
+ (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+
+ // swapped operands
+ def : Pat<(ResTy (opnode
+ (ResTy ResFPRC:$Ra),
+ (ResTy (coreopnode
+ (OpTy (scalar_to_vector
+ (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))),
+ (OpTy FPRC:$Rn))))),
+ (ResTy (INST (ResTy ResFPRC:$Ra),
+ (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+}
+
+// Patterns for Scalar Signed saturating
+// doubling multiply-add long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+ int_arm_neon_vqdmull, SQDMLALshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
+ i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+ int_arm_neon_vqdmull, SQDMLALshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
+ i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+ int_arm_neon_vqdmull, SQDMLALdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
+ i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+ int_arm_neon_vqdmull, SQDMLALdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
+ i32, VPR128Lo, neon_uimm2_bare>;
+
+// Patterns for Scalar Signed saturating
+// doubling multiply-sub long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+ int_arm_neon_vqdmull, SQDMLSLshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
+ i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+ int_arm_neon_vqdmull, SQDMLSLshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
+ i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+ int_arm_neon_vqdmull, SQDMLSLdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
+ i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+ int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
+ i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar general arithmetic operation
+class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+ (INST FPR64:$Rn, FPR64:$Rm)>;
+
+class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
+ Instruction INST>
+ : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
+ (v1f64 FPR64:$Ra))),
+ (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
+
+def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
+def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
+
+def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
+def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
+
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+ 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+ 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+ let Inst{11} = Imm{2}; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+ 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+ 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+
+// Patterns for Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+ SQDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16,
+ i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+ SQDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16,
+ i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+ SQDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32,
+ i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+ SQDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32,
+ i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Signed saturating rounding doubling multiply
+// returning high half (scalar, by element)
+def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+ 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+ 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+ let Inst{11} = Imm{2}; // h
+ let Inst{21} = Imm{1}; // l
+ let Inst{20} = Imm{0}; // m
+ let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+ 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+ let Inst{11} = 0b0; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+ 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{11} = Imm{1}; // h
+ let Inst{21} = Imm{0}; // l
+ let Inst{20-16} = MRm;
+}
+
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+ SQRDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, i32,
+ VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+ SQRDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, i32,
+ VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+ SQRDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, i32,
+ VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+ SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32,
+ VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Copy - DUP element to scalar
+class NeonI_Scalar_DUP<string asmop, string asmlane,
+ RegisterClass ResRC, RegisterOperand VPRC,
+ Operand OpImm>
+ : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
+ asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
+ [],
+ NoItinerary> {
+ bits<4> Imm;
+}
+
+def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> {
+ let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
+ let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+multiclass NeonI_Scalar_DUP_Elt_pattern<Instruction DUPI, ValueType ResTy,
+ ValueType OpTy, Operand OpImm,
+ ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+ def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+ (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+ (ResTy (DUPI
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ OpNImm:$Imm))>;
+}
+
+// Patterns for vector extract of FP data using scalar DUP instructions
+defm : NeonI_Scalar_DUP_Elt_pattern<DUPsv_S, f32,
+ v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Elt_pattern<DUPdv_D, f64,
+ v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI,
+ ValueType ResTy, ValueType OpTy,Operand OpLImm,
+ ValueType NOpTy, ValueType ExTy, Operand OpNImm> {
+
+ def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)),
+ (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>;
+
+ def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)),
+ (ResTy (DUPI
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ OpNImm:$Imm))>;
+}
+
+// Patterns for extract subvectors of v1ix data using scalar DUP instructions.
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPbv_B, v1i8, v16i8, neon_uimm4_bare,
+ v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPhv_H, v1i16, v8i16, neon_uimm3_bare,
+ v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPsv_S, v1i32, v4i32, neon_uimm2_bare,
+ v2i32, v4i32, neon_uimm1_bare>;
+
+multiclass NeonI_Scalar_DUP_Copy_pattern1<Instruction DUPI, ValueType ResTy,
+ ValueType OpTy, ValueType ElemTy,
+ Operand OpImm, ValueType OpNTy,
+ ValueType ExTy, Operand OpNImm> {
+
+ def : Pat<(ResTy (vector_insert (ResTy undef),
+ (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+ (neon_uimm0_bare:$Imm))),
+ (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (vector_insert (ResTy undef),
+ (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+ (OpNImm:$Imm))),
+ (ResTy (DUPI
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ OpNImm:$Imm))>;
+}
+
+multiclass NeonI_Scalar_DUP_Copy_pattern2<Instruction DUPI, ValueType ResTy,
+ ValueType OpTy, ValueType ElemTy,
+ Operand OpImm, ValueType OpNTy,
+ ValueType ExTy, Operand OpNImm> {
+
+ def : Pat<(ResTy (scalar_to_vector
+ (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))),
+ (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+ def : Pat<(ResTy (scalar_to_vector
+ (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))),
+ (ResTy (DUPI
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ OpNImm:$Imm))>;
+}
+
+// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP
+// instructions.
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
+ v1i64, v2i64, i64, neon_uimm1_bare,
+ v1i64, v2i64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
+ v1i32, v4i32, i32, neon_uimm2_bare,
+ v2i32, v4i32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H,
+ v1i16, v8i16, i32, neon_uimm3_bare,
+ v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B,
+ v1i8, v16i8, i32, neon_uimm4_bare,
+ v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
+ v1f64, v2f64, f64, neon_uimm1_bare,
+ v1f64, v2f64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
+ v1f32, v4f32, f32, neon_uimm2_bare,
+ v2f32, v4f32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
+ v1i64, v2i64, i64, neon_uimm1_bare,
+ v1i64, v2i64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
+ v1i32, v4i32, i32, neon_uimm2_bare,
+ v2i32, v4i32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H,
+ v1i16, v8i16, i32, neon_uimm3_bare,
+ v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B,
+ v1i8, v16i8, i32, neon_uimm4_bare,
+ v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
+ v1f64, v2f64, f64, neon_uimm1_bare,
+ v1f64, v2f64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
+ v1f32, v4f32, f32, neon_uimm2_bare,
+ v2f32, v4f32, neon_uimm1_bare>;
+
+multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane,
+ Instruction DUPI, Operand OpImm,
+ RegisterClass ResRC> {
+ def : NeonInstAlias<!strconcat(asmop, "$Rd, $Rn" # asmlane # "[$Imm]"),
+ (DUPI ResRC:$Rd, VPR128:$Rn, OpImm:$Imm), 0b0>;
+}
+
+// Aliases for Scalar copy - DUP element (scalar)
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>;
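+// For example, "mov b0, v2.b[7]" should be accepted and assemble to the same
+// encoding as "dup b0, v2.b[7]"; the trailing 0b0 on the alias keeps the
+// printer emitting the "dup" spelling for now.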
+
+multiclass NeonI_SDUP<PatFrag GetLow, PatFrag GetHigh, ValueType ResTy,
+ ValueType OpTy> {
+ def : Pat<(ResTy (GetLow VPR128:$Rn)),
+ (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>;
+ def : Pat<(ResTy (GetHigh VPR128:$Rn)),
+ (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>;
+}
+
+defm : NeonI_SDUP<Neon_Low16B, Neon_High16B, v8i8, v16i8>;
+defm : NeonI_SDUP<Neon_Low8H, Neon_High8H, v4i16, v8i16>;
+defm : NeonI_SDUP<Neon_Low4S, Neon_High4S, v2i32, v4i32>;
+defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>;
+defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>;
+defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>;
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -1578,57 +6085,2587 @@ def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
-
// ...and scalar bitcasts...
+def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>;
+def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f32 (bitconvert (v1f32 FPR32:$src))), (f32 FPR32:$src)>;
+def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
+
+def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>;
+
+def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>;
+
+def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
+
+def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>;
+
+def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>;
+
+def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>;
+def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f32 (bitconvert (f32 FPR32:$src))), (v1f32 FPR32:$src)>;
+def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>;
+
+def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>;
+
+def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+// Scalar Three Same
+
+def neon_uimm3 : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 8;}]> {
+ let ParserMatchClass = uimm3_asmoperand;
+ let PrintMethod = "printUImmHexOperand";
+}
+
+def neon_uimm4 : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 16;}]> {
+ let ParserMatchClass = uimm4_asmoperand;
+ let PrintMethod = "printUImmHexOperand";
+}
+
+// Bitwise Extract
+class NeonI_Extract<bit q, bits<2> op2, string asmop,
+ string OpS, RegisterOperand OpVPR, Operand OpImm>
+ : NeonI_BitExtract<q, op2, (outs OpVPR:$Rd),
+ (ins OpVPR:$Rn, OpVPR:$Rm, OpImm:$Index),
+ asmop # "\t$Rd." # OpS # ", $Rn." # OpS #
+ ", $Rm." # OpS # ", $Index",
+ [],
+ NoItinerary>{
+ bits<4> Index;
+}
+
+def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b",
+ VPR64, neon_uimm3> {
+ let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}};
+}
+
+def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b",
+ VPR128, neon_uimm4> {
+ let Inst{14-11} = Index;
+}
+
+class NI_Extract<ValueType OpTy, RegisterOperand OpVPR, Instruction INST,
+ Operand OpImm>
+ : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm),
+ (i64 OpImm:$Imm))),
+ (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>;
+
+def : NI_Extract<v8i8, VPR64, EXTvvvi_8b, neon_uimm3>;
+def : NI_Extract<v4i16, VPR64, EXTvvvi_8b, neon_uimm3>;
+def : NI_Extract<v2i32, VPR64, EXTvvvi_8b, neon_uimm3>;
+def : NI_Extract<v1i64, VPR64, EXTvvvi_8b, neon_uimm3>;
+def : NI_Extract<v2f32, VPR64, EXTvvvi_8b, neon_uimm3>;
+def : NI_Extract<v1f64, VPR64, EXTvvvi_8b, neon_uimm3>;
+def : NI_Extract<v16i8, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v8i16, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v4i32, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>;
+
+// Table lookup
+class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+ string asmop, string OpS, RegisterOperand OpVPR,
+ RegisterOperand VecList>
+ : NeonI_TBL<q, op2, len, op,
+ (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+ [],
+ NoItinerary>;
+
+// The vectors in the lookup table are always 16b
+multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
+ def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
+ !cast<RegisterOperand>(List # "16B_operand")>;
+
+ def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
+ !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
+defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
+defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
+defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
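+// For example, "tbl v0.8b, { v1.16b, v2.16b }, v3.8b" (TBL2_8b) uses each
+// byte of v3 as an index into the 32-byte table formed by v1:v2; indices
+// beyond the end of the table produce a zero result byte.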
+
+// Table lookup extension
+class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
+ string asmop, string OpS, RegisterOperand OpVPR,
+ RegisterOperand VecList>
+ : NeonI_TBL<q, op2, len, op,
+ (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
+ asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+ [],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+// The vectors in the lookup table are always 16b
+multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> {
+ def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
+ !cast<RegisterOperand>(List # "16B_operand")>;
+
+ def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
+ !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
+defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
+defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
+defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
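+// TBX differs from TBL only in that out-of-range indices leave the
+// corresponding destination byte unchanged, which is why $src is tied to
+// $Rd above.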
+
+class NeonI_INS_main<string asmop, string Res, ValueType ResTy,
+ RegisterClass OpGPR, ValueType OpTy, Operand OpImm>
+ : NeonI_copy<0b1, 0b0, 0b0011,
+ (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm),
+ asmop # "\t$Rd." # Res # "[$Imm], $Rn",
+ [(set (ResTy VPR128:$Rd),
+ (ResTy (vector_insert
+ (ResTy VPR128:$src),
+ (OpTy OpGPR:$Rn),
+ (OpImm:$Imm))))],
+ NoItinerary> {
+ bits<4> Imm;
+ let Constraints = "$src = $Rd";
+}
+
+// Insert element (vector, from main)
+def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32,
+ neon_uimm4_bare> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32,
+ neon_uimm3_bare> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32,
+ neon_uimm2_bare> {
+ let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64,
+ neon_uimm1_bare> {
+ let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn",
+ (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn",
+ (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn",
+ (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn",
+ (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>;
+
+class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy,
+ RegisterClass OpGPR, ValueType OpTy,
+ Operand OpImm, Instruction INS>
+ : Pat<(ResTy (vector_insert
+ (ResTy VPR64:$src),
+ (OpTy OpGPR:$Rn),
+ (OpImm:$Imm))),
+ (ResTy (EXTRACT_SUBREG
+ (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+ OpGPR:$Rn, OpImm:$Imm)), sub_64))>;
+
+def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32,
+ neon_uimm3_bare, INSbw>;
+def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32,
+ neon_uimm2_bare, INShw>;
+def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32,
+ neon_uimm1_bare, INSsw>;
+def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64,
+ neon_uimm0_bare, INSdx>;
+
+class NeonI_INS_element<string asmop, string Res, Operand ResImm>
+ : NeonI_insert<0b1, 0b1,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn,
+ ResImm:$Immd, ResImm:$Immn),
+ asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]",
+ [],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+ bits<4> Immd;
+ bits<4> Immn;
+}
+
+// Insert element (vector, from element)
+def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> {
+ let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1};
+ let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}};
+}
+def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> {
+ let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0};
+ let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0};
+ // bit 11 is unspecified, but should be set to zero.
+}
+def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> {
+ let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0};
+ let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0};
+ // bits 11-12 are unspecified, but should be set to zero.
+}
+def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> {
+ let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0};
+ let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0};
+ // bits 11-13 are unspecified, but should be set to zero.
+}
+
+def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]",
+ (INSELb VPR128:$Rd, VPR128:$Rn,
+ neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]",
+ (INSELh VPR128:$Rd, VPR128:$Rn,
+ neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]",
+ (INSELs VPR128:$Rd, VPR128:$Rn,
+ neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]",
+ (INSELd VPR128:$Rd, VPR128:$Rn,
+ neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>;
+
+multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy,
+ ValueType MidTy, Operand StImm, Operand NaImm,
+ Instruction INS> {
+def : Pat<(ResTy (vector_insert
+ (ResTy VPR128:$src),
+ (MidTy (vector_extract
+ (ResTy VPR128:$Rn),
+ (StImm:$Immn))),
+ (StImm:$Immd))),
+ (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
+ StImm:$Immd, StImm:$Immn)>;
+
+def : Pat <(ResTy (vector_insert
+ (ResTy VPR128:$src),
+ (MidTy (vector_extract
+ (NaTy VPR64:$Rn),
+ (NaImm:$Immn))),
+ (StImm:$Immd))),
+ (INS (ResTy VPR128:$src),
+ (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
+ StImm:$Immd, NaImm:$Immn)>;
+
+def : Pat <(NaTy (vector_insert
+ (NaTy VPR64:$src),
+ (MidTy (vector_extract
+ (ResTy VPR128:$Rn),
+ (StImm:$Immn))),
+ (NaImm:$Immd))),
+ (NaTy (EXTRACT_SUBREG
+ (ResTy (INS
+ (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+ (ResTy VPR128:$Rn),
+ NaImm:$Immd, StImm:$Immn)),
+ sub_64))>;
+
+def : Pat <(NaTy (vector_insert
+ (NaTy VPR64:$src),
+ (MidTy (vector_extract
+ (NaTy VPR64:$Rn),
+ (NaImm:$Immn))),
+ (NaImm:$Immd))),
+ (NaTy (EXTRACT_SUBREG
+ (ResTy (INS
+ (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+ (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
+ NaImm:$Immd, NaImm:$Immn)),
+ sub_64))>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare,
+ neon_uimm1_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare,
+ neon_uimm0_bare, INSELd>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+ neon_uimm3_bare, INSELb>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+ neon_uimm2_bare, INSELh>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+ neon_uimm1_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare,
+ neon_uimm0_bare, INSELd>;
+
+multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
+ ValueType MidTy,
+ RegisterClass OpFPR, Operand ResImm,
+ SubRegIndex SubIndex, Instruction INS> {
+def : Pat <(ResTy (vector_insert
+ (ResTy VPR128:$src),
+ (MidTy OpFPR:$Rn),
+ (ResImm:$Imm))),
+ (INS (ResTy VPR128:$src),
+ (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
+ ResImm:$Imm,
+ (i64 0))>;
+
+def : Pat <(NaTy (vector_insert
+ (NaTy VPR64:$src),
+ (MidTy OpFPR:$Rn),
+ (ResImm:$Imm))),
+ (NaTy (EXTRACT_SUBREG
+ (ResTy (INS
+ (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+ (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
+ ResImm:$Imm,
+ (i64 0))),
+ sub_64))>;
+}
+
+defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
+ sub_32, INSELs>;
+defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
+ sub_64, INSELd>;
+
+class NeonI_SMOV<string asmop, string Res, bit Q,
+ ValueType OpTy, ValueType eleTy,
+ Operand OpImm, RegisterClass ResGPR, ValueType ResTy>
+ : NeonI_copy<Q, 0b0, 0b0101,
+ (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
+ asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
+ [(set (ResTy ResGPR:$Rd),
+ (ResTy (sext_inreg
+ (ResTy (vector_extract
+ (OpTy VPR128:$Rn), (OpImm:$Imm))),
+ eleTy)))],
+ NoItinerary> {
+ bits<4> Imm;
+}
+
+// Signed integer move (main, from element)
+def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare,
+ GPR32, i32> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare,
+ GPR32, i32> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare,
+ GPR64, i64> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare,
+ GPR64, i64> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare,
+ GPR64, i64> {
+ let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy,
+ ValueType eleTy, Operand StImm, Operand NaImm,
+ Instruction SMOVI> {
+ def : Pat<(i64 (sext_inreg
+ (i64 (anyext
+ (i32 (vector_extract
+ (StTy VPR128:$Rn), (StImm:$Imm))))),
+ eleTy)),
+ (SMOVI VPR128:$Rn, StImm:$Imm)>;
+
+ def : Pat<(i64 (sext
+ (i32 (vector_extract
+ (StTy VPR128:$Rn), (StImm:$Imm))))),
+ (SMOVI VPR128:$Rn, StImm:$Imm)>;
+
+ def : Pat<(i64 (sext_inreg
+ (i64 (vector_extract
+ (NaTy VPR64:$Rn), (NaImm:$Imm))),
+ eleTy)),
+ (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ NaImm:$Imm)>;
+
+ def : Pat<(i64 (sext_inreg
+ (i64 (anyext
+ (i32 (vector_extract
+ (NaTy VPR64:$Rn), (NaImm:$Imm))))),
+ eleTy)),
+ (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ NaImm:$Imm)>;
+
+ def : Pat<(i64 (sext
+ (i32 (vector_extract
+ (NaTy VPR64:$Rn), (NaImm:$Imm))))),
+ (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ NaImm:$Imm)>;
+}
+
+defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+ neon_uimm3_bare, SMOVxb>;
+defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+ neon_uimm2_bare, SMOVxh>;
+defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+ neon_uimm1_bare, SMOVxs>;
+
+class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
+ ValueType eleTy, Operand StImm, Operand NaImm,
+ Instruction SMOVI>
+ : Pat<(i32 (sext_inreg
+ (i32 (vector_extract
+ (NaTy VPR64:$Rn), (NaImm:$Imm))),
+ eleTy)),
+ (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ NaImm:$Imm)>;
+
+def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+ neon_uimm3_bare, SMOVwb>;
+def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+ neon_uimm2_bare, SMOVwh>;
+
+class NeonI_UMOV<string asmop, string Res, bit Q,
+ ValueType OpTy, Operand OpImm,
+ RegisterClass ResGPR, ValueType ResTy>
+ : NeonI_copy<Q, 0b0, 0b0111,
+ (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
+ asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
+ [(set (ResTy ResGPR:$Rd),
+ (ResTy (vector_extract
+ (OpTy VPR128:$Rn), (OpImm:$Imm))))],
+ NoItinerary> {
+ bits<4> Imm;
+}
+
+// Unsigned integer move (main, from element)
+def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare,
+ GPR32, i32> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare,
+ GPR32, i32> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare,
+ GPR32, i32> {
+ let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare,
+ GPR64, i64> {
+ let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]",
+ (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]",
+ (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>;
+
+class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy,
+ Operand StImm, Operand NaImm,
+ Instruction SMOVI>
+ : Pat<(ResTy (vector_extract
+ (NaTy VPR64:$Rn), NaImm:$Imm)),
+ (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ NaImm:$Imm)>;
+
+def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+ neon_uimm3_bare, UMOVwb>;
+def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+ neon_uimm2_bare, UMOVwh>;
+def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+ neon_uimm1_bare, UMOVws>;
+
+def : Pat<(i32 (and
+ (i32 (vector_extract
+ (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))),
+ 255)),
+ (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>;
+
+def : Pat<(i32 (and
+ (i32 (vector_extract
+ (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))),
+ 65535)),
+ (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>;
+
+def : Pat<(i64 (zext
+ (i32 (vector_extract
+ (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))),
+ (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>;
+
+def : Pat<(i32 (and
+ (i32 (vector_extract
+ (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))),
+ 255)),
+ (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+ neon_uimm3_bare:$Imm)>;
+
+def : Pat<(i32 (and
+ (i32 (vector_extract
+ (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))),
+ 65535)),
+ (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+ neon_uimm2_bare:$Imm)>;
+
+def : Pat<(i64 (zext
+ (i32 (vector_extract
+ (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))),
+ (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+ neon_uimm0_bare:$Imm)>;
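+// A umov of a B or H lane already zero-extends the element into the W
+// register, so the explicit "and" masks above fold away, and the zext of a
+// D-lane extract is just the plain umov into an X register.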
+
+// Additional copy patterns for scalar types
+def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))),
+ (UMOVwb (v16i8
+ (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>;
+
+def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))),
+ (UMOVwh (v8i16
+ (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>;
+
+def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))),
+ (FMOVws FPR32:$Rn)>;
+
+def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))),
+ (FMOVxd FPR64:$Rn)>;
+
+def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))),
+ (f64 FPR64:$Rn)>;
+
+def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))),
+ (f32 FPR32:$Rn)>;
+
+def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
+ (v1i8 (EXTRACT_SUBREG (v16i8
+ (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
+ sub_8))>;
+
+def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)),
+ (v1i16 (EXTRACT_SUBREG (v8i16
+ (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
+ sub_16))>;
+
+def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
+ (FMOVsw $src)>;
+
+def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
+ (FMOVdx $src)>;
+
+def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
+ (v1f32 FPR32:$Rn)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
+ (v1f64 FPR64:$Rn)>;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+ (FMOVdd $src)>;
+
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
+ (f64 FPR64:$src), sub_64)>;
+
+class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane,
+ RegisterOperand ResVPR, Operand OpImm>
+ : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
+ (ins VPR128:$Rn, OpImm:$Imm),
+ asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
+ [],
+ NoItinerary> {
+ bits<4> Imm;
+}
+
+def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128,
+ neon_uimm4_bare> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128,
+ neon_uimm3_bare> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128,
+ neon_uimm2_bare> {
+ let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128,
+ neon_uimm1_bare> {
+ let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64,
+ neon_uimm4_bare> {
+ let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64,
+ neon_uimm3_bare> {
+ let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64,
+ neon_uimm2_bare> {
+ let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
+ ValueType OpTy,ValueType NaTy,
+ ValueType ExTy, Operand OpLImm,
+ Operand OpNImm> {
+def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
+ (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
+
+def : Pat<(ResTy (Neon_vduplane
+ (NaTy VPR64:$Rn), OpNImm:$Imm)),
+ (ResTy (DUPELT
+ (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
+}
+defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8,
+ neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8,
+ neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16,
+ neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16,
+ neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32,
+ neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32,
+ neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64,
+ neon_uimm1_bare, neon_uimm0_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32,
+ neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32,
+ neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64,
+ neon_uimm1_bare, neon_uimm0_bare>;
+
+def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
+ (v2f32 (DUPELT2s
+ (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (i64 0)))>;
+def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
+ (v4f32 (DUPELT4s
+ (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+ (i64 0)))>;
+def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
+ (v2f64 (DUPELT2d
+ (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
+ (i64 0)))>;
+
+class NeonI_DUP<bit Q, string asmop, string rdlane,
+ RegisterOperand ResVPR, ValueType ResTy,
+ RegisterClass OpGPR, ValueType OpTy>
+ : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
+ asmop # "\t$Rd" # rdlane # ", $Rn",
+ [(set (ResTy ResVPR:$Rd),
+ (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
+ NoItinerary>;
+
+def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
+ let Inst{20-16} = 0b00001;
+ // bits 17-20 are unspecified, but should be set to zero.
+}
+
+def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
+ let Inst{20-16} = 0b00010;
+ // bits 18-20 are unspecified, but should be set to zero.
+}
+
+def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
+ let Inst{20-16} = 0b00100;
+ // bits 19-20 are unspecified, but should be set to zero.
+}
+
+def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
+ let Inst{20-16} = 0b01000;
+ // bit 20 is unspecified, but should be set to zero.
+}
+
+def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
+ let Inst{20-16} = 0b00001;
+ // bits 17-20 are unspecified, but should be set to zero.
+}
+
+def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
+ let Inst{20-16} = 0b00010;
+ // bits 18-20 are unspecified, but should be set to zero.
+}
+
+def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
+ let Inst{20-16} = 0b00100;
+ // bits 19-20 are unspecified, but should be set to zero.
+}
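+// For example, "dup v0.4s, w1" (DUP4s) replicates the value of w1 into all
+// four 32-bit lanes of v0.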
+
+// Patterns for CONCAT_VECTORS
+multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
+ (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
+ (INSELd
+ (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
+ (i64 1),
+ (i64 0))>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
+ (DUPELT2d
+ (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ (i64 0))> ;
+}
+
+defm : Concat_Vector_Pattern<v16i8, v8i8>;
+defm : Concat_Vector_Pattern<v8i16, v4i16>;
+defm : Concat_Vector_Pattern<v4i32, v2i32>;
+defm : Concat_Vector_Pattern<v2i64, v1i64>;
+defm : Concat_Vector_Pattern<v4f32, v2f32>;
+defm : Concat_Vector_Pattern<v2f64, v1f64>;
+
+// Patterns for EXTRACT_SUBVECTOR
+def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
+ (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
+ (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
+ (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
+ (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
+ (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
+ (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
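+// Taking the low half of a 128-bit vector needs no instruction; it is
+// modelled as a plain sub_64 subregister extraction.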
+
+// The following definitions are for the instruction class (3V Elem)
+
+// Variant 1
+
+class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS, string EleOpS,
+ Operand OpImm, RegisterOperand ResVPR,
+ RegisterOperand OpVPR, RegisterOperand EleOpVPR>
+ : NeonI_2VElem<q, u, size, opcode,
+ (outs ResVPR:$Rd), (ins ResVPR:$src, OpVPR:$Rn,
+ EleOpVPR:$Re, OpImm:$Index),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
+ ", $Re." # EleOpS # "[$Index]",
+ [],
+ NoItinerary> {
+ bits<3> Index;
+ bits<5> Re;
+
+ let Constraints = "$src = $Rd";
+}
+
+multiclass NI_2VE_v1<bit u, bits<4> opcode, string asmop> {
+ // vector register class for element is always 128-bit to cover the max index
+ def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+ neon_uimm2_bare, VPR64, VPR64, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+ neon_uimm2_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ // Index operations on 16-bit (H) elements are restricted to using v0-v15:
+ // the M bit (Inst{20}) is taken by the index, leaving only Re{3-0} to
+ // encode the element register.
+ def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
+ neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+
+ def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
+ neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+}
+
+defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">;
+defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VE_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand ResVPR, RegisterOperand OpVPR,
+ RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
+ ValueType EleOpTy>
+ : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
+ (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VE_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand ResVPR, RegisterOperand OpVPR,
+ RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
+ ValueType EleOpTy>
+ : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
+ (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST ResVPR:$src, OpVPR:$Rn,
+ (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+multiclass NI_2VE_v1_pat<string subop, SDPatternOperator op>
+{
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+ op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>;
+
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+ op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>;
+
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
+ op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
+
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
+ op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VE_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+ op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>;
+
+ def : NI_2VE_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
+ op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
+}
+
+defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>;
+defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>;
+
+class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode,
+ string asmop, string ResS, string OpS, string EleOpS,
+ Operand OpImm, RegisterOperand ResVPR,
+ RegisterOperand OpVPR, RegisterOperand EleOpVPR>
+ : NeonI_2VElem<q, u, size, opcode,
+ (outs ResVPR:$Rd), (ins OpVPR:$Rn,
+ EleOpVPR:$Re, OpImm:$Index),
+ asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
+ ", $Re." # EleOpS # "[$Index]",
+ [],
+ NoItinerary> {
+ bits<3> Index;
+ bits<5> Re;
+}
+
+multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> {
+ // vector register class for element is always 128-bit to cover the max index
+ def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+ neon_uimm2_bare, VPR64, VPR64, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+ neon_uimm2_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+ def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
+ neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+
+ def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
+ neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+}
+
+defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">;
+defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">;
+defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+ ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
+ : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+ (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VE_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+ ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
+ : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+ (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST OpVPR:$Rn,
+ (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+multiclass NI_2VE_mul_v1_pat<string subop, SDPatternOperator op> {
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+ op, VPR64, VPR128, v2i32, v2i32, v4i32>;
+
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+ op, VPR128, VPR128, v4i32, v4i32, v4i32>;
+
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
+ op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
+
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
+ op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+ op, VPR64, VPR64, v2i32, v2i32, v2i32>;
+
+ def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
+ op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
+}
+
+defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>;
+defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>;
+defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>;
+
+// Variant 2
+
+multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> {
+ // vector register class for element is always 128-bit to cover the max index
+ def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+ neon_uimm2_bare, VPR64, VPR64, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+ neon_uimm2_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ // _1d2d doesn't exist!
+
+ def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
+ neon_uimm1_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{0}};
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Re;
+ }
+}
+
+defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">;
+defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">;
+
+class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+ ValueType ResTy, ValueType OpTy, ValueType EleOpTy,
+ SDPatternOperator coreop>
+ : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+ (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))),
+ (INST OpVPR:$Rn,
+ (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>;
+
+multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> {
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+ op, VPR64, VPR128, v2f32, v2f32, v4f32>;
+
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+ op, VPR128, VPR128, v4f32, v4f32, v4f32>;
+
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
+ op, VPR128, VPR128, v2f64, v2f64, v2f64>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+ op, VPR64, VPR64, v2f32, v2f32, v2f32>;
+
+ def : NI_2VE_mul_lane_2d<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
+ op, VPR128, VPR64, v2f64, v2f64, v1f64,
+ BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
+}
+
+defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>;
+defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>;
+
+def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))),
+ (v2f32 VPR64:$Rn))),
+ (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))),
+ (v4f32 VPR128:$Rn))),
+ (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))),
+ (v2f64 VPR128:$Rn))),
+ (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>;
+
+// The following patterns use fma; -ffp-contract=fast generates fma.
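+// For example, with -ffp-contract=fast a source expression like a + b*c[2]
+// can reach instruction selection as an ISD::FMA whose multiplicand is a
+// duplicated lane, and the patterns below turn it into fmla (by element).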
+
+multiclass NI_2VE_v2<bit u, bits<4> opcode, string asmop> {
+ // vector register class for element is always 128-bit to cover the max index
+ def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+ neon_uimm2_bare, VPR64, VPR64, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+ neon_uimm2_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ // _1d2d doesn't exist!
+
+ def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
+ neon_uimm1_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{0}};
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Re;
+ }
+}
+
+defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">;
+defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand ResVPR, RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy,
+ SDPatternOperator coreop>
+ : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
+ (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))),
+ (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane 0
+class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op,
+ RegisterOperand ResVPR, ValueType ResTy>
+ : Pat<(ResTy (op (ResTy ResVPR:$Rn),
+ (ResTy (Neon_vdup (f32 FPR32:$Re))),
+ (ResTy ResVPR:$src))),
+ (INST ResVPR:$src, ResVPR:$Rn,
+ (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand ResVPR, RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy,
+ SDPatternOperator coreop>
+ : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
+ (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
+ (INST ResVPR:$src, ResVPR:$Rn,
+ (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEswap_lane_2d2d<Instruction INST, Operand OpImm,
+ SDPatternOperator op,
+ RegisterOperand ResVPR, RegisterOperand OpVPR,
+ ValueType ResTy, ValueType OpTy,
+ SDPatternOperator coreop>
+ : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))),
+ (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
+ (INST ResVPR:$src, ResVPR:$Rn,
+ (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>;
+
+
+multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> {
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+ neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+ BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+ def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"),
+ op, VPR64, v2f32>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+ neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+ BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+ def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"),
+ op, VPR128, v4f32>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+ neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+ BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+ neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+ BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+ def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+ neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+ BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
+}
+
+defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>;
+
+// Pattern for lane 0
+class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op,
+ RegisterOperand ResVPR, ValueType ResTy>
+ : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)),
+ (ResTy (Neon_vdup (f32 FPR32:$Re))),
+ (ResTy ResVPR:$src))),
+ (INST ResVPR:$src, ResVPR:$Rn,
+ (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op>
+{
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+ neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+ BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+ neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+ BinOpFrag<(Neon_vduplane
+ (fneg node:$LHS), node:$RHS)>>;
+
+ def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"),
+ op, VPR64, v2f32>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+ neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+ BinOpFrag<(fneg (Neon_vduplane
+ node:$LHS, node:$RHS))>>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+ neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+ BinOpFrag<(Neon_vduplane
+ (fneg node:$LHS), node:$RHS)>>;
+
+ def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"),
+ op, VPR128, v4f32>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+ neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+ BinOpFrag<(fneg (Neon_vduplane
+ node:$LHS, node:$RHS))>>;
+
+ def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+ neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+ BinOpFrag<(Neon_vduplane
+ (fneg node:$LHS), node:$RHS)>>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+ neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+ BinOpFrag<(fneg (Neon_vduplane
+ node:$LHS, node:$RHS))>>;
+
+ def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+ neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+ BinOpFrag<(Neon_vduplane
+ (fneg node:$LHS), node:$RHS)>>;
+
+ def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
+ neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
+ BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
+
+ def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
+ neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
+ BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>;
+
+ def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+ neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+ BinOpFrag<(fneg (Neon_combine_2d
+ node:$LHS, node:$RHS))>>;
+
+ def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+ neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+ BinOpFrag<(Neon_combine_2d
+ (fneg node:$LHS), (fneg node:$RHS))>>;
+}
+
+defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>;
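+// fmls by element is matched as fma with one multiplicand negated; the fneg
+// may sit either on the duplicated lane or on the element before it is
+// duplicated, so the multiclass above lists a pattern for each form.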
+
+// Variant 3: Long type
+// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S
+// SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S
+
+multiclass NI_2VE_v3<bit u, bits<4> opcode, string asmop> {
+ // vector register class for element is always 128-bit to cover the max index
+ def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
+ neon_uimm2_bare, VPR128, VPR64, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
+ neon_uimm2_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+ def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
+ neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+
+ def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
+ neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+}
+
+defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">;
+defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">;
+defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">;
+defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">;
+defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">;
+defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">;
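+
+// For illustration, these instantiations correspond to by-element assembly
+// forms such as smlal v0.4s, v1.4h, v3.h[2] and smlal2 v0.2d, v1.4s, v2.s[3],
+// with the H-element register limited to v0-v15 as noted above.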
+
+multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> {
+ // vector register class for element is always 128-bit to cover the max index
+ def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
+ neon_uimm2_bare, VPR128, VPR64, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
+ neon_uimm2_bare, VPR128, VPR128, VPR128> {
+ let Inst{11} = {Index{1}};
+ let Inst{21} = {Index{0}};
+ let Inst{20-16} = Re;
+ }
+
+ // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+ def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
+ neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+
+ def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
+ neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
+ let Inst{11} = {Index{2}};
+ let Inst{21} = {Index{1}};
+ let Inst{20} = {Index{0}};
+ let Inst{19-16} = Re{3-0};
+ }
+}
+
+defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">;
+defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">;
+defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+ (FMOVdd $src)>;
+def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))),
+ (FMOVss $src)>;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand EleOpVPR, ValueType ResTy,
+ ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+ SDPatternOperator hiop>
+ : Pat<(ResTy (op (ResTy VPR128:$src),
+ (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+ (HalfOpTy (Neon_vduplane
+ (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand EleOpVPR, ValueType ResTy,
+ ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+ SDPatternOperator hiop>
+ : Pat<(ResTy (op (ResTy VPR128:$src),
+ (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+ (HalfOpTy (Neon_vduplane
+ (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST VPR128:$src, VPR128:$Rn,
+ (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op,
+ ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
+ SDPatternOperator hiop, Instruction DupInst>
+ : Pat<(ResTy (op (ResTy VPR128:$src),
+ (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+ (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
+ (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>;
+
+multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> {
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+ op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
+
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+ op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>;
+
+ def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+ op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+ def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+ op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+ def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
+ op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+ def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
+ op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+ op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
+
+ def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+ op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>;
+
+ def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+ op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+ def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+ op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>;
+defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>;
+defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>;
+defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEL2_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand EleOpVPR, ValueType ResTy,
+ ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+ SDPatternOperator hiop>
+ : Pat<(ResTy (op
+ (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+ (HalfOpTy (Neon_vduplane
+ (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+ RegisterOperand EleOpVPR, ValueType ResTy,
+ ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+ SDPatternOperator hiop>
+ : Pat<(ResTy (op
+ (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+ (HalfOpTy (Neon_vduplane
+ (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+ (INST VPR128:$Rn,
+ (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+// Pattern for fixed lane 0
+class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op,
+ ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
+ SDPatternOperator hiop, Instruction DupInst>
+ : Pat<(ResTy (op
+ (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+ (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
+ (INST VPR128:$Rn, (DupInst $Re), 0)>;
+
+multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> {
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+ op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
+
+ def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+ op, VPR64, VPR128, v2i64, v2i32, v4i32>;
+
+ def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+ op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+ def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+ op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+ def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_4s8h"),
+ op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+ def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"),
+ op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+ op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
+
+ def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+ op, VPR64, VPR64, v2i64, v2i32, v2i32>;
+
+ def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+ op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+ def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+ op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>;
+defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>;
+defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>;
+
+multiclass NI_qdma<SDPatternOperator op> {
+ def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (op node:$Ra,
+ (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
+
+ def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (op node:$Ra,
+ (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
+}
+
+defm Neon_qdmlal : NI_qdma<int_arm_neon_vqadds>;
+defm Neon_qdmlsl : NI_qdma<int_arm_neon_vqsubs>;
+
+multiclass NI_2VEL_v3_qdma_pat<string subop, string op> {
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+ !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR128Lo,
+ v4i32, v4i16, v8i16>;
+
+ def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+ !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR128,
+ v2i64, v2i32, v4i32>;
+
+ def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+ !cast<PatFrag>(op # "_4s"), VPR128Lo,
+ v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+ def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+ !cast<PatFrag>(op # "_2d"), VPR128,
+ v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+ def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
+ !cast<PatFrag>(op # "_4s"),
+ v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+ def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
+ !cast<PatFrag>(op # "_2d"),
+ v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+ // Index can only be half of the max value for lane in 64-bit vector
+
+ def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+ !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR64Lo,
+ v4i32, v4i16, v4i16>;
+
+ def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+ !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR64,
+ v2i64, v2i32, v2i32>;
+
+ def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+ !cast<PatFrag>(op # "_4s"), VPR64Lo,
+ v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+ def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+ !cast<PatFrag>(op # "_2d"), VPR64,
+ v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">;
+defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">;
+
+// End of implementation for instruction class (3V Elem)
+
+class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U,
+ bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy,
+ SDPatternOperator Neon_Rev>
+ : NeonI_2VMisc<Q, U, size, opcode,
+ (outs ResVPR:$Rd), (ins ResVPR:$Rn),
+ asmop # "\t$Rd." # Res # ", $Rn." # Res,
+ [(set (ResTy ResVPR:$Rd),
+ (ResTy (Neon_Rev (ResTy ResVPR:$Rn))))],
+ NoItinerary> ;
+
+def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128,
+ v16i8, Neon_rev64>;
+def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128,
+ v8i16, Neon_rev64>;
+def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128,
+ v4i32, Neon_rev64>;
+def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64,
+ v8i8, Neon_rev64>;
+def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64,
+ v4i16, Neon_rev64>;
+def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64,
+ v2i32, Neon_rev64>;
+
+def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>;
+def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>;
+
+def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128,
+ v16i8, Neon_rev32>;
+def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128,
+ v8i16, Neon_rev32>;
+def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64,
+ v8i8, Neon_rev32>;
+def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64,
+ v4i16, Neon_rev32>;
+
+def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128,
+ v16i8, Neon_rev16>;
+def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64,
+ v8i8, Neon_rev16>;
+
+multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode,
+ SDPatternOperator Neon_Padd> {
+ def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.8h, $Rn.16b",
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.4h, $Rn.8b",
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.8h",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.4h",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2d, $Rn.4s",
+ [(set (v2i64 VPR128:$Rd),
+ (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.1d, $Rn.2s",
+ [(set (v1i64 VPR64:$Rd),
+ (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))],
+ NoItinerary>;
+}
+
+defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
+ int_arm_neon_vpaddls>;
+defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010,
+ int_arm_neon_vpaddlu>;
+
+multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
+ SDPatternOperator Neon_Padd> {
+ let Constraints = "$src = $Rd" in {
+ def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.8h, $Rn.16b",
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (Neon_Padd
+ (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+ asmop # "\t$Rd.4h, $Rn.8b",
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (Neon_Padd
+ (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.8h",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (Neon_Padd
+ (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.4h",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (Neon_Padd
+ (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.2d, $Rn.4s",
+ [(set (v2i64 VPR128:$Rd),
+ (v2i64 (Neon_Padd
+ (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+ asmop # "\t$Rd.1d, $Rn.2s",
+ [(set (v1i64 VPR64:$Rd),
+ (v1i64 (Neon_Padd
+ (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))],
+ NoItinerary>;
+ }
+}
+
+defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110,
+ int_arm_neon_vpadals>;
+defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110,
+ int_arm_neon_vpadalu>;
+
+multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> {
+ def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.16b, $Rn.16b",
+ [], NoItinerary>;
+
+ def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.8h, $Rn.8h",
+ [], NoItinerary>;
+
+ def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [], NoItinerary>;
+
+ def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2d, $Rn.2d",
+ [], NoItinerary>;
+
+ def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.8b, $Rn.8b",
+ [], NoItinerary>;
+
+ def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.4h, $Rn.4h",
+ [], NoItinerary>;
+
+ def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2s",
+ [], NoItinerary>;
+}
+
+defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>;
+defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>;
+defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>;
+defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>;
+
+multiclass NeonI_2VMisc_BHSD_1Arg_Pattern<string Prefix,
+ SDPatternOperator Neon_Op> {
+ def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))),
+ (v16i8 (!cast<Instruction>(Prefix # 16b) (v16i8 VPR128:$Rn)))>;
+
+ def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))),
+ (v8i16 (!cast<Instruction>(Prefix # 8h) (v8i16 VPR128:$Rn)))>;
+
+ def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))),
+ (v4i32 (!cast<Instruction>(Prefix # 4s) (v4i32 VPR128:$Rn)))>;
+
+ def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))),
+ (v2i64 (!cast<Instruction>(Prefix # 2d) (v2i64 VPR128:$Rn)))>;
+
+ def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))),
+ (v8i8 (!cast<Instruction>(Prefix # 8b) (v8i8 VPR64:$Rn)))>;
+
+ def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))),
+ (v4i16 (!cast<Instruction>(Prefix # 4h) (v4i16 VPR64:$Rn)))>;
+
+ def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))),
+ (v2i32 (!cast<Instruction>(Prefix # 2s) (v2i32 VPR64:$Rn)))>;
+}
+
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>;
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>;
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>;
+
+def : Pat<(v16i8 (sub
+ (v16i8 Neon_AllZero),
+ (v16i8 VPR128:$Rn))),
+ (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (sub
+ (v8i8 Neon_AllZero),
+ (v8i8 VPR64:$Rn))),
+ (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>;
+def : Pat<(v8i16 (sub
+ (v8i16 (bitconvert (v16i8 Neon_AllZero))),
+ (v8i16 VPR128:$Rn))),
+ (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>;
+def : Pat<(v4i16 (sub
+ (v4i16 (bitconvert (v8i8 Neon_AllZero))),
+ (v4i16 VPR64:$Rn))),
+ (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>;
+def : Pat<(v4i32 (sub
+ (v4i32 (bitconvert (v16i8 Neon_AllZero))),
+ (v4i32 VPR128:$Rn))),
+ (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>;
+def : Pat<(v2i32 (sub
+ (v2i32 (bitconvert (v8i8 Neon_AllZero))),
+ (v2i32 VPR64:$Rn))),
+ (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>;
+def : Pat<(v2i64 (sub
+ (v2i64 (bitconvert (v16i8 Neon_AllZero))),
+ (v2i64 VPR128:$Rn))),
+ (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>;
+
+multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> {
+ let Constraints = "$src = $Rd" in {
+ def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.16b, $Rn.16b",
+ [], NoItinerary>;
+
+ def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.8h, $Rn.8h",
+ [], NoItinerary>;
+
+ def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [], NoItinerary>;
+
+ def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.2d, $Rn.2d",
+ [], NoItinerary>;
+
+ def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+ asmop # "\t$Rd.8b, $Rn.8b",
+ [], NoItinerary>;
+
+ def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+ asmop # "\t$Rd.4h, $Rn.4h",
+ [], NoItinerary>;
+
+ def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2s",
+ [], NoItinerary>;
+ }
+}
+
+defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>;
+defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>;
+
+multiclass NeonI_2VMisc_BHSD_2Args_Pattern<string Prefix,
+ SDPatternOperator Neon_Op> {
+ def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))),
+ (v16i8 (!cast<Instruction>(Prefix # 16b)
+ (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>;
+
+ def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))),
+ (v8i16 (!cast<Instruction>(Prefix # 8h)
+ (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>;
+
+ def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))),
+ (v4i32 (!cast<Instruction>(Prefix # 4s)
+ (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>;
+
+ def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))),
+ (v2i64 (!cast<Instruction>(Prefix # 2d)
+ (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>;
+
+ def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))),
+ (v8i8 (!cast<Instruction>(Prefix # 8b)
+ (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>;
+
+ def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))),
+ (v4i16 (!cast<Instruction>(Prefix # 4h)
+ (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>;
+
+ def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))),
+ (v2i32 (!cast<Instruction>(Prefix # 2s)
+ (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>;
+}
+
+defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>;
+defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>;
+
+multiclass NeonI_2VMisc_BHSsizes<string asmop, bit U,
+ SDPatternOperator Neon_Op> {
+ def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.16b, $Rn.16b",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.8h, $Rn.8h",
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.8b, $Rn.8b",
+ [(set (v8i8 VPR64:$Rd),
+ (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.4h, $Rn.4h",
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))],
+ NoItinerary>;
+
+ def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2s",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+ NoItinerary>;
+}
+
+defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>;
+defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>;
+
+multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size,
+ bits<5> Opcode> {
+ def 16b : NeonI_2VMisc<0b1, U, size, Opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.16b, $Rn.16b",
+ [], NoItinerary>;
+
+ def 8b : NeonI_2VMisc<0b0, U, size, Opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.8b, $Rn.8b",
+ [], NoItinerary>;
+}
+
+defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>;
+defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>;
+defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>;
+
+def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b",
+ (NOT16b VPR128:$Rd, VPR128:$Rn), 0>;
+def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b",
+ (NOT8b VPR64:$Rd, VPR64:$Rn), 0>;
+
+def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))),
+ (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))),
+ (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>;
+
+def : Pat<(v16i8 (xor
+ (v16i8 VPR128:$Rn),
+ (v16i8 Neon_AllOne))),
+ (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (xor
+ (v8i8 VPR64:$Rn),
+ (v8i8 Neon_AllOne))),
+ (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>;
+def : Pat<(v8i16 (xor
+ (v8i16 VPR128:$Rn),
+ (v8i16 (bitconvert (v16i8 Neon_AllOne))))),
+ (NOT16b VPR128:$Rn)>;
+def : Pat<(v4i16 (xor
+ (v4i16 VPR64:$Rn),
+ (v4i16 (bitconvert (v8i8 Neon_AllOne))))),
+ (NOT8b VPR64:$Rn)>;
+def : Pat<(v4i32 (xor
+ (v4i32 VPR128:$Rn),
+ (v4i32 (bitconvert (v16i8 Neon_AllOne))))),
+ (NOT16b VPR128:$Rn)>;
+def : Pat<(v2i32 (xor
+ (v2i32 VPR64:$Rn),
+ (v2i32 (bitconvert (v8i8 Neon_AllOne))))),
+ (NOT8b VPR64:$Rn)>;
+def : Pat<(v2i64 (xor
+ (v2i64 VPR128:$Rn),
+ (v2i64 (bitconvert (v16i8 Neon_AllOne))))),
+ (NOT16b VPR128:$Rn)>;
+
+def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))),
+ (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))),
+ (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>;
+
+multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode,
+ SDPatternOperator Neon_Op> {
+ def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [(set (v4f32 VPR128:$Rd),
+ (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2d, $Rn.2d",
+ [(set (v2f64 VPR128:$Rd),
+ (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2s",
+ [(set (v2f32 VPR64:$Rd),
+ (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))],
+ NoItinerary>;
+}
+
+defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>;
+defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>;
+
+multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> {
+ def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.8b, $Rn.8h",
+ [], NoItinerary>;
+
+ def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4h, $Rn.4s",
+ [], NoItinerary>;
+
+ def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2d",
+ [], NoItinerary>;
+
+ let Constraints = "$Rd = $src" in {
+ def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "2\t$Rd.16b, $Rn.8h",
+ [], NoItinerary>;
+
+ def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "2\t$Rd.8h, $Rn.4s",
+ [], NoItinerary>;
+
+ def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "2\t$Rd.4s, $Rn.2d",
+ [], NoItinerary>;
+ }
+}
+
+defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>;
+defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>;
+defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>;
+defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>;
+
+multiclass NeonI_2VMisc_Narrow_Patterns<string Prefix,
+ SDPatternOperator Neon_Op> {
+ def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))),
+ (v8i8 (!cast<Instruction>(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>;
+
+ def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))),
+ (v4i16 (!cast<Instruction>(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>;
+
+ def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))),
+ (v2i32 (!cast<Instruction>(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>;
+
+ def : Pat<(v16i8 (concat_vectors
+ (v8i8 VPR64:$src),
+ (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))),
+ (!cast<Instruction>(Prefix # 8h16b)
+ (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+ VPR128:$Rn)>;
+
+ def : Pat<(v8i16 (concat_vectors
+ (v4i16 VPR64:$src),
+ (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))),
+ (!cast<Instruction>(Prefix # 4s8h)
+ (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+ VPR128:$Rn)>;
+
+ def : Pat<(v4i32 (concat_vectors
+ (v2i32 VPR64:$src),
+ (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))),
+ (!cast<Instruction>(Prefix # 2d4s)
+ (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+ VPR128:$Rn)>;
+}
+
+defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>;
+defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>;
+defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>;
+defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>;
+
+multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> {
+ let DecoderMethod = "DecodeSHLLInstruction" in {
+ def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR64:$Rn, uimm_exact8:$Imm),
+ asmop # "\t$Rd.8h, $Rn.8b, $Imm",
+ [], NoItinerary>;
+
+ def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR64:$Rn, uimm_exact16:$Imm),
+ asmop # "\t$Rd.4s, $Rn.4h, $Imm",
+ [], NoItinerary>;
+
+ def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR64:$Rn, uimm_exact32:$Imm),
+ asmop # "\t$Rd.2d, $Rn.2s, $Imm",
+ [], NoItinerary>;
+
+ def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR128:$Rn, uimm_exact8:$Imm),
+ asmop # "2\t$Rd.8h, $Rn.16b, $Imm",
+ [], NoItinerary>;
+
+ def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR128:$Rn, uimm_exact16:$Imm),
+ asmop # "2\t$Rd.4s, $Rn.8h, $Imm",
+ [], NoItinerary>;
+
+ def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR128:$Rn, uimm_exact32:$Imm),
+ asmop # "2\t$Rd.2d, $Rn.4s, $Imm",
+ [], NoItinerary>;
+ }
+}
+
+defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>;
+
+class NeonI_SHLL_Patterns<ValueType OpTy, ValueType DesTy,
+ SDPatternOperator ExtOp, Operand Neon_Imm,
+ string suffix>
+ : Pat<(DesTy (shl
+ (DesTy (ExtOp (OpTy VPR64:$Rn))),
+ (DesTy (Neon_vdup
+ (i32 Neon_Imm:$Imm))))),
+ (!cast<Instruction>("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>;
+
+class NeonI_SHLL_High_Patterns<ValueType OpTy, ValueType DesTy,
+ SDPatternOperator ExtOp, Operand Neon_Imm,
+ string suffix, PatFrag GetHigh>
+ : Pat<(DesTy (shl
+ (DesTy (ExtOp
+ (OpTy (GetHigh VPR128:$Rn)))),
+ (DesTy (Neon_vdup
+ (i32 Neon_Imm:$Imm))))),
+ (!cast<Instruction>("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>;
+
+def : NeonI_SHLL_Patterns<v8i8, v8i16, zext, uimm_exact8, "8b8h">;
+def : NeonI_SHLL_Patterns<v8i8, v8i16, sext, uimm_exact8, "8b8h">;
+def : NeonI_SHLL_Patterns<v4i16, v4i32, zext, uimm_exact16, "4h4s">;
+def : NeonI_SHLL_Patterns<v4i16, v4i32, sext, uimm_exact16, "4h4s">;
+def : NeonI_SHLL_Patterns<v2i32, v2i64, zext, uimm_exact32, "2s2d">;
+def : NeonI_SHLL_Patterns<v2i32, v2i64, sext, uimm_exact32, "2s2d">;
+def : NeonI_SHLL_High_Patterns<v8i8, v8i16, zext, uimm_exact8, "16b8h",
+ Neon_High16B>;
+def : NeonI_SHLL_High_Patterns<v8i8, v8i16, sext, uimm_exact8, "16b8h",
+ Neon_High16B>;
+def : NeonI_SHLL_High_Patterns<v4i16, v4i32, zext, uimm_exact16, "8h4s",
+ Neon_High8H>;
+def : NeonI_SHLL_High_Patterns<v4i16, v4i32, sext, uimm_exact16, "8h4s",
+ Neon_High8H>;
+def : NeonI_SHLL_High_Patterns<v2i32, v2i64, zext, uimm_exact32, "4s2d",
+ Neon_High4S>;
+def : NeonI_SHLL_High_Patterns<v2i32, v2i64, sext, uimm_exact32, "4s2d",
+ Neon_High4S>;
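+
+// For illustration, these patterns are expected to match a widening shift by
+// exactly the source element width, e.g. (shl (zext v8i8), splat 8), which
+// selects shll v0.8h, v1.8b, #8.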
+
+multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> {
+ def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4h, $Rn.4s",
+ [], NoItinerary>;
+
+ def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2d",
+ [], NoItinerary>;
+
+ let Constraints = "$src = $Rd" in {
+ def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "2\t$Rd.8h, $Rn.4s",
+ [], NoItinerary>;
+
+ def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "2\t$Rd.4s, $Rn.2d",
+ [], NoItinerary>;
+ }
+}
+
+defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>;
+
+multiclass NeonI_2VMisc_Narrow_Pattern<string prefix,
+ SDPatternOperator f32_to_f16_Op,
+ SDPatternOperator f64_to_f32_Op> {
+
+ def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))),
+ (!cast<Instruction>(prefix # "4s4h") (v4f32 VPR128:$Rn))>;
+
+ def : Pat<(v8i16 (concat_vectors
+ (v4i16 VPR64:$src),
+ (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))),
+ (!cast<Instruction>(prefix # "4s8h")
+ (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+ (v4f32 VPR128:$Rn))>;
+
+ def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))),
+ (!cast<Instruction>(prefix # "2d2s") (v2f64 VPR128:$Rn))>;
+
+ def : Pat<(v4f32 (concat_vectors
+ (v2f32 VPR64:$src),
+ (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))),
+ (!cast<Instruction>(prefix # "2d4s")
+ (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+ (v2f64 VPR128:$Rn))>;
+}
+
+defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>;
+
+multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U,
+ bits<5> opcode> {
+ def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2d",
+ [], NoItinerary>;
+
+ def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "2\t$Rd.4s, $Rn.2d",
+ [], NoItinerary> {
+ let Constraints = "$src = $Rd";
+ }
+
+ def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))),
+ (!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>;
+
+ def : Pat<(v4f32 (concat_vectors
+ (v2f32 VPR64:$src),
+ (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))),
+ (!cast<Instruction>(prefix # "2d4s")
+ (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+ VPR128:$Rn)>;
+}
+
+defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn", "FCVTXN", 0b1, 0b10110>;
+
+def Neon_High4Float : PatFrag<(ops node:$in),
+ (extract_subvector (v4f32 node:$in), (iPTR 2))>;
+
+multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> {
+ def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4h",
+ [], NoItinerary>;
+
+ def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2d, $Rn.2s",
+ [], NoItinerary>;
+
+ def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "2\t$Rd.4s, $Rn.8h",
+ [], NoItinerary>;
+
+ def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "2\t$Rd.2d, $Rn.4s",
+ [], NoItinerary>;
+}
+
+defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>;
+
+multiclass NeonI_2VMisc_Extend_Pattern<string prefix> {
+ def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))),
+ (!cast<Instruction>(prefix # "4h4s") VPR64:$Rn)>;
+
+ def : Pat<(v4f32 (int_arm_neon_vcvthf2fp
+ (v4i16 (Neon_High8H
+ (v8i16 VPR128:$Rn))))),
+ (!cast<Instruction>(prefix # "8h4s") VPR128:$Rn)>;
+
+ def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))),
+ (!cast<Instruction>(prefix # "2s2d") VPR64:$Rn)>;
+
+ def : Pat<(v2f64 (fextend
+ (v2f32 (Neon_High4Float
+ (v4f32 VPR128:$Rn))))),
+ (!cast<Instruction>(prefix # "4s2d") VPR128:$Rn)>;
+}
+
+defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">;
+
+multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode,
+ ValueType ResTy4s, ValueType OpTy4s,
+ ValueType ResTy2d, ValueType OpTy2d,
+ ValueType ResTy2s, ValueType OpTy2s,
+ SDPatternOperator Neon_Op> {
+
+ def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [(set (ResTy4s VPR128:$Rd),
+ (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.2d, $Rn.2d",
+ [(set (ResTy2d VPR128:$Rd),
+ (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2s",
+ [(set (ResTy2s VPR64:$Rd),
+ (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))],
+ NoItinerary>;
+}
+
+multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
+ bits<5> opcode, SDPatternOperator Neon_Op> {
+ defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4i32, v4f32, v2i64,
+ v2f64, v2i32, v2f32, Neon_Op>;
+}
+
+defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010,
+ int_aarch64_neon_fcvtns>;
+defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010,
+ int_aarch64_neon_fcvtnu>;
+defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010,
+ int_aarch64_neon_fcvtps>;
+defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010,
+ int_aarch64_neon_fcvtpu>;
+defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011,
+ int_aarch64_neon_fcvtms>;
+defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011,
+ int_aarch64_neon_fcvtmu>;
+defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>;
+defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>;
+defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100,
+ int_aarch64_neon_fcvtas>;
+defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100,
+ int_aarch64_neon_fcvtau>;
+
+multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
+ bits<5> opcode, SDPatternOperator Neon_Op> {
+ defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64,
+ v2i64, v2f32, v2i32, Neon_Op>;
+}
+
+defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>;
+defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>;
+
+multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U,
+ bits<5> opcode, SDPatternOperator Neon_Op> {
+ defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64,
+ v2f64, v2f32, v2f32, Neon_Op>;
+}
+
+defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000,
+ int_aarch64_neon_frintn>;
+defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>;
+defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>;
+defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>;
+defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>;
+defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>;
+defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>;
+defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
+ int_arm_neon_vrecpe>;
+defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
+ int_arm_neon_vrsqrte>;
+defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
+
+multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
+ bits<5> opcode, SDPatternOperator Neon_Op> {
+ def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+ NoItinerary>;
+
+ def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn),
+ asmop # "\t$Rd.2s, $Rn.2s",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+ NoItinerary>;
+}
+
+defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
+ int_arm_neon_vrecpe>;
+defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100,
+ int_arm_neon_vrsqrte>;
+
+// Crypto Class
+class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
+ string asmop, SDPatternOperator opnode>
+ : NeonI_Crypto_AES<size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.16b, $Rn.16b",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (opnode (v16i8 VPR128:$src),
+ (v16i8 VPR128:$Rn))))],
+                     NoItinerary> {
+ let Constraints = "$src = $Rd";
+ let Predicates = [HasNEON, HasCrypto];
+}
+
+def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>;
+def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>;
+
+class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
+ string asmop, SDPatternOperator opnode>
+ : NeonI_Crypto_AES<size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn),
+ asmop # "\t$Rd.16b, $Rn.16b",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (opnode (v16i8 VPR128:$Rn))))],
+ NoItinerary>;
+
+def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
+def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;
+
+class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
+ string asmop, SDPatternOperator opnode>
+ : NeonI_Crypto_SHA<size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+ asmop # "\t$Rd.4s, $Rn.4s",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (opnode (v4i32 VPR128:$src),
+ (v4i32 VPR128:$Rn))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+ let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1",
+ int_arm_neon_sha1su1>;
+def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0",
+ int_arm_neon_sha256su0>;
+
+class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
+ string asmop, SDPatternOperator opnode>
+ : NeonI_Crypto_SHA<size, opcode,
+ (outs FPR32:$Rd), (ins FPR32:$Rn),
+ asmop # "\t$Rd, $Rn",
+ [(set (v1i32 FPR32:$Rd),
+ (v1i32 (opnode (v1i32 FPR32:$Rn))))],
+ NoItinerary> {
+ let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
+
+class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
+ SDPatternOperator opnode>
+ : NeonI_Crypto_3VSHA<size, opcode,
+ (outs VPR128:$Rd),
+ (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (opnode (v4i32 VPR128:$src),
+ (v4i32 VPR128:$Rn),
+ (v4i32 VPR128:$Rm))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+ let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0",
+ int_arm_neon_sha1su0>;
+def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1",
+ int_arm_neon_sha256su1>;
+
+class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
+ SDPatternOperator opnode>
+ : NeonI_Crypto_3VSHA<size, opcode,
+ (outs FPR128:$Rd),
+ (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd, $Rn, $Rm.4s",
+ [(set (v4i32 FPR128:$Rd),
+ (v4i32 (opnode (v4i32 FPR128:$src),
+ (v4i32 FPR128:$Rn),
+ (v4i32 VPR128:$Rm))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+ let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
+ int_arm_neon_sha256h>;
+def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
+ int_arm_neon_sha256h2>;
+
+class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop,
+ SDPatternOperator opnode>
+ : NeonI_Crypto_3VSHA<size, opcode,
+ (outs FPR128:$Rd),
+ (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd, $Rn, $Rm.4s",
+ [(set (v4i32 FPR128:$Rd),
+ (v4i32 (opnode (v4i32 FPR128:$src),
+ (v1i32 FPR32:$Rn),
+ (v4i32 VPR128:$Rm))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+ let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>;
+def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", int_aarch64_neon_sha1p>;
+def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>;
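+
+// For illustration, the qsv forms above take a 128-bit accumulator, a 32-bit
+// scalar and a 4s vector, i.e. assembly along the lines of sha1c q0, s1, v2.4s.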
+
+//
+// Patterns for handling half-precision values
+//
+
+// Convert an f16 value coming in as an i16 value to f32.
+def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))),
+ (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
+def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))),
+ (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
+
+def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 (
+ f32_to_f16 (f32 FPR32:$Rn))))))),
+ (f32 FPR32:$Rn)>;
+
+// Patterns for vector extract of half-precision FP value in i16 storage type
+def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
+ (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))),
+ (FCVTsh (f16 (DUPhv_H
+ (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ neon_uimm2_bare:$Imm)))>;
+
+def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
+ (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))),
+ (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>;
+
+// Patterns for vector insert of half-precision FP value 0 in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+ (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
+ (neon_uimm3_bare:$Imm))),
+ (v8i16 (INSELh (v8i16 VPR128:$Rn),
+ (v8i16 (SUBREG_TO_REG (i64 0),
+ (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
+ sub_16)),
+ neon_uimm3_bare:$Imm, 0))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+ (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
+ (neon_uimm2_bare:$Imm))),
+ (v4i16 (EXTRACT_SUBREG
+ (v8i16 (INSELh
+ (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ (v8i16 (SUBREG_TO_REG (i64 0),
+ (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
+ sub_16)),
+ neon_uimm2_bare:$Imm, 0)),
+ sub_64))>;
+
+// Patterns for vector insert of half-precision FP value in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+ (i32 (assertsext (i32 (fp_to_sint
+ (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
+ (neon_uimm3_bare:$Imm))),
+ (v8i16 (INSELh (v8i16 VPR128:$Rn),
+ (v8i16 (SUBREG_TO_REG (i64 0),
+ (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
+ sub_16)),
+ neon_uimm3_bare:$Imm, 0))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+ (i32 (assertsext (i32 (fp_to_sint
+ (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
+ (neon_uimm2_bare:$Imm))),
+ (v4i16 (EXTRACT_SUBREG
+ (v8i16 (INSELh
+ (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ (v8i16 (SUBREG_TO_REG (i64 0),
+ (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
+ sub_16)),
+ neon_uimm2_bare:$Imm, 0)),
+ sub_64))>;
+
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+ (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
+ (neon_uimm3_bare:$Imm1))),
+ (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
+ neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
+
+// Patterns for vector copy of half-precision FP value in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+ (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
+ (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
+ 65535)))))))),
+ (neon_uimm3_bare:$Imm1))),
+ (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
+ neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+ (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
+ (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)),
+ 65535)))))))),
+ (neon_uimm3_bare:$Imm1))),
+ (v4i16 (EXTRACT_SUBREG
+ (v8i16 (INSELh
+ (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+ (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+ neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)),
+ sub_64))>;
+
-def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))),
- (f64 (EXTRACT_SUBREG (v8i8 VPR64:$src), sub_64))>;
-def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))),
- (f64 (EXTRACT_SUBREG (v4i16 VPR64:$src), sub_64))>;
-def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))),
- (f64 (EXTRACT_SUBREG (v2i32 VPR64:$src), sub_64))>;
-def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))),
- (f64 (EXTRACT_SUBREG (v2f32 VPR64:$src), sub_64))>;
-def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))),
- (f64 (EXTRACT_SUBREG (v1i64 VPR64:$src), sub_64))>;
-def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))),
- (f128 (EXTRACT_SUBREG (v16i8 VPR128:$src), sub_alias))>;
-def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))),
- (f128 (EXTRACT_SUBREG (v8i16 VPR128:$src), sub_alias))>;
-def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))),
- (f128 (EXTRACT_SUBREG (v4i32 VPR128:$src), sub_alias))>;
-def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))),
- (f128 (EXTRACT_SUBREG (v2i64 VPR128:$src), sub_alias))>;
-def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))),
- (f128 (EXTRACT_SUBREG (v4f32 VPR128:$src), sub_alias))>;
-def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))),
- (f128 (EXTRACT_SUBREG (v2f64 VPR128:$src), sub_alias))>;
-
-def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))),
- (v8i8 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
-def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
- (v4i16 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
-def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
- (v2i32 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
-def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
- (v2f32 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
-def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))),
- (v1i64 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
-def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
- (v16i8 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
- sub_alias))>;
-def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
- (v8i16 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
- sub_alias))>;
-def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
- (v4i32 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
- sub_alias))>;
-def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
- (v2i64 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
- sub_alias))>;
-def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
- (v4f32 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
- sub_alias))>;
-def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
- (v2f64 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
- sub_alias))>;
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 7ce5ce3..8cfb968 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -121,7 +121,7 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
break;
case MachineOperand::MO_GlobalAddress:
- MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()));
+ MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal()));
break;
case MachineOperand::MO_MachineBasicBlock:
MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index b3a81b1..4e2022c 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -18,9 +18,19 @@ def sub_32 : SubRegIndex<32>;
def sub_16 : SubRegIndex<16>;
def sub_8 : SubRegIndex<8>;
-// The VPR registers are handled as sub-registers of FPR equivalents, but
-// they're really the same thing. We give this concept a special index.
-def sub_alias : SubRegIndex<128>;
+// Note: Code depends on these having consecutive numbers.
+def qqsub : SubRegIndex<256, 256>;
+
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
}
// Registers are identified with 5-bit ID numbers.
@@ -137,60 +147,51 @@ foreach Index = 0-31 in {
}
-def FPR8 : RegisterClass<"AArch64", [i8], 8,
+def FPR8 : RegisterClass<"AArch64", [i8, v1i8], 8,
(sequence "B%u", 0, 31)> {
}
-def FPR16 : RegisterClass<"AArch64", [f16], 16,
+def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16,
(sequence "H%u", 0, 31)> {
}
-def FPR32 : RegisterClass<"AArch64", [f32], 32,
+def FPR32 : RegisterClass<"AArch64", [f32, v1i32, v1f32], 32,
(sequence "S%u", 0, 31)> {
}
-def FPR64 : RegisterClass<"AArch64", [f64], 64,
- (sequence "D%u", 0, 31)> {
-}
+def FPR64 : RegisterClass<"AArch64",
+ [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
+ 64, (sequence "D%u", 0, 31)>;
-def FPR128 : RegisterClass<"AArch64", [f128], 128,
- (sequence "Q%u", 0, 31)> {
-}
+def FPR128 : RegisterClass<"AArch64",
+ [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+ 128, (sequence "Q%u", 0, 31)>;
+
+def FPR64Lo : RegisterClass<"AArch64",
+ [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
+ 64, (sequence "D%u", 0, 15)>;
+def FPR128Lo : RegisterClass<"AArch64",
+ [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+ 128, (sequence "Q%u", 0, 15)>;
//===----------------------------------------------------------------------===//
// Vector registers:
//===----------------------------------------------------------------------===//
-// NEON registers simply specify the overall vector, and it's expected that
-// Instructions will individually specify the acceptable data layout. In
-// principle this leaves two approaches open:
-// + An operand, giving a single ADDvvv instruction (for example). This turns
-// out to be unworkable in the assembly parser (without every Instruction
-// having a "cvt" function, at least) because the constraints can't be
-// properly enforced. It also complicates specifying patterns since each
-// instruction will accept many types.
-// + A bare token (e.g. ".2d"). This means the AsmParser has to know specific
-// details about NEON registers, but simplifies most other details.
-//
-// The second approach was taken.
-
-foreach Index = 0-31 in {
- def V # Index : AArch64RegWithSubs<Index, "v" # Index,
- [!cast<Register>("Q" # Index)],
- [sub_alias]>,
- DwarfRegNum<[!add(Index, 64)]>;
+def VPR64AsmOperand : AsmOperandClass {
+ let Name = "VPR";
+ let PredicateMethod = "isReg";
+ let RenderMethod = "addRegOperands";
}
-// These two classes contain the same registers, which should be reasonably
-// sensible for MC and allocation purposes, but allows them to be treated
-// separately for things like stack spilling.
-def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8, v1i64], 64,
- (sequence "V%u", 0, 31)>;
+def VPR64 : RegisterOperand<FPR64, "printVPRRegister">;
-def VPR128 : RegisterClass<"AArch64",
- [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128,
- (sequence "V%u", 0, 31)>;
+def VPR128 : RegisterOperand<FPR128, "printVPRRegister">;
+
+def VPR64Lo : RegisterOperand<FPR64Lo, "printVPRRegister">;
+
+def VPR128Lo : RegisterOperand<FPR128Lo, "printVPRRegister">;
// Flags register
def NZCV : Register<"nzcv"> {
@@ -201,3 +202,90 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
let CopyCost = -1;
let isAllocatable = 0;
}
+
+//===----------------------------------------------------------------------===//
+// Consecutive vector registers
+//===----------------------------------------------------------------------===//
+// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31
+def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
+ [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+
+// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2)]>;
+
+// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2
+def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+ [(rotl FPR64, 0), (rotl FPR64, 1),
+ (rotl FPR64, 2), (rotl FPR64, 3)]>;
+
+// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1],
+ [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+
+// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1
+def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2)]>;
+
+// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2
+def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3],
+ [(rotl FPR128, 0), (rotl FPR128, 1),
+ (rotl FPR128, 2), (rotl FPR128, 3)]>;
+
+// The following are super-register classes to model 2/3/4 consecutive
+// 64-bit/128-bit registers.
+
+def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>;
+
+def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> {
+  let Size = 192; // 3 x 64 bits; there is no predefined type of that size.
+}
+
+def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>;
+
+def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>;
+
+def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> {
+  let Size = 384; // 3 x 128 bits; there is no predefined type of that size.
+}
+
+def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>;
+
+
+// The following are vector list operands.
+multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count,
+ RegisterClass RegList> {
+ def _asmoperand : AsmOperandClass {
+ let Name = PREFIX # LAYOUT # Count;
+ let RenderMethod = "addVectorListOperands";
+ let PredicateMethod =
+ "isVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">";
+ let ParserMethod = "ParseVectorList";
+ }
+
+ def _operand : RegisterOperand<RegList,
+ "printVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"> {
+ let ParserMatchClass =
+ !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand");
+ }
+}
+
+multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
+ RegisterClass QRegList> {
+ defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>;
+ defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>;
+ defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>;
+ defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>;
+ defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>;
+ defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>;
+ defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>;
+ defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>;
+}
+
+// Vector list operands with 1/2/3/4 registers: VOne8B_operand, ..., VQuad2D_operand
+defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
+defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
+defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
+defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
\ No newline at end of file
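As a rough sketch (not part of the patch), this is how the tuple classes above get used by the asm parser later in this diff; the helper name is made up, and the usual LLVM MC and AArch64 MC target headers are assumed:

static unsigned getDPairFor(const llvm::MCRegisterInfo &MRI, unsigned FirstDReg) {
  // Look up the DPair tuple register whose dsub_0 element is FirstDReg,
  // e.g. AArch64::D0 maps to the D0_D1 pair register.
  return MRI.getMatchingSuperReg(FirstDReg, AArch64::dsub_0,
                                 &AArch64MCRegisterClasses[AArch64::DPairRegClassID]);
}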
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index d71bb4e..5c693c1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -25,11 +25,31 @@
using namespace llvm;
+// Pin the vtable to this file.
+void AArch64Subtarget::anchor() {}
+
AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
- : AArch64GenSubtargetInfo(TT, CPU, FS), HasNEON(false), HasCrypto(false),
- TargetTriple(TT) {
+ : AArch64GenSubtargetInfo(TT, CPU, FS), HasFPARMv8(false), HasNEON(false),
+ HasCrypto(false), TargetTriple(TT), CPUString(CPU) {
+
+ initializeSubtargetFeatures(CPU, FS);
+}
+
+void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
+ StringRef FS) {
+ if (CPU.empty())
+ CPUString = "generic";
+
+ std::string FullFS = FS;
+ if (CPUString == "generic") {
+ // Enable FP by default.
+ if (FullFS.empty())
+ FullFS = "+fp-armv8";
+ else
+ FullFS = "+fp-armv8," + FullFS;
+ }
- ParseSubtargetFeatures(CPU, FS);
+ ParseSubtargetFeatures(CPU, FullFS);
}
bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
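For illustration only, a minimal standalone model of the feature-string handling above (hypothetical helper, not the actual method):

#include <string>

// An empty or "generic" CPU always gets +fp-armv8 prepended to the feature string.
static std::string buildFullFS(const std::string &CPU, const std::string &FS) {
  std::string FullFS = FS;
  if (CPU.empty() || CPU == "generic") {
    if (FullFS.empty())
      FullFS = "+fp-armv8";
    else
      FullFS = "+fp-armv8," + FullFS;
  }
  return FullFS;
}
// buildFullFS("generic", "")      -> "+fp-armv8"
// buildFullFS("generic", "+neon") -> "+fp-armv8,+neon"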
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 35a7c8d..bbfd3bc 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -27,18 +27,31 @@ class StringRef;
class GlobalValue;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
+ virtual void anchor();
protected:
+ bool HasFPARMv8;
bool HasNEON;
bool HasCrypto;
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
+
+ /// CPUString - String name of used CPU.
+ std::string CPUString;
+
+private:
+ void initializeSubtargetFeatures(StringRef CPU, StringRef FS);
+
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
///
AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS);
+ virtual bool enableMachineScheduler() const {
+ return true;
+ }
+
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -46,11 +59,13 @@ public:
bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
- bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
-
bool hasCrypto() const { return HasCrypto; }
+
+ const std::string & getCPUString() const { return CPUString; }
};
} // End llvm namespace
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 43e91ac..fbbce11 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -54,8 +54,9 @@ public:
#include "AArch64GenAsmMatcher.inc"
};
- AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
MCAsmParserExtension::Initialize(_Parser);
// Initialize the set of available features.
@@ -126,6 +127,11 @@ public:
OperandMatchResultTy
ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout,
+ SMLoc &LayoutLoc);
+
+ OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &);
+
bool validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
@@ -153,6 +159,7 @@ private:
k_Immediate, // Including expressions referencing symbols
k_Register,
k_ShiftExtend,
+ k_VectorList, // A sequential list of 1 to 4 registers.
k_SysReg, // The register operand of MRS and MSR instructions
k_Token, // The mnemonic; other raw tokens the auto-generated
k_WrappedRegister // Load/store exclusive permit a wrapped register.
@@ -188,6 +195,13 @@ private:
bool ImplicitAmount;
};
+ // A vector register list is a sequential list of 1 to 4 registers.
+ struct VectorListOp {
+ unsigned RegNum;
+ unsigned Count;
+ A64Layout::VectorLayout Layout;
+ };
+
struct SysRegOp {
const char *Data;
unsigned Length;
@@ -205,6 +219,7 @@ private:
struct ImmOp Imm;
struct RegOp Reg;
struct ShiftExtendOp ShiftExtend;
+ struct VectorListOp VectorList;
struct SysRegOp SysReg;
struct TokOp Tok;
};
@@ -664,6 +679,44 @@ public:
return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
}
+ // if 0 < value <= w, return true
+ bool isShrFixedWidth(int w) const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return false;
+ int64_t Value = CE->getValue();
+ return Value > 0 && Value <= w;
+ }
+
+ bool isShrImm8() const { return isShrFixedWidth(8); }
+
+ bool isShrImm16() const { return isShrFixedWidth(16); }
+
+ bool isShrImm32() const { return isShrFixedWidth(32); }
+
+ bool isShrImm64() const { return isShrFixedWidth(64); }
+
+ // if 0 <= value < w, return true
+ bool isShlFixedWidth(int w) const {
+ if (!isImm())
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return false;
+ int64_t Value = CE->getValue();
+ return Value >= 0 && Value < w;
+ }
+
+ bool isShlImm8() const { return isShlFixedWidth(8); }
+
+ bool isShlImm16() const { return isShlFixedWidth(16); }
+
+ bool isShlImm32() const { return isShlFixedWidth(32); }
+
+ bool isShlImm64() const { return isShlFixedWidth(64); }
+
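The two predicate families above reduce to simple range checks; a standalone restatement (a sketch that ignores the MCConstantExpr plumbing) matching the diagnostic ranges added further down in this file:

#include <cstdint>

// Right-shift immediates are valid in [1, W]; left-shift immediates in [0, W-1].
static bool isValidShrImm(int64_t Value, int W) { return Value > 0 && Value <= W; }
static bool isValidShlImm(int64_t Value, int W) { return Value >= 0 && Value < W; }
// e.g. isValidShrImm(8, 8) is true, while isValidShlImm(8, 8) is false.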
bool isNeonMovImmShiftLSL() const {
if (!isShiftOrExtend())
return false;
@@ -697,6 +750,12 @@ public:
return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
}
+ template <A64Layout::VectorLayout Layout, unsigned Count>
+ bool isVectorList() const {
+ return Kind == k_VectorList && VectorList.Layout == Layout &&
+ VectorList.Count == Count;
+ }
+
template <int MemSize> bool isSImm7Scaled() const {
if (!isImm())
return false;
@@ -756,6 +815,17 @@ public:
return true;
}
+ // if value == N, return true
+ template<int N>
+ bool isExactImm() const {
+ if (!isImm()) return false;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+
+ return CE->getValue() == N;
+ }
+
static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
unsigned ShiftAmount,
bool ImplicitAmount,
@@ -817,6 +887,18 @@ public:
return Op;
}
+ static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
+ A64Layout::VectorLayout Layout,
+ SMLoc S, SMLoc E) {
+ AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
+ Op->VectorList.RegNum = RegNum;
+ Op->VectorList.Count = Count;
+ Op->VectorList.Layout = Layout;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
Op->Tok.Data = Str.data();
@@ -1164,6 +1246,11 @@ public:
}
Inst.addOperand(MCOperand::CreateImm(Imm));
}
+
+ void addVectorListOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+ }
};
} // end anonymous namespace.
@@ -1203,7 +1290,6 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
else
return MatchOperand_Success;
}
-
// ... or it might be a symbolish thing
}
// Fall through
@@ -1247,7 +1333,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return ParseOperand(Operands, Mnemonic);
}
// The following will likely be useful later, but not in very early cases
- case AsmToken::LCurly: // Weird SIMD lists
+ case AsmToken::LCurly: // SIMD vector list is not parsed here
llvm_unreachable("Don't know how to deal with '{' in operand");
return MatchOperand_ParseFail;
}
@@ -1405,7 +1491,7 @@ AArch64AsmParser::ParseImmWithLSLOperand(
// The optional operand must be "lsl #N" where N is non-negative.
if (Parser.getTok().is(AsmToken::Identifier)
- && Parser.getTok().getIdentifier().lower() == "lsl") {
+ && Parser.getTok().getIdentifier().equals_lower("lsl")) {
Parser.Lex();
if (Parser.getTok().is(AsmToken::Hash)) {
@@ -1462,9 +1548,8 @@ AArch64AsmParser::ParseCRxOperand(
return MatchOperand_ParseFail;
}
- std::string LowerTok = Parser.getTok().getIdentifier().lower();
- StringRef Tok(LowerTok);
- if (Tok[0] != 'c') {
+ StringRef Tok = Parser.getTok().getIdentifier();
+ if (Tok[0] != 'c' && Tok[0] != 'C') {
Error(S, "Expected cN operand where 0 <= N <= 15");
return MatchOperand_ParseFail;
}
@@ -1536,22 +1621,11 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
std::string LowerReg = Tok.getString().lower();
size_t DotPos = LowerReg.find('.');
- RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
- if (RegNum == AArch64::NoRegister) {
- RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
- .Case("ip0", AArch64::X16)
- .Case("ip1", AArch64::X17)
- .Case("fp", AArch64::X29)
- .Case("lr", AArch64::X30)
- .Default(AArch64::NoRegister);
- }
- if (RegNum == AArch64::NoRegister)
- return false;
-
+ bool IsVec128 = false;
SMLoc S = Tok.getLoc();
RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
- if (DotPos == StringRef::npos) {
+ if (DotPos == std::string::npos) {
Layout = StringRef();
} else {
// Everything afterwards needs to be a literal token, expected to be
@@ -1561,20 +1635,78 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
// gives us a permanent string to use in the token (a pointer into LowerReg
// would go out of scope when we return).
LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
- std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos);
+ StringRef LayoutText = StringRef(LowerReg).substr(DotPos);
+
+ // See if it's a 128-bit layout first.
Layout = StringSwitch<const char *>(LayoutText)
- .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d")
- .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s")
- .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h")
- .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b")
+ .Case(".q", ".q").Case(".1q", ".1q")
+ .Case(".d", ".d").Case(".2d", ".2d")
+ .Case(".s", ".s").Case(".4s", ".4s")
+ .Case(".h", ".h").Case(".8h", ".8h")
+ .Case(".b", ".b").Case(".16b", ".16b")
.Default("");
+ if (Layout.size() != 0)
+ IsVec128 = true;
+ else {
+ Layout = StringSwitch<const char *>(LayoutText)
+ .Case(".1d", ".1d")
+ .Case(".2s", ".2s")
+ .Case(".4h", ".4h")
+ .Case(".8b", ".8b")
+ .Default("");
+ }
+
if (Layout.size() == 0) {
- // Malformed register
+ // If we've still not pinned it down the register is malformed.
return false;
}
}
+ RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
+ if (RegNum == AArch64::NoRegister) {
+ RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
+ .Case("ip0", AArch64::X16)
+ .Case("ip1", AArch64::X17)
+ .Case("fp", AArch64::X29)
+ .Case("lr", AArch64::X30)
+ .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0)
+ .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1)
+ .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2)
+ .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3)
+ .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4)
+ .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5)
+ .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6)
+ .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7)
+ .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8)
+ .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9)
+ .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10)
+ .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11)
+ .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12)
+ .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13)
+ .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14)
+ .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15)
+ .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16)
+ .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17)
+ .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18)
+ .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19)
+ .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20)
+ .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21)
+ .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22)
+ .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23)
+ .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24)
+ .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25)
+ .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26)
+ .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27)
+ .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28)
+ .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29)
+ .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30)
+ .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31)
+ .Default(AArch64::NoRegister);
+ }
+ if (RegNum == AArch64::NoRegister)
+ return false;
+
return true;
}
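The layout suffix now decides whether a v-register name resolves to its D (64-bit) or Q (128-bit) alias; a standalone sketch of that decision, with the suffix set copied from the StringSwitch above:

#include <set>
#include <string>

static bool layoutIs128Bit(const std::string &Layout) {
  // Suffixes treated as 128-bit above; ".1d", ".2s", ".4h" and ".8b" fall
  // through to the 64-bit case.
  static const std::set<std::string> Q128 = {".q", ".1q", ".d", ".2d", ".s",
                                             ".4s", ".h", ".8h", ".b", ".16b"};
  return Q128.count(Layout) != 0;
}
// With this, "v0.16b" resolves to AArch64::Q0 while "v0.8b" resolves to AArch64::D0.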
@@ -1606,6 +1738,7 @@ AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
case 'h': NumLanes = 8; break;
case 's': NumLanes = 4; break;
case 'd': NumLanes = 2; break;
+ case 'q': NumLanes = 1; break;
}
}
@@ -1824,6 +1957,148 @@ AArch64AsmParser::ParseShiftExtend(
return MatchOperand_Success;
}
+/// Try to parse a vector register token. If it is a vector register, the
+/// token is eaten and true is returned; otherwise false is returned.
+bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
+ StringRef &Layout, SMLoc &LayoutLoc) {
+ bool IsVector = true;
+
+ if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
+ IsVector = false;
+ else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID]
+ .contains(RegNum) &&
+ !AArch64MCRegisterClasses[AArch64::FPR128RegClassID]
+ .contains(RegNum))
+ IsVector = false;
+ else if (Layout.size() == 0)
+ IsVector = false;
+
+ if (!IsVector)
+ Error(Parser.getTok().getLoc(), "expected vector type register");
+
+ Parser.Lex(); // Eat this token.
+ return IsVector;
+}
+
+
+// A vector list contains 1 to 4 consecutive registers.
+// When the list holds more than one vector, two forms are accepted:
+//   (1) {Vn.layout, Vn+1.layout, ..., Vm.layout}
+//   (2) {Vn.layout - Vm.layout}
+// If the layout is .b/.h/.s/.d, a lane index may also be parsed.
+AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ if (Parser.getTok().isNot(AsmToken::LCurly)) {
+ Error(Parser.getTok().getLoc(), "'{' expected");
+ return MatchOperand_ParseFail;
+ }
+ SMLoc SLoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat '{' token.
+
+ unsigned Reg, Count = 1;
+ StringRef LayoutStr;
+ SMLoc RegEndLoc, LayoutLoc;
+ if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
+ return MatchOperand_ParseFail;
+
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ Parser.Lex(); // Eat the minus.
+
+ unsigned Reg2;
+ StringRef LayoutStr2;
+ SMLoc RegEndLoc2, LayoutLoc2;
+ SMLoc RegLoc2 = Parser.getTok().getLoc();
+
+ if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
+ return MatchOperand_ParseFail;
+ unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
+
+ if (LayoutStr != LayoutStr2) {
+ Error(LayoutLoc2, "expected the same vector layout");
+ return MatchOperand_ParseFail;
+ }
+ if (Space == 0 || Space > 3) {
+ Error(RegLoc2, "invalid number of vectors");
+ return MatchOperand_ParseFail;
+ }
+
+ Count += Space;
+ } else {
+ unsigned LastReg = Reg;
+ while (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ unsigned Reg2;
+ StringRef LayoutStr2;
+ SMLoc RegEndLoc2, LayoutLoc2;
+ SMLoc RegLoc2 = Parser.getTok().getLoc();
+
+ if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
+ return MatchOperand_ParseFail;
+ unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
+ : (Reg2 + 32 - LastReg);
+ Count++;
+
+ // Consecutive registers must be exactly one apart and share the same layout,
+ // and the total count must not be greater than 4.
+ if (Space != 1) {
+ Error(RegLoc2, "invalid space between two vectors");
+ return MatchOperand_ParseFail;
+ }
+ if (LayoutStr != LayoutStr2) {
+ Error(LayoutLoc2, "expected the same vector layout");
+ return MatchOperand_ParseFail;
+ }
+ if (Count > 4) {
+ Error(RegLoc2, "invalid number of vectors");
+ return MatchOperand_ParseFail;
+ }
+
+ LastReg = Reg2;
+ }
+ }
+
+ if (Parser.getTok().isNot(AsmToken::RCurly)) {
+ Error(Parser.getTok().getLoc(), "'}' expected");
+ return MatchOperand_ParseFail;
+ }
+ SMLoc ELoc = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat '}' token.
+
+ A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
+ if (Count > 1) { // If count > 1, create vector list using super register.
+ bool IsVec64 = (Layout < A64Layout::VL_16B);
+ static unsigned SupRegIDs[3][2] = {
+ { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
+ { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
+ { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
+ };
+ unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)];
+ unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+ Reg = MRI->getMatchingSuperReg(Reg, Sub0,
+ &AArch64MCRegisterClasses[SupRegID]);
+ }
+ Operands.push_back(
+ AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
+
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ uint32_t NumLanes = 0;
+ switch(Layout) {
+ case A64Layout::VL_B : NumLanes = 16; break;
+ case A64Layout::VL_H : NumLanes = 8; break;
+ case A64Layout::VL_S : NumLanes = 4; break;
+ case A64Layout::VL_D : NumLanes = 2; break;
+ default:
+ SMLoc Loc = getLexer().getLoc();
+ Error(Loc, "expected comma before next operand");
+ return MatchOperand_ParseFail;
+ }
+ return ParseNEONLane(Operands, NumLanes);
+ } else {
+ return MatchOperand_Success;
+ }
+}
+
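A standalone sketch of the super-register class selection above: for example, "{v2.4s, v3.4s, v4.4s}" has Count == 3 with a 128-bit layout, so the QTriple class is chosen and getMatchingSuperReg maps Q2 to the Q2_Q3_Q4 tuple register. The plain-string model below only mirrors the SupRegIDs table:

static const char *supClassFor(unsigned Count, bool IsVec64) {
  // Rows are Count - 2 (pair/triple/quad); column 0 is the Q class, column 1 the D class.
  static const char *Names[3][2] = {
      {"QPair",   "DPair"},
      {"QTriple", "DTriple"},
      {"QQuad",   "DQuad"},
  };
  return Names[Count - 2][IsVec64 ? 1 : 0];
}
// supClassFor(3, /*IsVec64=*/false) == "QTriple"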
// FIXME: We would really like to be able to tablegen'erate this.
bool AArch64AsmParser::
validateInstruction(MCInst &Inst,
@@ -2240,6 +2515,30 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_Width64:
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
"expected integer in range [<lsb>, 63]");
+ case Match_ShrImm8:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [1, 8]");
+ case Match_ShrImm16:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [1, 16]");
+ case Match_ShrImm32:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [1, 32]");
+ case Match_ShrImm64:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [1, 64]");
+ case Match_ShlImm8:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [0, 7]");
+ case Match_ShlImm16:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [0, 15]");
+ case Match_ShlImm32:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [0, 31]");
+ case Match_ShlImm64:
+ return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+ "expected integer in range [0, 63]");
}
llvm_unreachable("Implement any new match types added!");
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index a88a8e8..be4d7f2 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -82,15 +82,38 @@ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder);
static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
unsigned OptionHiS,
@@ -113,6 +136,30 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+
template<int RegWidth>
static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
unsigned FullImm,
@@ -183,6 +230,17 @@ static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In);
@@ -331,6 +389,14 @@ DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus
+DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+
+ return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
+}
static DecodeStatus
DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
@@ -343,28 +409,80 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- if (RegNo > 31)
+static DecodeStatus
+DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 15)
return MCDisassembler::Fail;
- uint16_t Register = getReg(Decoder, AArch64::VPR64RegClassID, RegNo);
+ return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 30)
+ return MCDisassembler::Fail;
+
+ uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
Inst.addOperand(MCOperand::CreateReg(Register));
return MCDisassembler::Success;
}
-static DecodeStatus
-DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
+ unsigned RegID,
+ const void *Decoder) {
if (RegNo > 31)
return MCDisassembler::Fail;
- uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo);
+ uint16_t Register = getReg(Decoder, RegID, RegNo);
Inst.addOperand(MCOperand::CreateReg(Register));
return MCDisassembler::Success;
}
+static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
+ Decoder);
+}
+
+static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
+ Decoder);
+}
+
+static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder) {
+ return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
+ Decoder);
+}
+
+static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder) {
+ return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
+ Decoder);
+}
+
+static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
+ Decoder);
+}
+
+static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
+ Decoder);
+}
+
static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
unsigned OptionHiS,
uint64_t Address,
@@ -413,7 +531,73 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(8 - Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(16 - Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(32 - Val));
+ return MCDisassembler::Success;
+}
+static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(64 - Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ if (Val > 7)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ if (Val > 15)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ if (Val > 31)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
+ uint64_t Address,
+ const void *Decoder) {
+ if (Val > 63)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
template<int RegWidth>
static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
@@ -570,11 +754,11 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
unsigned IsToVec = fieldFromInstruction(Insn, 16, 1);
if (IsToVec) {
- DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
} else {
DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
- DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder);
+ DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
}
// Add the lane
@@ -838,3 +1022,551 @@ DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
return MCDisassembler::Success;
}
+
+// Decode post-index vector load/store instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal to the length of the vector list in bytes;
+// otherwise Rm is decoded as a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+ unsigned Opcode = fieldFromInstruction(Insn, 12, 4);
+ unsigned IsLoad = fieldFromInstruction(Insn, 22, 1);
+ // 0 for a 64-bit vector list, 1 for a 128-bit vector list
+ unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1);
+
+ unsigned NumVecs;
+ switch (Opcode) {
+ case 0: // ld4/st4
+ case 2: // ld1/st1 with 4 vectors
+ NumVecs = 4; break;
+ case 4: // ld3/st3
+ case 6: // ld1/st1 with 3 vectors
+ NumVecs = 3; break;
+ case 7: // ld1/st1 with 1 vector
+ NumVecs = 1; break;
+ case 8: // ld2/st2
+ case 10: // ld1/st1 with 2 vectors
+ NumVecs = 2; break;
+ default:
+ llvm_unreachable("Invalid opcode for post-index load/store instructions");
+ }
+
+ // Decode vector list of 1/2/3/4 vectors for load instructions.
+ if (IsLoad) {
+ switch (NumVecs) {
+ case 1:
+ Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ }
+ }
+
+ // Decode the write-back register, which is equal to Rn.
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+ if (Rm == 31) // If Rm is 0b11111, add the vector list length in bytes
+ Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8)));
+ else // Decode Rm
+ DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+ // Decode the vector list of 1/2/3/4 vectors for store instructions.
+ if (!IsLoad) {
+ switch (NumVecs) {
+ case 1:
+ Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ }
+ }
+
+ return MCDisassembler::Success;
+}
+
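When Rm is 0b11111 the post-increment becomes an immediate equal to the size of the register list; a standalone sketch of that arithmetic (the example instruction is illustrative):

// Post-increment in bytes synthesised by the decoder when Rm == 0b11111.
static unsigned postIncBytes(unsigned NumVecs, bool Is128BitVec) {
  return NumVecs * (Is128BitVec ? 16u : 8u);
}
// e.g. "ld1 {v0.4s, v1.4s}, [x0], #32": NumVecs = 2, 128-bit list -> 2 * 16 = 32.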
+// Decode post-index vector load/store lane instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal to the number of bytes transferred;
+// otherwise Rm is decoded as a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ bool Is64bitVec = false;
+ bool IsLoadDup = false;
+ bool IsLoad = false;
+ // The total number of bytes transferred.
+ // TransferBytes = NumVecs * OneLaneBytes
+ unsigned TransferBytes = 0;
+ unsigned NumVecs = 0;
+ unsigned Opc = Inst.getOpcode();
+ switch (Opc) {
+ case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+ case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+ case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+ case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+ TransferBytes = 1; break;
+ case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+ TransferBytes = 2; break;
+ case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+ TransferBytes = 4; break;
+ case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
+ TransferBytes = 8; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+ case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+ case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+ case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+ TransferBytes = 1; break;
+ case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+ TransferBytes = 2; break;
+ case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+ TransferBytes = 4; break;
+ case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
+ TransferBytes = 8; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+ case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+ case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+ case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+ TransferBytes = 2; break;
+ case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+ TransferBytes = 4; break;
+ case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+ TransferBytes = 8; break;
+ case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
+ TransferBytes = 16; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+ case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+ case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+ case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+ TransferBytes = 2; break;
+ case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+ TransferBytes = 4; break;
+ case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+ TransferBytes = 8; break;
+ case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
+ TransferBytes = 16; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+ case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+ case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+ case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
+ TransferBytes = 24; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+ case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
+ case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
+ case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
+ TransferBytes = 24; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+ case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+ case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+ case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+ TransferBytes = 4; break;
+ case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+ TransferBytes = 8; break;
+ case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+ TransferBytes = 16; break;
+ case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
+ TransferBytes = 32; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 4;
+ break;
+ }
+
+ case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+ case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register:
+ case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register:
+ case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+ TransferBytes = 4; break;
+ case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
+ TransferBytes = 8; break;
+ case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
+ TransferBytes = 16; break;
+ case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
+ TransferBytes = 32; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 4;
+ break;
+ }
+
+ case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+ case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+ case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+ case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+ TransferBytes = 1; break;
+ case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+ TransferBytes = 2; break;
+ case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+ TransferBytes = 4; break;
+ case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
+ TransferBytes = 8; break;
+ }
+ IsLoad = true;
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+ case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+ case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+ case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+ TransferBytes = 2; break;
+ case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+ TransferBytes = 4; break;
+ case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+ TransferBytes = 8; break;
+ case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
+ TransferBytes = 16; break;
+ }
+ IsLoad = true;
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+ case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+ case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+ case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
+ TransferBytes = 24; break;
+ }
+ IsLoad = true;
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+ case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+ case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+ case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+ TransferBytes = 4; break;
+ case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+ TransferBytes = 8; break;
+ case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+ TransferBytes = 16; break;
+ case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
+ TransferBytes = 32; break;
+ }
+ IsLoad = true;
+ NumVecs = 4;
+ break;
+ }
+
+ case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+ case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+ case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+ case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+ TransferBytes = 1; break;
+ case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+ TransferBytes = 2; break;
+ case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+ TransferBytes = 4; break;
+ case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
+ TransferBytes = 8; break;
+ }
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+ case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+ case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+ case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+ TransferBytes = 2; break;
+ case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+ TransferBytes = 4; break;
+ case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+ TransferBytes = 8; break;
+ case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
+ TransferBytes = 16; break;
+ }
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+ case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+ case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+ case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+ TransferBytes = 3; break;
+ case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+ TransferBytes = 6; break;
+ case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+ TransferBytes = 12; break;
+ case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
+ TransferBytes = 24; break;
+ }
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+ case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+ case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+ case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+ TransferBytes = 4; break;
+ case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+ TransferBytes = 8; break;
+ case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+ TransferBytes = 16; break;
+ case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
+ TransferBytes = 32; break;
+ }
+ NumVecs = 4;
+ break;
+ }
+
+ default:
+ return MCDisassembler::Fail;
+ } // End of switch (Opc)
+
+ unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+
+ // Decode post-index of load duplicate lane
+ if (IsLoadDup) {
+ switch (NumVecs) {
+ case 1:
+ Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+ }
+
+ // Decode the write-back register, which is equal to Rn.
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+ if (Rm == 31) // If Rm is 0b11111, add the number of transferred bytes
+ Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+ else // Decode Rm
+ DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+ return MCDisassembler::Success;
+ }
+
+ // Decode post-index of load/store lane
+ // Loads have a vector list as output.
+ if (IsLoad) {
+ switch (NumVecs) {
+ case 1:
+ DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+ }
+ }
+
+ // Decode the write-back register, which is equal to Rn.
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+ if (Rm == 31) // If Rm is 0b11111, add the number of transferred bytes
+ Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+ else // Decode Rm
+ DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+ // Decode the source vector list.
+ switch (NumVecs) {
+ case 1:
+ DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+ }
+
+ // Decode lane
+ unsigned Q = fieldFromInstruction(Insn, 30, 1);
+ unsigned S = fieldFromInstruction(Insn, 10, 3);
+ unsigned lane = 0;
+ // Calculate the number of lanes from the number of vectors and the
+ // transferred bytes: NumLanes = 16 bytes / bytes per lane.
+ unsigned NumLanes = 16 / (TransferBytes / NumVecs);
+ switch (NumLanes) {
+ case 16: // A vector has 16 lanes, each lane is 1 byte.
+ lane = (Q << 3) | S;
+ break;
+ case 8:
+ lane = (Q << 2) | (S >> 1);
+ break;
+ case 4:
+ lane = (Q << 1) | (S >> 2);
+ break;
+ case 2:
+ lane = Q;
+ break;
+ }
+ Inst.addOperand(MCOperand::CreateImm(lane));
+
+ return MCDisassembler::Success;
+}
+
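A standalone sketch of the lane-index reconstruction above, driven by the Q and S fields and the per-lane size:

static unsigned decodeLane(unsigned Q, unsigned S, unsigned TransferBytes,
                           unsigned NumVecs) {
  unsigned NumLanes = 16 / (TransferBytes / NumVecs); // 16-byte register / lane size
  switch (NumLanes) {
  case 16: return (Q << 3) | S;        // byte lanes
  case 8:  return (Q << 2) | (S >> 1); // halfword lanes
  case 4:  return (Q << 1) | (S >> 2); // word lanes
  default: return Q;                   // doubleword lanes
  }
}
// e.g. an LD2 of .h lanes has TransferBytes = 4 and NumVecs = 2, so NumLanes = 8,
// and Q = 1, S = 0b010 reconstructs lane index 5.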
+static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+ unsigned size = fieldFromInstruction(Insn, 22, 2);
+ unsigned Q = fieldFromInstruction(Insn, 30, 1);
+
+ DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+
+ if(Q)
+ DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+ else
+ DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder);
+
+ switch (size) {
+ case 0:
+ Inst.addOperand(MCOperand::CreateImm(8));
+ break;
+ case 1:
+ Inst.addOperand(MCOperand::CreateImm(16));
+ break;
+ case 2:
+ Inst.addOperand(MCOperand::CreateImm(32));
+ break;
+ default :
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+}
+
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b624331..0438de3 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -368,6 +368,14 @@ AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum,
O << "#" << (Imm * MemScale);
}
+void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned Reg = MI->getOperand(OpNo).getReg();
+ std::string Name = getRegisterName(Reg);
+ Name[0] = 'v';
+ O << Name;
+}
+
void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -454,8 +462,8 @@ void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum,
o << "#0x0";
}
-void AArch64InstPrinter::printNeonUImm8Operand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
+void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
const MCOperand &MOUImm = MI->getOperand(OpNum);
assert(MOUImm.isImm() &&
@@ -467,6 +475,18 @@ void AArch64InstPrinter::printNeonUImm8Operand(const MCInst *MI, unsigned OpNum,
O.write_hex(Imm);
}
+void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MOUImm = MI->getOperand(OpNum);
+
+ assert(MOUImm.isImm()
+ && "Immediate operand required for Neon vector immediate inst.");
+
+ unsigned Imm = MOUImm.getImm();
+ O << Imm;
+}
+
void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
unsigned OpNum,
raw_ostream &O) {
@@ -487,3 +507,33 @@ void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
O << "#0x";
O.write_hex(Mask);
}
+
+// If Count > 1, there are two valid forms of vector list:
+//   (1) {Vn.layout, Vn+1.layout, ..., Vm.layout}
+//   (2) {Vn.layout - Vm.layout}
+// The first form is used for output.
+template <A64Layout::VectorLayout Layout, unsigned Count>
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors");
+
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ std::string LayoutStr = A64VectorLayoutToString(Layout);
+ O << "{";
+ if (Count > 1) { // Print sub registers separately
+ bool IsVec64 = (Layout < A64Layout::VL_16B);
+ unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
+ for (unsigned I = 0; I < Count; I++) {
+ std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
+ Name[0] = 'v';
+ O << Name << LayoutStr;
+ if (I != Count - 1)
+ O << ", ";
+ }
+ } else { // Print the register directly when NumVecs is 1.
+ std::string Name = getRegisterName(Reg);
+ Name[0] = 'v';
+ O << Name << LayoutStr;
+ }
+ O << "}";
+}
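A standalone sketch of the expanded output form chosen above (it ignores the wrap-around case where a list crosses v31):

#include <sstream>
#include <string>

static std::string formatVectorList(unsigned FirstIdx, unsigned Count,
                                    const std::string &Layout) {
  std::ostringstream OS;
  OS << "{";
  for (unsigned I = 0; I != Count; ++I) {
    OS << "v" << (FirstIdx + I) << Layout;
    if (I != Count - 1)
      OS << ", ";
  }
  OS << "}";
  return OS.str();
}
// formatVectorList(2, 2, ".4s") -> "{v2.4s, v3.4s}"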
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index f7439be..37b7273 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -157,6 +157,7 @@ public:
void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O, A64SE::ShiftExtSpecifiers Ext);
+ void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
@@ -168,9 +169,13 @@ public:
void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printNeonUImm8Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
+
+ template <A64Layout::VectorLayout Layout, unsigned Count>
+ void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
};
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a3373b1..8a9077c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -578,8 +578,8 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
}
MCAsmBackend *
-llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+llvm::createAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
-
return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS());
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 104e4d2..a64c463 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -55,11 +55,10 @@ namespace {
/// by MachO. Beware!
class AArch64ELFStreamer : public MCELFStreamer {
public:
- AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS, MCCodeEmitter *Emitter)
- : MCELFStreamer(Context, TAB, OS, Emitter),
- MappingSymbolCounter(0), LastEMS(EMS_None) {
- }
+ AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+ MCCodeEmitter *Emitter)
+ : MCELFStreamer(Context, 0, TAB, OS, Emitter), MappingSymbolCounter(0),
+ LastEMS(EMS_None) {}
~AArch64ELFStreamer() {}
@@ -129,7 +128,7 @@ private:
MCELF::SetType(SD, ELF::STT_NOTYPE);
MCELF::SetBinding(SD, ELF::STB_LOCAL);
SD.setExternal(false);
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
Symbol->setVariableValue(Value);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 8ec8cbf..add874c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -31,11 +31,12 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() {
UseDataRegionDirectives = true;
- WeakRefDirective = "\t.weak\t";
-
HasLEB128 = true;
SupportsDebugInformation = true;
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
}
+
+// Pin the vtable to this file.
+void AArch64ELFMCAsmInfo::anchor() {}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index a20bc47..d1dd285 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -14,13 +14,15 @@
#ifndef LLVM_AARCH64TARGETASMINFO_H
#define LLVM_AARCH64TARGETASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- struct AArch64ELFMCAsmInfo : public MCAsmInfo {
- explicit AArch64ELFMCAsmInfo();
- };
+struct AArch64ELFMCAsmInfo : public MCAsmInfoELF {
+ explicit AArch64ELFMCAsmInfo();
+private:
+ virtual void anchor();
+};
} // namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index b9770b3..b41c566 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -59,6 +59,23 @@ public:
unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftRightImm8(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftRightImm16(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftRightImm32(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftRightImm64(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
// Labels are handled mostly the same way: a symbol is needed, and
// just gets some fixup attached.
@@ -310,6 +327,45 @@ AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6;
}
+unsigned AArch64MCCodeEmitter::getShiftRightImm8(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm16(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm32(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm64(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return 64 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm8(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return MI.getOperand(Op).getImm() - 8;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm16(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return MI.getOperand(Op).getImm() - 16;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm32(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return MI.getOperand(Op).getImm() - 32;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm64(
+ const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+ return MI.getOperand(Op).getImm() - 64;
+}
template<AArch64::Fixups fixupDesired> unsigned
AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
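
The new getShiftRightImm*/getShiftLeftImm* hooks above are pure arithmetic on the instruction operand: right-shift immediates are stored as the element size minus the shift, and the left-shift variants store the operand minus the element size. A standalone restatement of the right-shift case (an illustrative sketch, not LLVM API):

    // For an N-bit element, a vector shift-right by S is encoded as N - S,
    // mirroring getShiftRightImm8/16/32/64 above; e.g. "#3" on an 8-bit
    // element is stored as 8 - 3 = 5.
    static unsigned encodeShiftRightImm(unsigned ElemBits, unsigned Shift) {
      return ElemBits - Shift;
    }
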
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 3849fe3..670e657 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -43,8 +43,9 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
uint8_t OSABI);
-MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT,
- StringRef CPU);
+MCAsmBackend *createAArch64AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
} // End llvm namespace
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index e675efc..ce970b0 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -306,6 +306,65 @@ namespace A64SE {
};
}
+namespace A64Layout {
+ enum VectorLayout {
+ Invalid = -1,
+ VL_8B,
+ VL_4H,
+ VL_2S,
+ VL_1D,
+
+ VL_16B,
+ VL_8H,
+ VL_4S,
+ VL_2D,
+
+ // Bare layout for the 128-bit vector
+ // (only show ".b", ".h", ".s", ".d" without vector number)
+ VL_B,
+ VL_H,
+ VL_S,
+ VL_D
+ };
+}
+
+inline static const char *
+A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
+ switch (Layout) {
+ case A64Layout::VL_8B: return ".8b";
+ case A64Layout::VL_4H: return ".4h";
+ case A64Layout::VL_2S: return ".2s";
+ case A64Layout::VL_1D: return ".1d";
+ case A64Layout::VL_16B: return ".16b";
+ case A64Layout::VL_8H: return ".8h";
+ case A64Layout::VL_4S: return ".4s";
+ case A64Layout::VL_2D: return ".2d";
+ case A64Layout::VL_B: return ".b";
+ case A64Layout::VL_H: return ".h";
+ case A64Layout::VL_S: return ".s";
+ case A64Layout::VL_D: return ".d";
+ default: llvm_unreachable("Unknown Vector Layout");
+ }
+}
+
+inline static A64Layout::VectorLayout
+A64StringToVectorLayout(StringRef LayoutStr) {
+ return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
+ .Case(".8b", A64Layout::VL_8B)
+ .Case(".4h", A64Layout::VL_4H)
+ .Case(".2s", A64Layout::VL_2S)
+ .Case(".1d", A64Layout::VL_1D)
+ .Case(".16b", A64Layout::VL_16B)
+ .Case(".8h", A64Layout::VL_8H)
+ .Case(".4s", A64Layout::VL_4S)
+ .Case(".2d", A64Layout::VL_2D)
+ .Case(".b", A64Layout::VL_B)
+ .Case(".h", A64Layout::VL_H)
+ .Case(".s", A64Layout::VL_S)
+ .Case(".d", A64Layout::VL_D)
+ .Default(A64Layout::Invalid);
+}
+
namespace A64SysReg {
enum SysRegROValues {
MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000
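
The A64Layout helpers above give a simple string round-trip for NEON register suffixes. A minimal usage sketch (assumes AArch64BaseInfo.h is included; illustrative only):

    // ".4s" parses to VL_4S, and printing it recovers the same suffix.
    A64Layout::VectorLayout L = A64StringToVectorLayout(".4s");
    assert(L == A64Layout::VL_4S);
    assert(StringRef(A64VectorLayoutToString(L)) == ".4s");

    // Unknown suffixes map to A64Layout::Invalid rather than asserting.
    assert(A64StringToVectorLayout(".3h") == A64Layout::Invalid);
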
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index e8c2f7c..ff585b4 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -657,6 +657,13 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
Modified = true;
for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
E = Uses.end(); I != E; ++I) {
+ // Make sure to constrain the register class of the new register to
+ // match what we're replacing. Otherwise we can optimize a DPR_VFP2
+ // reference into a plain DPR, and that will end poorly. NewReg is
+ // always virtual here, so there will always be a matching subclass
+ // to find.
+ MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
+
DEBUG(dbgs() << "Replacing operand "
<< **I << " with "
<< PrintReg(NewReg) << "\n");
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index e5da3a5..36e5680 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -45,7 +45,7 @@ def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
"Enable VFP4 instructions",
[FeatureVFP3, FeatureFP16]>;
-def FeatureV8FP : SubtargetFeature<"v8fp", "HasV8FP",
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
"true", "Enable ARMv8 FP",
[FeatureVFP4]>;
def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
@@ -67,6 +67,11 @@ def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable support for Performance Monitor extensions">;
def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
"Enable support for TrustZone security extensions">;
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+ "Enable support for Cryptography extensions",
+ [FeatureNEON]>;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+ "Enable support for CRC instructions">;
// Some processors have FP multiply-accumulate instructions that don't
// play nicely with other VFP / NEON instructions, and it's generally better
@@ -114,10 +119,24 @@ def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
"Supports Multiprocessing extension">;
-// M-series ISA?
-def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true",
+// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
+def FeatureVirtualization : SubtargetFeature<"virtualization",
+ "HasVirtualization", "true",
+ "Supports Virtualization extension",
+ [FeatureHWDiv, FeatureHWDivARM]>;
+
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
"Is microcontroller profile ('M' series)">;
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+ "Is realtime profile ('R' series)">;
+
+// A-series ISA
+def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
+ "Is application profile ('A' series)">;
+
// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
// See ARMInstrInfo.td for details.
def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
@@ -135,15 +154,19 @@ def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true",
def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
"Support ARM v6 instructions",
[HasV5TEOps]>;
+def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true",
+ "Support ARM v6M instructions",
+ [HasV6Ops]>;
def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
"Support ARM v6t2 instructions",
- [HasV6Ops, FeatureThumb2]>;
+ [HasV6MOps, FeatureThumb2]>;
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
[HasV6T2Ops, FeaturePerfMon]>;
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops]>;
+ [HasV7Ops, FeatureVirtualization,
+ FeatureMP]>;
//===----------------------------------------------------------------------===//
// ARM Processors supported.
@@ -179,9 +202,23 @@ def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
// FIXME: It has not been determined if A15 has these features.
def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
"Cortex-A15 ARM processors",
- [FeatureT2XtPk, FeatureFP16, FeatureVFP4,
+ [FeatureT2XtPk, FeatureVFP4,
+ FeatureMP, FeatureHWDiv, FeatureHWDivARM,
FeatureAvoidPartialCPSR,
- FeatureTrustZone]>;
+ FeatureTrustZone, FeatureVirtualization]>;
+
+def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+ "Cortex-A53 ARM processors",
+ [FeatureHWDiv, FeatureHWDivARM,
+ FeatureTrustZone, FeatureT2XtPk,
+ FeatureCrypto, FeatureCRC]>;
+
+def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+ "Cortex-A57 ARM processors",
+ [FeatureHWDiv, FeatureHWDivARM,
+ FeatureTrustZone, FeatureT2XtPk,
+ FeatureCrypto, FeatureCRC]>;
+
def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
"Cortex-R5 ARM processors",
[FeatureSlowFPBrcc,
@@ -243,7 +280,7 @@ def : Processor<"mpcore", ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
FeatureHasSlowFPVMLx]>;
// V6M Processors.
-def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6Ops, FeatureNoARM,
+def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
FeatureDB, FeatureMClass]>;
// V6T2 Processors.
@@ -258,26 +295,30 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
def : ProcessorModel<"cortex-a5", CortexA8Model,
[ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
FeatureVFP4, FeatureDSPThumb2,
- FeatureHasRAS]>;
+ FeatureHasRAS, FeatureAClass]>;
def : ProcessorModel<"cortex-a8", CortexA8Model,
[ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS]>;
+ FeatureDSPThumb2, FeatureHasRAS,
+ FeatureAClass]>;
def : ProcessorModel<"cortex-a9", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS]>;
+ FeatureDSPThumb2, FeatureHasRAS,
+ FeatureAClass]>;
def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureMP,
- FeatureHasRAS]>;
+ FeatureHasRAS, FeatureAClass]>;
// FIXME: A15 has currently the same ProcessorModel as A9.
def : ProcessorModel<"cortex-a15", CortexA9Model,
[ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
- FeatureDSPThumb2, FeatureHasRAS]>;
+ FeatureDSPThumb2, FeatureHasRAS,
+ FeatureAClass]>;
// FIXME: R5 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r5", CortexA8Model,
[ProcR5, HasV7Ops, FeatureDB,
FeatureVFP3, FeatureDSPThumb2,
- FeatureHasRAS]>;
+ FeatureHasRAS, FeatureVFPOnlySP,
+ FeatureD16, FeatureRClass]>;
// V7M Processors.
def : ProcNoItin<"cortex-m3", [HasV7Ops,
@@ -289,16 +330,22 @@ def : ProcNoItin<"cortex-m4", [HasV7Ops,
FeatureThumb2, FeatureNoARM, FeatureDB,
FeatureHWDiv, FeatureDSPThumb2,
FeatureT2XtPk, FeatureVFP4,
- FeatureVFPOnlySP, FeatureMClass]>;
+ FeatureVFPOnlySP, FeatureD16,
+ FeatureMClass]>;
// Swift uArch Processors.
def : ProcessorModel<"swift", SwiftModel,
[ProcSwift, HasV7Ops, FeatureNEON,
FeatureDB, FeatureDSPThumb2,
- FeatureHasRAS]>;
+ FeatureHasRAS, FeatureAClass]>;
// V8 Processors
-def : ProcNoItin<"cortex-a53", [HasV8Ops]>;
+def : ProcNoItin<"cortex-a53", [ProcA53, HasV8Ops, FeatureAClass,
+ FeatureDB, FeatureFPARMv8,
+ FeatureNEON, FeatureDSPThumb2]>;
+def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass,
+ FeatureDB, FeatureFPARMv8,
+ FeatureNEON, FeatureDSPThumb2]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 13a22b1..e79f88d 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -17,6 +17,7 @@
#include "ARM.h"
#include "ARMBuildAttrs.h"
#include "ARMConstantPoolValue.h"
+#include "ARMFPUName.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
@@ -55,164 +56,6 @@
#include <cctype>
using namespace llvm;
-namespace {
-
- // Per section and per symbol attributes are not supported.
- // To implement them we would need the ability to delay this emission
- // until the assembly file is fully parsed/generated as only then do we
- // know the symbol and section numbers.
- class AttributeEmitter {
- public:
- virtual void MaybeSwitchVendor(StringRef Vendor) = 0;
- virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0;
- virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0;
- virtual void Finish() = 0;
- virtual ~AttributeEmitter() {}
- };
-
- class AsmAttributeEmitter : public AttributeEmitter {
- MCStreamer &Streamer;
-
- public:
- AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {}
- void MaybeSwitchVendor(StringRef Vendor) { }
-
- void EmitAttribute(unsigned Attribute, unsigned Value) {
- Streamer.EmitRawText("\t.eabi_attribute " +
- Twine(Attribute) + ", " + Twine(Value));
- }
-
- void EmitTextAttribute(unsigned Attribute, StringRef String) {
- switch (Attribute) {
- default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
- case ARMBuildAttrs::CPU_name:
- Streamer.EmitRawText(StringRef("\t.cpu ") + String.lower());
- break;
- /* GAS requires .fpu to be emitted regardless of EABI attribute */
- case ARMBuildAttrs::Advanced_SIMD_arch:
- case ARMBuildAttrs::VFP_arch:
- Streamer.EmitRawText(StringRef("\t.fpu ") + String.lower());
- break;
- }
- }
- void Finish() { }
- };
-
- class ObjectAttributeEmitter : public AttributeEmitter {
- // This structure holds all attributes, accounting for
- // their string/numeric value, so we can later emmit them
- // in declaration order, keeping all in the same vector
- struct AttributeItemType {
- enum {
- HiddenAttribute = 0,
- NumericAttribute,
- TextAttribute
- } Type;
- unsigned Tag;
- unsigned IntValue;
- StringRef StringValue;
- } AttributeItem;
-
- MCObjectStreamer &Streamer;
- StringRef CurrentVendor;
- SmallVector<AttributeItemType, 64> Contents;
-
- // Account for the ULEB/String size of each item,
- // not just the number of items
- size_t ContentsSize;
- // FIXME: this should be in a more generic place, but
- // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
- size_t getULEBSize(int Value) {
- size_t Size = 0;
- do {
- Value >>= 7;
- Size += sizeof(int8_t); // Is this really necessary?
- } while (Value);
- return Size;
- }
-
- public:
- ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
- Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { }
-
- void MaybeSwitchVendor(StringRef Vendor) {
- assert(!Vendor.empty() && "Vendor cannot be empty.");
-
- if (CurrentVendor.empty())
- CurrentVendor = Vendor;
- else if (CurrentVendor == Vendor)
- return;
- else
- Finish();
-
- CurrentVendor = Vendor;
-
- assert(Contents.size() == 0);
- }
-
- void EmitAttribute(unsigned Attribute, unsigned Value) {
- AttributeItemType attr = {
- AttributeItemType::NumericAttribute,
- Attribute,
- Value,
- StringRef("")
- };
- ContentsSize += getULEBSize(Attribute);
- ContentsSize += getULEBSize(Value);
- Contents.push_back(attr);
- }
-
- void EmitTextAttribute(unsigned Attribute, StringRef String) {
- AttributeItemType attr = {
- AttributeItemType::TextAttribute,
- Attribute,
- 0,
- String
- };
- ContentsSize += getULEBSize(Attribute);
- // String + \0
- ContentsSize += String.size()+1;
-
- Contents.push_back(attr);
- }
-
- void Finish() {
- // Vendor size + Vendor name + '\0'
- const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
-
- // Tag + Tag Size
- const size_t TagHeaderSize = 1 + 4;
-
- Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
- Streamer.EmitBytes(CurrentVendor);
- Streamer.EmitIntValue(0, 1); // '\0'
-
- Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
- Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
-
- // Size should have been accounted for already, now
- // emit each field as its type (ULEB or String)
- for (unsigned int i=0; i<Contents.size(); ++i) {
- AttributeItemType item = Contents[i];
- Streamer.EmitULEB128IntValue(item.Tag);
- switch (item.Type) {
- default: llvm_unreachable("Invalid attribute type");
- case AttributeItemType::NumericAttribute:
- Streamer.EmitULEB128IntValue(item.IntValue);
- break;
- case AttributeItemType::TextAttribute:
- Streamer.EmitBytes(item.StringValue.upper());
- Streamer.EmitIntValue(0, 1); // '\0'
- break;
- }
- }
-
- Contents.clear();
- }
- };
-
-} // end of anonymous namespace
-
/// EmitDwarfRegOp - Emit dwarf register operation.
void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc,
bool Indirect) const {
@@ -302,7 +145,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
assert(GV && "C++ constructor pointer was not a GlobalValue!");
- const MCExpr *E = MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+ const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV),
(Subtarget->isTargetDarwin()
? MCSymbolRefExpr::VK_None
: MCSymbolRefExpr::VK_ARM_TARGET1),
@@ -363,7 +206,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
(TF & ARMII::MO_HI16))
O << ":upper16:";
- O << *Mang->getSymbol(GV);
+ O << *getSymbol(GV);
printOffset(MO.getOffset(), O);
if (TF == ARMII::MO_PLT)
@@ -496,6 +339,23 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
if (!FlagsOP.isImm())
return true;
unsigned Flags = FlagsOP.getImm();
+
+ // This operand may not be the one that actually provides the register. If
+ // it's tied to a previous one then we should refer instead to that one
+ // for registers and their classes.
+ unsigned TiedIdx;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) {
+ for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) {
+ unsigned OpFlags = MI->getOperand(OpNum).getImm();
+ OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
+ Flags = MI->getOperand(OpNum).getImm();
+
+ // Later code expects OpNum to be pointing at the register rather than
+ // the flags.
+ OpNum += 1;
+ }
+
unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
unsigned RC;
InlineAsm::hasRegClassConstraint(Flags, RC);
@@ -714,11 +574,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
// generates code that does this, it is always safe to set.
OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
- // FIXME: This should eventually end up somewhere else where more
- // intelligent flag decisions can be made. For now we are just maintaining
- // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
- if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&OutStreamer))
- MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
}
//===----------------------------------------------------------------------===//
@@ -728,150 +583,150 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
// to appear in the .ARM.attributes section in ELF.
// Instead of subclassing the MCELFStreamer, we do the work here.
-void ARMAsmPrinter::emitAttributes() {
-
- emitARMAttributeSection();
-
- /* GAS expect .fpu to be emitted, regardless of VFP build attribute */
- bool emitFPU = false;
- AttributeEmitter *AttrEmitter;
- if (OutStreamer.hasRawTextSupport()) {
- AttrEmitter = new AsmAttributeEmitter(OutStreamer);
- emitFPU = true;
- } else {
- MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer);
- AttrEmitter = new ObjectAttributeEmitter(O);
- }
-
- AttrEmitter->MaybeSwitchVendor("aeabi");
-
- std::string CPUString = Subtarget->getCPUString();
+static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
+ const ARMSubtarget *Subtarget) {
+ if (CPU == "xscale")
+ return ARMBuildAttrs::v5TEJ;
- if (CPUString == "cortex-a8" ||
- Subtarget->isCortexA8()) {
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8");
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::ApplicationProfile);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
- ARMBuildAttrs::Allowed);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::AllowThumb32);
- // Fixme: figure out when this is emitted.
- //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch,
- // ARMBuildAttrs::AllowWMMXv1);
- //
-
- /// ADD additional Else-cases here!
- } else if (CPUString == "xscale") {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
- ARMBuildAttrs::Allowed);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::Allowed);
- } else if (Subtarget->hasV8Ops())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v8);
+ if (Subtarget->hasV8Ops())
+ return ARMBuildAttrs::v8;
else if (Subtarget->hasV7Ops()) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::AllowThumb32);
+ if (Subtarget->isMClass() && Subtarget->hasThumb2DSP())
+ return ARMBuildAttrs::v7E_M;
+ return ARMBuildAttrs::v7;
} else if (Subtarget->hasV6T2Ops())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+ return ARMBuildAttrs::v6T2;
+ else if (Subtarget->hasV6MOps())
+ return ARMBuildAttrs::v6S_M;
else if (Subtarget->hasV6Ops())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+ return ARMBuildAttrs::v6;
else if (Subtarget->hasV5TEOps())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+ return ARMBuildAttrs::v5TE;
else if (Subtarget->hasV5TOps())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+ return ARMBuildAttrs::v5T;
else if (Subtarget->hasV4TOps())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+ return ARMBuildAttrs::v4T;
else
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4);
+ return ARMBuildAttrs::v4;
+}
- if (Subtarget->hasNEON() && emitFPU) {
- /* NEON is not exactly a VFP architecture, but GAS emit one of
- * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
- if (Subtarget->hasVFP4())
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
- "neon-vfpv4");
- else
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon");
- /* If emitted for NEON, omit from VFP below, since you can have both
- * NEON and VFP in build attributes but only one .fpu */
- emitFPU = false;
+void ARMAsmPrinter::emitAttributes() {
+ MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+ ATS.switchVendor("aeabi");
+
+ std::string CPUString = Subtarget->getCPUString();
+
+ if (CPUString != "generic")
+ ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch,
+ getArchForCPU(CPUString, Subtarget));
+
+ if (Subtarget->isAClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
+ } else if (Subtarget->isRClass()) {
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::RealTimeProfile);
+ } else if (Subtarget->isMClass()){
+ ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::MicroControllerProfile);
}
- /* V8FP + .fpu */
- if (Subtarget->hasV8FP()) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
- ARMBuildAttrs::AllowV8FPA);
- if (emitFPU)
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "v8fp");
- /* VFPv4 + .fpu */
- } else if (Subtarget->hasVFP4()) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
- ARMBuildAttrs::AllowFPv4A);
- if (emitFPU)
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv4");
-
- /* VFPv3 + .fpu */
- } else if (Subtarget->hasVFP3()) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
- ARMBuildAttrs::AllowFPv3A);
- if (emitFPU)
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3");
-
- /* VFPv2 + .fpu */
- } else if (Subtarget->hasVFP2()) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
- ARMBuildAttrs::AllowFPv2);
- if (emitFPU)
- AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2");
+ ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ?
+ ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed);
+ if (Subtarget->isThumb1Only()) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::Allowed);
+ } else if (Subtarget->hasThumb2()) {
+ ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumb32);
}
- /* TODO: ARMBuildAttrs::Allowed is not completely accurate,
- * since NEON can have 1 (allowed) or 2 (MAC operations) */
if (Subtarget->hasNEON()) {
- if (Subtarget->hasV8Ops())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
- ARMBuildAttrs::AllowedNeonV8);
+ /* NEON is not exactly a VFP architecture, but GAS emit one of
+ * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+ if (Subtarget->hasFPARMv8()) {
+ if (Subtarget->hasCrypto())
+ ATS.emitFPU(ARM::CRYPTO_NEON_FP_ARMV8);
+ else
+ ATS.emitFPU(ARM::NEON_FP_ARMV8);
+ }
+ else if (Subtarget->hasVFP4())
+ ATS.emitFPU(ARM::NEON_VFPV4);
else
- AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
- ARMBuildAttrs::Allowed);
+ ATS.emitFPU(ARM::NEON);
+ // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+ if (Subtarget->hasV8Ops())
+ ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeonARMv8);
+ } else {
+ if (Subtarget->hasFPARMv8())
+ ATS.emitFPU(ARM::FP_ARMV8);
+ else if (Subtarget->hasVFP4())
+ ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
+ else if (Subtarget->hasVFP3())
+ ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3);
+ else if (Subtarget->hasVFP2())
+ ATS.emitFPU(ARM::VFPV2);
}
// Signal various FP modes.
if (!TM.Options.UnsafeFPMath) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal,
- ARMBuildAttrs::Allowed);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
- ARMBuildAttrs::Allowed);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+ ARMBuildAttrs::Allowed);
}
if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
- ARMBuildAttrs::Allowed);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+ ARMBuildAttrs::Allowed);
else
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
- ARMBuildAttrs::AllowIEE754);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+ ARMBuildAttrs::AllowIEE754);
// FIXME: add more flags to ARMBuildAttrs.h
// 8-bytes alignment stuff.
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
+ ATS.emitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+
+ // ABI_HardFP_use attribute to indicate single precision FP.
+ if (Subtarget->isFPOnlySP())
+ ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+ ARMBuildAttrs::HardFPSinglePrecision);
// Hard float. Use both S and D registers and conform to AAPCS-VFP.
- if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) {
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3);
- AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1);
- }
+ if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+ ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
+
// FIXME: Should we signal R9 usage?
- if (Subtarget->hasDivide())
- AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1);
+ if (Subtarget->hasFP16())
+ ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+ if (Subtarget->hasMPExtension())
+ ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+ if (Subtarget->hasDivide()) {
+ // Check if hardware divide is only available in thumb2 or ARM as well.
+ ATS.emitAttribute(ARMBuildAttrs::DIV_use,
+ Subtarget->hasDivideInARMMode() ? ARMBuildAttrs::AllowDIVExt :
+ ARMBuildAttrs::AllowDIVIfExists);
+ }
- AttrEmitter->Finish();
- delete AttrEmitter;
+ if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
+ ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowTZVirtualization);
+ else if (Subtarget->hasTrustZone())
+ ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowTZ);
+ else if (Subtarget->hasVirtualization())
+ ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowVirtualization);
+
+ ATS.finishAttributeSection();
}
void ARMAsmPrinter::emitARMAttributeSection() {
@@ -923,7 +778,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
bool isIndirect = Subtarget->isTargetDarwin() &&
Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
if (!isIndirect)
- return Mang->getSymbol(GV);
+ return getSymbol(GV);
// FIXME: Remove this when Darwin transition to @GOT like syntax.
MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
@@ -934,7 +789,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
MMIMachO.getGVStubEntry(MCSym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
return MCSym;
}
@@ -1111,6 +966,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
assert(MI->getFlag(MachineInstr::FrameSetup) &&
"Only instruction which are involved into frame setup code are allowed");
+ MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
const MachineFunction &MF = *MI->getParent()->getParent();
const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
@@ -1173,7 +1030,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
RegList.push_back(SrcReg);
break;
}
- OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+ ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
} else {
// Changes of stack / frame pointer.
if (SrcReg == ARM::SP) {
@@ -1221,11 +1078,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
if (DstReg == FramePtr && FramePtr != ARM::SP)
// Set-up of the frame pointer. Positive values correspond to "add"
// instruction.
- OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset);
+ ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
else if (DstReg == ARM::SP) {
// Change of SP by an offset. Positive values correspond to "sub"
// instruction.
- OutStreamer.EmitPad(Offset);
+ ATS.emitPad(Offset);
} else {
MI->dump();
llvm_unreachable("Unsupported opcode for unwinding information");
@@ -1366,7 +1223,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addReg(0));
const GlobalValue *GV = MI->getOperand(0).getGlobal();
- MCSymbol *GVSym = Mang->getSymbol(GV);
+ MCSymbol *GVSym = getSymbol(GV);
const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
OutStreamer.EmitInstruction(MCInstBuilder(ARM::Bcc)
.addExpr(GVSymExpr)
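
The rewritten emitAttributes above picks a single .fpu name from the subtarget's FP features before emitting the remaining build attributes. A condensed restatement of that selection order (a sketch only; the string spellings are illustrative stand-ins for the ARM::* identifiers passed to ATS.emitFPU):

    static const char *pickFPUName(bool HasNEON, bool HasFPARMv8, bool HasCrypto,
                                   bool HasVFP4, bool HasVFP3, bool HasVFP2,
                                   bool HasD16) {
      if (HasNEON) {
        if (HasFPARMv8)
          return HasCrypto ? "crypto-neon-fp-armv8" : "neon-fp-armv8";
        return HasVFP4 ? "neon-vfpv4" : "neon";
      }
      if (HasFPARMv8) return "fp-armv8";
      if (HasVFP4)    return HasD16 ? "vfpv4-d16" : "vfpv4";
      if (HasVFP3)    return HasD16 ? "vfpv3-d16" : "vfpv3";
      if (HasVFP2)    return "vfpv2";
      return 0; // no FP unit: no .fpu directive is emitted
    }
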
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 977d936..f835a4e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -11,10 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMBaseInstrInfo.h"
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
+#include "ARMFeatures.h"
#include "ARMHazardRecognizer.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
@@ -36,7 +37,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "ARMGenInstrInfo.inc"
using namespace llvm;
@@ -520,11 +521,17 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
if (!MI->isPredicable())
return false;
- if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) {
- ARMFunctionInfo *AFI =
- MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
- return AFI->isThumb2Function();
+ ARMFunctionInfo *AFI =
+ MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+
+ if (AFI->isThumb2Function()) {
+ if (getSubtarget().restrictIT())
+ return isV8EligibleForIT(MI);
+ } else { // non-Thumb
+ if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+ return false;
}
+
return true;
}
@@ -645,16 +652,16 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
bool GPRDest = ARM::GPRRegClass.contains(DestReg);
- bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
+ bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
if (GPRDest && GPRSrc) {
AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc))));
+ .addReg(SrcReg, getKillRegState(KillSrc))));
return;
}
bool SPRDest = ARM::SPRRegClass.contains(DestReg);
- bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
+ bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
unsigned Opc = 0;
if (SPRDest && SPRSrc)
@@ -683,26 +690,47 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
int Spacing = 1;
// Use VORRq when possible.
- if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 2;
- else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 4;
+ if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VORRq;
+ BeginIdx = ARM::qsub_0;
+ SubRegs = 2;
+ } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VORRq;
+ BeginIdx = ARM::qsub_0;
+ SubRegs = 4;
// Fall back to VMOVD.
- else if (ARM::DPairRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2;
- else if (ARM::DTripleRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3;
- else if (ARM::DQuadRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4;
- else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg))
- Opc = ARM::MOVr, BeginIdx = ARM::gsub_0, SubRegs = 2;
-
- else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2;
- else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3, Spacing = 2;
- else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
+ } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VMOVD;
+ BeginIdx = ARM::dsub_0;
+ SubRegs = 2;
+ } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VMOVD;
+ BeginIdx = ARM::dsub_0;
+ SubRegs = 3;
+ } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VMOVD;
+ BeginIdx = ARM::dsub_0;
+ SubRegs = 4;
+ } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) {
+ Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr;
+ BeginIdx = ARM::gsub_0;
+ SubRegs = 2;
+ } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VMOVD;
+ BeginIdx = ARM::dsub_0;
+ SubRegs = 2;
+ Spacing = 2;
+ } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VMOVD;
+ BeginIdx = ARM::dsub_0;
+ SubRegs = 3;
+ Spacing = 2;
+ } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) {
+ Opc = ARM::VMOVD;
+ BeginIdx = ARM::dsub_0;
+ SubRegs = 4;
+ Spacing = 2;
+ }
assert(Opc && "Impossible reg-to-reg copy");
@@ -711,22 +739,21 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copy register tuples backward when the first Dest reg overlaps with SrcReg.
if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
- BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+ BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing);
Spacing = -Spacing;
}
#ifndef NDEBUG
SmallSet<unsigned, 4> DstRegs;
#endif
for (unsigned i = 0; i != SubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
- unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing);
+ unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+ unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
assert(Dst && Src && "Bad sub-register");
#ifndef NDEBUG
assert(!DstRegs.count(Src) && "destructive vector copy");
DstRegs.insert(Dst);
#endif
- Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
- .addReg(Src);
+ Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src);
// VORR takes two source operands.
if (Opc == ARM::VORRq)
Mov.addReg(Src);
@@ -1404,9 +1431,11 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case ARM::VLDRD:
case ARM::VLDRS:
case ARM::t2LDRi8:
+ case ARM::t2LDRBi8:
case ARM::t2LDRDi8:
case ARM::t2LDRSHi8:
case ARM::t2LDRi12:
+ case ARM::t2LDRBi12:
case ARM::t2LDRSHi12:
break;
}
@@ -1423,8 +1452,10 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case ARM::VLDRD:
case ARM::VLDRS:
case ARM::t2LDRi8:
+ case ARM::t2LDRBi8:
case ARM::t2LDRSHi8:
case ARM::t2LDRi12:
+ case ARM::t2LDRBi12:
case ARM::t2LDRSHi12:
break;
}
@@ -1471,7 +1502,16 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
if ((Offset2 - Offset1) / 8 > 64)
return false;
- if (Load1->getMachineOpcode() != Load2->getMachineOpcode())
+  // Check if the machine opcodes are different. If they are different, we
+  // treat the loads as having different base addresses, EXCEPT for Thumb2
+  // byte loads where one is t2LDRBi8 and the other t2LDRBi12: those are
+  // considered the same because they are just different encoding forms of
+  // the same basic instruction.

+ if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) &&
+ !((Load1->getMachineOpcode() == ARM::t2LDRBi8 &&
+ Load2->getMachineOpcode() == ARM::t2LDRBi12) ||
+ (Load1->getMachineOpcode() == ARM::t2LDRBi12 &&
+ Load2->getMachineOpcode() == ARM::t2LDRBi8)))
return false; // FIXME: overly conservative?
// Four loads in a row should be sufficient.
@@ -1686,7 +1726,7 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
bool PreferFalse) const {
assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
- const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
bool Invert = !DefMI;
if (!DefMI)
@@ -1694,11 +1734,17 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
if (!DefMI)
return 0;
+ // Find new register class to use.
+ MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+ unsigned DestReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+ if (!MRI.constrainRegClass(DestReg, PreviousClass))
+ return 0;
+
// Create a new predicated version of DefMI.
// Rfalse is the first use.
MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- DefMI->getDesc(),
- MI->getOperand(0).getReg());
+ DefMI->getDesc(), DestReg);
// Copy all the DefMI operands, excluding its (null) predicate.
const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1721,7 +1767,6 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
// register operand tied to the first def.
// The tie makes the register allocator ensure the FalseReg is allocated the
// same register as operand 0.
- MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
FalseReg.setImplicit();
NewMI.addOperand(FalseReg);
NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
@@ -1781,6 +1826,14 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
unsigned DestReg, unsigned BaseReg, int NumBytes,
ARMCC::CondCodes Pred, unsigned PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+ if (NumBytes == 0 && DestReg != BaseReg) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+ .setMIFlags(MIFlags);
+ return;
+ }
+
bool isSub = NumBytes < 0;
if (isSub) NumBytes = -NumBytes;
@@ -1804,6 +1857,115 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
}
}
+bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
+ MachineInstr *MI,
+ unsigned NumBytes) {
+  // This optimisation potentially adds lots of load and store
+  // micro-operations, so its only real benefit is to code size.
+ if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
+ return false;
+
+ // If only one register is pushed/popped, LLVM can use an LDR/STR
+ // instead. We can't modify those so make sure we're dealing with an
+ // instruction we understand.
+ bool IsPop = isPopOpcode(MI->getOpcode());
+ bool IsPush = isPushOpcode(MI->getOpcode());
+ if (!IsPush && !IsPop)
+ return false;
+
+ bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+ MI->getOpcode() == ARM::VLDMDIA_UPD;
+ bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+ MI->getOpcode() == ARM::tPOP ||
+ MI->getOpcode() == ARM::tPOP_RET;
+
+ assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+ MI->getOperand(1).getReg() == ARM::SP)) &&
+ "trying to fold sp update into non-sp-updating push/pop");
+
+  // The VFP push & pop act on D-registers, so we can only correctly fold in
+  // an adjustment that is a multiple of 8 bytes. Similarly, each rN slot is
+  // 4 bytes. Don't attempt the fold if this is violated.
+ if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+ return false;
+
+ // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+ // pred) so the list starts at 4. Thumb1 starts after the predicate.
+ int RegListIdx = IsT1PushPop ? 2 : 4;
+
+ // Calculate the space we'll need in terms of registers.
+ unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
+ unsigned RD0Reg, RegsNeeded;
+ if (IsVFPPushPop) {
+ RD0Reg = ARM::D0;
+ RegsNeeded = NumBytes / 8;
+ } else {
+ RD0Reg = ARM::R0;
+ RegsNeeded = NumBytes / 4;
+ }
+
+ // We're going to have to strip all list operands off before
+ // re-adding them since the order matters, so save the existing ones
+ // for later.
+ SmallVector<MachineOperand, 4> RegList;
+ for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+ RegList.push_back(MI->getOperand(i));
+
+ MachineBasicBlock *MBB = MI->getParent();
+ const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+ const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+ // Now try to find enough space in the reglist to allocate NumBytes.
+ for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
+ --CurReg) {
+ if (!IsPop) {
+ // Pushing any register is completely harmless, mark the
+ // register involved as undef since we don't care about it in
+ // the slightest.
+ RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+ false, false, true));
+ --RegsNeeded;
+ continue;
+ }
+
+ // However, we can only pop an extra register if it's not live. For
+ // registers live within the function we might clobber a return value
+ // register; the other way a register can be live here is if it's
+ // callee-saved.
+ if (isCalleeSavedRegister(CurReg, CSRegs) ||
+ MBB->computeRegisterLiveness(TRI, CurReg, MI) !=
+ MachineBasicBlock::LQR_Dead) {
+ // VFP pops don't allow holes in the register list, so any skip is fatal
+ // for our transformation. GPR pops do, so we should just keep looking.
+ if (IsVFPPushPop)
+ return false;
+ else
+ continue;
+ }
+
+ // Mark the unimportant registers as <def,dead> in the POP.
+ RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+ true));
+ --RegsNeeded;
+ }
+
+ if (RegsNeeded > 0)
+ return false;
+
+ // Finally we know we can profitably perform the optimisation so go
+ // ahead: strip all existing registers off and add them back again
+ // in the right order.
+ for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+ MI->RemoveOperand(i);
+
+ // Add the complete list back in.
+ MachineInstrBuilder MIB(MF, &*MI);
+ for (int i = RegList.size() - 1; i >= 0; --i)
+ MIB.addOperand(RegList[i]);
+
+ return true;
+}
+
bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) {
@@ -2210,8 +2372,32 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
isSafe = true;
break;
}
- // Condition code is after the operand before CPSR.
- ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm();
+ // Condition code is after the operand before CPSR except for VSELs.
+ ARMCC::CondCodes CC;
+ bool IsInstrVSel = true;
+ switch (Instr.getOpcode()) {
+ default:
+ IsInstrVSel = false;
+ CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+ break;
+ case ARM::VSELEQD:
+ case ARM::VSELEQS:
+ CC = ARMCC::EQ;
+ break;
+ case ARM::VSELGTD:
+ case ARM::VSELGTS:
+ CC = ARMCC::GT;
+ break;
+ case ARM::VSELGED:
+ case ARM::VSELGES:
+ CC = ARMCC::GE;
+ break;
+ case ARM::VSELVSS:
+ case ARM::VSELVSD:
+ CC = ARMCC::VS;
+ break;
+ }
+
if (Sub) {
ARMCC::CondCodes NewCC = getSwappedCondition(CC);
if (NewCC == ARMCC::AL)
@@ -2222,11 +2408,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// If it is safe to remove CmpInstr, the condition code of these
// operands will be modified.
if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg)
- OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)),
- NewCC));
- }
- else
+ Sub->getOperand(2).getReg() == SrcReg) {
+ // VSel doesn't support condition code update.
+ if (IsInstrVSel)
+ return false;
+ OperandsToUpdate.push_back(
+ std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+ }
+ } else
switch (CC) {
default:
// CPSR can be used multiple times, we should continue.
@@ -3582,6 +3771,24 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
+ if (MI->isCopyLike() || MI->isInsertSubreg() ||
+ MI->isRegSequence() || MI->isImplicitDef())
+ return 0;
+
+ if (MI->isBundle())
+ return 0;
+
+ const MCInstrDesc &MCID = MI->getDesc();
+
+ if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+ // When predicated, CPSR is an additional source operand for CPSR updating
+ // instructions, this apparently increases their latencies.
+ return 1;
+ }
+ return 0;
+}
+
unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
unsigned *PredCost) const {
@@ -4114,7 +4321,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
// FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
// the full D-register by loading the same value to both lanes. The
// instruction is micro-coded with 2 uops, so don't do this until we can
- // properly schedule micro-coded instuctions. The dispatcher stalls cause
+ // properly schedule micro-coded instructions. The dispatcher stalls cause
// too big regressions.
// Insert the dependency-breaking FCONSTD before MI.
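
tryFoldSPUpdateIntoPushPop above absorbs a stack adjustment into an adjacent push/pop by growing its register list, and its comments carry the whole algorithm. The size precondition and register count reduce to a couple of lines; a standalone sketch of just that arithmetic (illustrative, not part of the patch):

    // How many extra list entries a push/pop needs to absorb an SP adjustment
    // of NumBytes, or -1 if the adjustment cannot be folded at all. VFP
    // push/pop move D-registers (8 bytes each); core-register lists move
    // 4 bytes per entry.
    static int extraRegsForSPAdjust(unsigned NumBytes, bool IsVFPPushPop) {
      unsigned Slot = IsVFPPushPop ? 8 : 4;
      if (NumBytes % Slot != 0)
        return -1;
      return static_cast<int>(NumBytes / Slot);
    }

    // e.g. folding "sub sp, sp, #8" into a GPR push needs two spare registers:
    //   extraRegsForSPAdjust(8, /*IsVFPPushPop=*/false) == 2
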
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 96f8637..93e5964 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -264,6 +264,8 @@ private:
const MCInstrDesc &UseMCID,
unsigned UseIdx, unsigned UseAlign) const;
+ unsigned getPredicationCost(const MachineInstr *MI) const;
+
unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
unsigned *PredCost = 0) const;
@@ -360,6 +362,17 @@ bool isIndirectBranchOpcode(int Opc) {
return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
}
+static inline bool isPopOpcode(int Opc) {
+ return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
+ Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
+ Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
+}
+
+static inline bool isPushOpcode(int Opc) {
+ return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
+ Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
+}
+
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
@@ -399,6 +412,13 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
const ARMBaseRegisterInfo& MRI,
unsigned MIFlags = 0);
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes. This can save a few bytes per function in code-size, but
+/// obviously generates more memory traffic. As such, it only takes
+/// effect in functions being optimised for size.
+bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI,
+ unsigned NumBytes);
/// rewriteARMFrameIndex / rewriteT2FrameIndex -
/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 58c06e3..8717dc0 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -51,20 +51,34 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
const uint16_t*
ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- bool ghcCall = false;
-
- if (MF) {
- const Function *F = MF->getFunction();
- ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
- }
-
- if (ghcCall)
+ const uint16_t *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+ ? CSR_iOS_SaveList
+ : CSR_AAPCS_SaveList;
+
+ if (!MF) return RegList;
+
+ const Function *F = MF->getFunction();
+ if (F->getCallingConv() == CallingConv::GHC) {
// GHC set of callee saved regs is empty as all those regs are
// used for passing STG regs around
return CSR_NoRegs_SaveList;
- else
- return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
- ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+ } else if (F->hasFnAttribute("interrupt")) {
+ if (STI.isMClass()) {
+ // M-class CPUs have hardware which saves the registers needed to allow a
+ // function conforming to the AAPCS to function as a handler.
+ return CSR_AAPCS_SaveList;
+ } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
+      // Fast interrupt mode gives the handler a private copy of R8-R14, so
+      // fewer registers need to be saved to restore user-mode state.
+ return CSR_FIQ_SaveList;
+ } else {
+ // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+ // exception handling.
+ return CSR_GenericInt_SaveList;
+ }
+ }
+
+ return RegList;
}
const uint32_t*
@@ -371,14 +385,6 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return ARM::SP;
}
-unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
- llvm_unreachable("What is the exception register");
-}
-
-unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
- llvm_unreachable("What is the exception handler register");
-}
-
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
void ARMBaseRegisterInfo::
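
getCalleeSavedRegs above now chooses between several save lists depending on the calling convention and the new "interrupt" attribute. A compact restatement of that decision (a sketch; the enumerators stand in for the CSR_* lists defined in ARMCallingConv.td):

    enum SaveList { SL_NoRegs, SL_AAPCS, SL_iOS, SL_FIQ, SL_GenericInt };

    static SaveList pickSaveList(bool IsGHC, bool HasInterruptAttr,
                                 bool IsMClass, bool IsFIQ,
                                 bool IsIOSNonAAPCS) {
      if (IsGHC)
        return SL_NoRegs;                    // all registers carry STG state
      if (HasInterruptAttr) {
        if (IsMClass) return SL_AAPCS;       // hardware saves the rest
        if (IsFIQ)    return SL_FIQ;         // banked R8-R14 in FIQ mode
        return SL_GenericInt;                // only SP/LR are banked
      }
      return IsIOSNonAAPCS ? SL_iOS : SL_AAPCS;
    }
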
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index cdaad05..e28fff6 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -72,6 +72,14 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
}
}
+static inline bool isCalleeSavedRegister(unsigned Reg,
+ const MCPhysReg *CSRegs) {
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ if (Reg == CSRegs[i])
+ return true;
+ return false;
+}
+
class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
protected:
const ARMSubtarget &STI;
@@ -149,10 +157,6 @@ public:
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getBaseRegister() const { return BasePtr; }
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
-
bool isLowRegister(unsigned Reg) const;
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
index f614dca..b16d4ef 100644
--- a/lib/Target/ARM/ARMBuildAttrs.h
+++ b/lib/Target/ARM/ARMBuildAttrs.h
@@ -15,11 +15,13 @@
#ifndef __TARGET_ARMBUILDATTRS_H__
#define __TARGET_ARMBUILDATTRS_H__
+namespace llvm {
namespace ARMBuildAttrs {
+
enum SpecialAttr {
// This is for the .cpu asm attr. It translates into one or more
// AttrType (below) entries in the .ARM.attributes section in the ELF.
- SEL_CPU
+ SEL_CPU
};
enum AttrType {
@@ -57,7 +59,7 @@ namespace ARMBuildAttrs {
ABI_FP_optimization_goals = 31,
compatibility = 32,
CPU_unaligned_access = 34,
- VFP_HP_extension = 36,
+ FP_HP_extension = 36,
ABI_FP_16bit_format = 38,
MPextension_use = 42, // was 70, 2.08 ABI
DIV_use = 44,
@@ -93,7 +95,7 @@ namespace ARMBuildAttrs {
v8 = 14 // v8, AArch32
};
- enum CPUArchProfile { // (=7), uleb128
+ enum CPUArchProfile { // (=7), uleb128
Not_Applicable = 0, // pre v7, or cross-profile code
ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8)
RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4)
@@ -102,34 +104,67 @@ namespace ARMBuildAttrs {
};
// The following have a lot of common use cases
- enum {
- //ARMISAUse (=8), uleb128 and THUMBISAUse (=9), uleb128
+ enum {
Not_Allowed = 0,
Allowed = 1,
- AllowedNeonV8 = 3,
- // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
+ // Tag_ARM_ISA_use (=8), uleb128
+
+ // Tag_THUMB_ISA_use, (=9), uleb128
+ AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
+
+ // Tag_FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
AllowFPv2 = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA)
- AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31
- AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA)
+ AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31
+ AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA)
AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
- AllowV8FPA = 7, // Use of the ARM v8-A FP ISA was permitted
- AllowV8FPB = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
+ AllowFPARMv8A = 7, // Use of the ARM v8-A FP ISA was permitted
+ AllowFPARMv8B = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
// Tag_WMMX_arch, (=11), uleb128
- AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
-
- // Tag_WMMX_arch, (=11), uleb128
- AllowWMMXv1 = 2, // The user permitted this entity to use WMMX v2
+ AllowWMMXv1 = 1, // The user permitted this entity to use WMMX v1
+ AllowWMMXv2 = 2, // The user permitted this entity to use WMMX v2
+
+ // Tag_Advanced_SIMD_arch, (=12), uleb128
+ AllowNeon = 1, // SIMDv1 was permitted
+ AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations)
+ AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted
- // Tag_ABI_FP_denormal, (=20), uleb128
+ // Tag_ABI_FP_denormal, (=20), uleb128
PreserveFPSign = 2, // sign when flushed-to-zero is preserved
// Tag_ABI_FP_number_model, (=23), uleb128
AllowRTABI = 2, // numbers, infinities, and one quiet NaN (see [RTABI])
- AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings
+ AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings
+
+ // Tag_ABI_HardFP_use, (=27), uleb128
+ HardFPImplied = 0, // FP use should be implied by Tag_FP_arch
+ HardFPSinglePrecision = 1, // Single-precision only
+
+ // Tag_ABI_VFP_args, (=28), uleb128
+ BaseAAPCS = 0,
+ HardFPAAPCS = 1,
+
+ // Tag_FP_HP_extension, (=36), uleb128
+ AllowHPFP = 1, // Allow use of Half Precision FP
+
+ // Tag_MPextension_use, (=42), uleb128
+ AllowMP = 1, // Allow use of MP extensions
+
+ // Tag_DIV_use, (=44), uleb128
+ AllowDIVIfExists = 0, // Allow hardware divide if available in arch, or no info exists.
+ DisallowDIV = 1, // Hardware divide explicitly disallowed
+ AllowDIVExt = 2, // Allow hardware divide as optional architecture extension above
+ // the base arch specified by Tag_CPU_arch and Tag_CPU_arch_profile.
+
+ // Tag_Virtualization_use, (=68), uleb128
+ AllowTZ = 1,
+ AllowVirtualization = 2,
+ AllowTZVirtualization = 3
};
-}
+
+} // namespace ARMBuildAttrs
+} // namespace llvm
#endif // __TARGET_ARMBUILDATTRS_H__
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 89c5223..9bea4b2 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -207,4 +207,24 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
- (sub CSR_AAPCS_ThisReturn, R9))>;
+ (sub CSR_AAPCS_ThisReturn, R9))>;
+
+// The "interrupt" attribute is used to generate code that is acceptable in
+// exception-handlers of various kinds. It makes us use a different return
+// instruction (handled elsewhere) and affects which registers we must return to
+// our "caller" in the same state as we receive them.
+
+// For most interrupts, all registers except SP and LR are shared with
+// user-space. We mark LR to be saved anyway, since this is what the ARM backend
+// generally does rather than tracking its liveness as a normal register.
+def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>;
+
+// The fast interrupt handlers have more private state and get their own copies
+// of R8-R12, in addition to SP and LR. As before, mark LR for saving too.
+
+// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and
+// current frame lowering expects to encounter it while processing callee-saved
+// registers.
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
+
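For context on the CSR lists above, a minimal source-level sketch of how they would come into play, assuming Clang's ARM "interrupt" function attribute (the handler name below is hypothetical and not part of this patch):

/* Marking a handler this way is what would make the backend pick
   CSR_GenericInt (or CSR_FIQ for "FIQ") rather than the usual AAPCS
   callee-saved list, and return with an exception-return sequence. */
__attribute__((interrupt("IRQ"))) void irq_handler(void) {
  /* R0-R12 must appear untouched to the interrupted code on return */
}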
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 4e703ec..7d41c69 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -163,21 +163,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
- const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
- for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
- if (Constants[i].isMachineConstantPoolEntry() &&
- (Constants[i].getAlignment() & AlignMask) == 0) {
- ARMConstantPoolValue *CPV =
- (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
- ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV);
- if (!APC) continue;
- if (APC->CVal == CVal && equals(APC))
- return i;
- }
- }
-
- return -1;
+ return getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
}
bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
@@ -216,22 +202,7 @@ ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
- const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
- for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
- if (Constants[i].isMachineConstantPoolEntry() &&
- (Constants[i].getAlignment() & AlignMask) == 0) {
- ARMConstantPoolValue *CPV =
- (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
- ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV);
- if (!APS) continue;
-
- if (APS->S == S && equals(APS))
- return i;
- }
- }
-
- return -1;
+ return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
}
bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
@@ -271,22 +242,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
- const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
- for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
- if (Constants[i].isMachineConstantPoolEntry() &&
- (Constants[i].getAlignment() & AlignMask) == 0) {
- ARMConstantPoolValue *CPV =
- (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
- ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV);
- if (!APMBB) continue;
-
- if (APMBB->MBB == MBB && equals(APMBB))
- return i;
- }
- }
-
- return -1;
+ return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
}
bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 93812fe..7ae7bf4 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -15,6 +15,7 @@
#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstddef>
@@ -64,6 +65,26 @@ protected:
ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
bool AddCurrentAddress);
+
+ template <typename Derived>
+ int getExistingMachineCPValueImpl(MachineConstantPool *CP,
+ unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
+ const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ if (Constants[i].isMachineConstantPoolEntry() &&
+ (Constants[i].getAlignment() & AlignMask) == 0) {
+ ARMConstantPoolValue *CPV =
+ (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+ if (Derived *APC = dyn_cast<Derived>(CPV))
+ if (cast<Derived>(this)->equals(APC))
+ return i;
+ }
+ }
+
+ return -1;
+ }
+
public:
virtual ~ARMConstantPoolValue();
@@ -156,6 +177,10 @@ public:
static bool classof(const ARMConstantPoolValue *APV) {
return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
}
+
+ bool equals(const ARMConstantPoolConstant *A) const {
+ return CVal == A->CVal && ARMConstantPoolValue::equals(A);
+ }
};
/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
@@ -187,6 +212,10 @@ public:
static bool classof(const ARMConstantPoolValue *ACPV) {
return ACPV->isExtSymbol();
}
+
+ bool equals(const ARMConstantPoolSymbol *A) const {
+ return S == A->S && ARMConstantPoolValue::equals(A);
+ }
};
/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
@@ -219,6 +248,10 @@ public:
static bool classof(const ARMConstantPoolValue *ACPV) {
return ACPV->isMachineBasicBlock();
}
+
+ bool equals(const ARMConstantPoolMBB *A) const {
+ return MBB == A->MBB && ARMConstantPoolValue::equals(A);
+ }
};
} // End llvm namespace
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index beb843c..e6f7f86 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -692,10 +692,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
MI.getOperand(1).getReg())
- .addReg(MI.getOperand(2).getReg(),
- getKillRegState(MI.getOperand(2).isKill()))
+ .addOperand(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm()) // 'pred'
- .addReg(MI.getOperand(4).getReg());
+ .addOperand(MI.getOperand(4));
MI.eraseFromParent();
return true;
@@ -705,10 +704,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addReg(MI.getOperand(2).getReg(),
- getKillRegState(MI.getOperand(2).isKill()))
+ .addOperand(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm()) // 'pred'
- .addReg(MI.getOperand(4).getReg())
+ .addOperand(MI.getOperand(4))
.addReg(0); // 's' bit
MI.eraseFromParent();
@@ -717,39 +715,36 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVCCsi: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
(MI.getOperand(1).getReg()))
- .addReg(MI.getOperand(2).getReg(),
- getKillRegState(MI.getOperand(2).isKill()))
+ .addOperand(MI.getOperand(2))
.addImm(MI.getOperand(3).getImm())
.addImm(MI.getOperand(4).getImm()) // 'pred'
- .addReg(MI.getOperand(5).getReg())
+ .addOperand(MI.getOperand(5))
.addReg(0); // 's' bit
MI.eraseFromParent();
return true;
}
-
case ARM::MOVCCsr: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
(MI.getOperand(1).getReg()))
- .addReg(MI.getOperand(2).getReg(),
- getKillRegState(MI.getOperand(2).isKill()))
- .addReg(MI.getOperand(3).getReg(),
- getKillRegState(MI.getOperand(3).isKill()))
+ .addOperand(MI.getOperand(2))
+ .addOperand(MI.getOperand(3))
.addImm(MI.getOperand(4).getImm())
.addImm(MI.getOperand(5).getImm()) // 'pred'
- .addReg(MI.getOperand(6).getReg())
+ .addOperand(MI.getOperand(6))
.addReg(0); // 's' bit
MI.eraseFromParent();
return true;
}
+ case ARM::t2MOVCCi16:
case ARM::MOVCCi16: {
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16),
+ unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
.addImm(MI.getOperand(2).getImm())
.addImm(MI.getOperand(3).getImm()) // 'pred'
- .addReg(MI.getOperand(4).getReg());
-
+ .addOperand(MI.getOperand(4));
MI.eraseFromParent();
return true;
}
@@ -760,23 +755,47 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.getOperand(1).getReg())
.addImm(MI.getOperand(2).getImm())
.addImm(MI.getOperand(3).getImm()) // 'pred'
- .addReg(MI.getOperand(4).getReg())
+ .addOperand(MI.getOperand(4))
.addReg(0); // 's' bit
MI.eraseFromParent();
return true;
}
+ case ARM::t2MVNCCi:
case ARM::MVNCCi: {
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi),
+ unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
.addImm(MI.getOperand(2).getImm())
.addImm(MI.getOperand(3).getImm()) // 'pred'
- .addReg(MI.getOperand(4).getReg())
+ .addOperand(MI.getOperand(4))
.addReg(0); // 's' bit
MI.eraseFromParent();
return true;
}
+ case ARM::t2MOVCClsl:
+ case ARM::t2MOVCClsr:
+ case ARM::t2MOVCCasr:
+ case ARM::t2MOVCCror: {
+ unsigned NewOpc;
+ switch (Opcode) {
+ case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+ case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+ case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+ case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+ default: llvm_unreachable("unexpeced conditional move");
+ }
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+ MI.getOperand(1).getReg())
+ .addOperand(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .addOperand(MI.getOperand(5))
+ .addReg(0); // 's' bit
+ MI.eraseFromParent();
+ return true;
+ }
case ARM::Int_eh_sjlj_dispatchsetup: {
MachineFunction &MF = *MI.getParent()->getParent();
const ARMBaseInstrInfo *AII =
@@ -823,7 +842,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVsrl_flag:
case ARM::MOVsra_flag: {
- // These are just fancy MOVs insructions.
+ // These are just fancy MOVs instructions.
AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
MI.getOperand(0).getReg())
.addOperand(MI.getOperand(1))
@@ -938,6 +957,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
ExpandMOV32BitImm(MBB, MBBI);
return true;
+ case ARM::SUBS_PC_LR: {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
+ .addReg(ARM::LR)
+ .addOperand(MI.getOperand(0))
+ .addOperand(MI.getOperand(1))
+ .addOperand(MI.getOperand(2))
+ .addReg(ARM::CPSR, RegState::Undef);
+ TransferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
case ARM::VLDMQIA: {
unsigned NewOpc = ARM::VLDMDIA;
MachineInstrBuilder MIB =
diff --git a/lib/Target/ARM/ARMFPUName.def b/lib/Target/ARM/ARMFPUName.def
new file mode 100644
index 0000000..9a1bbe7
--- /dev/null
+++ b/lib/Target/ARM/ARMFPUName.def
@@ -0,0 +1,32 @@
+//===-- ARMFPUName.def - List of the ARM FPU names --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM FPU names.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_FPU_NAME
+#error "You must define ARM_FPU_NAME(NAME, ID) before including ARMFPUName.h"
+#endif
+
+ARM_FPU_NAME("vfp", VFP)
+ARM_FPU_NAME("vfpv2", VFPV2)
+ARM_FPU_NAME("vfpv3", VFPV3)
+ARM_FPU_NAME("vfpv3-d16", VFPV3_D16)
+ARM_FPU_NAME("vfpv4", VFPV4)
+ARM_FPU_NAME("vfpv4-d16", VFPV4_D16)
+ARM_FPU_NAME("fp-armv8", FP_ARMV8)
+ARM_FPU_NAME("neon", NEON)
+ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4)
+ARM_FPU_NAME("neon-fp-armv8", NEON_FP_ARMV8)
+ARM_FPU_NAME("crypto-neon-fp-armv8", CRYPTO_NEON_FP_ARMV8)
+
+#undef ARM_FPU_NAME
diff --git a/lib/Target/ARM/ARMFPUName.h b/lib/Target/ARM/ARMFPUName.h
new file mode 100644
index 0000000..2a64cce
--- /dev/null
+++ b/lib/Target/ARM/ARMFPUName.h
@@ -0,0 +1,26 @@
+//===-- ARMFPUName.h - List of the ARM FPU names ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMFPUNAME_H
+#define ARMFPUNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum FPUKind {
+ INVALID_FPU = 0
+
+#define ARM_FPU_NAME(NAME, ID) , ID
+#include "ARMFPUName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif // ARMFPUNAME_H
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index ed054aa..a4004f3 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -176,6 +176,8 @@ class ARMFastISel : public FastISel {
// Utility routines.
private:
+ unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
+ unsigned OpNum);
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -252,10 +254,10 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
const MCInstrDesc &MCID = MI->getDesc();
- // If we're a thumb2 or not NEON function we were handled via isPredicable.
+ // If we're a thumb2 or not NEON function we'll be handled via isPredicable.
if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
AFI->isThumb2Function())
- return false;
+ return MI->isPredicable();
for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
if (MCID.OpInfo[i].isPredicate())
@@ -276,7 +278,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
// Do we use a predicate? or...
// Are we NEON in ARM mode and have a predicate operand? If so, I know
// we're not predicable but add it anyways.
- if (TII.isPredicable(MI) || isARMNEONPred(MI))
+ if (isARMNEONPred(MI))
AddDefaultPred(MIB);
// Do we optionally set a predicate? Preds is size > 0 iff the predicate
@@ -291,6 +293,23 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
return MIB;
}
+unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II,
+ unsigned Op, unsigned OpNum) {
+ if (TargetRegisterInfo::isVirtualRegister(Op)) {
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
+ if (!MRI.constrainRegClass(Op, RegClass)) {
+ // If it's not legal to COPY between the register classes, something
+ // has gone very wrong before we got here.
+ unsigned NewOp = createResultReg(RegClass);
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::COPY), NewOp).addReg(Op));
+ return NewOp;
+ }
+ }
+ return Op;
+}
+
unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode,
const TargetRegisterClass* RC) {
unsigned ResultReg = createResultReg(RC);
@@ -306,6 +325,9 @@ unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
+ // Make sure the input operand is sufficiently constrained to be legal
+ // for this instruction.
+ Op0 = constrainOperandRegClass(II, Op0, 1);
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
.addReg(Op0, Op0IsKill * RegState::Kill));
@@ -326,6 +348,11 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
+ // Make sure the input operands are sufficiently constrained to be legal
+ // for this instruction.
+ Op0 = constrainOperandRegClass(II, Op0, 1);
+ Op1 = constrainOperandRegClass(II, Op1, 2);
+
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
.addReg(Op0, Op0IsKill * RegState::Kill)
@@ -349,6 +376,12 @@ unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
+ // Make sure the input operands are sufficiently constrained to be legal
+ // for this instruction.
+ Op0 = constrainOperandRegClass(II, Op0, 1);
+ Op1 = constrainOperandRegClass(II, Op1, 2);
+ Op2 = constrainOperandRegClass(II, Op2, 3);
+
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
.addReg(Op0, Op0IsKill * RegState::Kill)
@@ -373,6 +406,9 @@ unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
+ // Make sure the input operand is sufficiently constrained to be legal
+ // for this instruction.
+ Op0 = constrainOperandRegClass(II, Op0, 1);
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
.addReg(Op0, Op0IsKill * RegState::Kill)
@@ -395,6 +431,9 @@ unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
+ // Make sure the input operand is sufficiently constrained to be legal
+ // for this instruction.
+ Op0 = constrainOperandRegClass(II, Op0, 1);
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
.addReg(Op0, Op0IsKill * RegState::Kill)
@@ -418,6 +457,10 @@ unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
unsigned ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
+ // Make sure the input operands are sufficiently constrained to be legal
+ // for this instruction.
+ Op0 = constrainOperandRegClass(II, Op0, 1);
+ Op1 = constrainOperandRegClass(II, Op1, 2);
if (II.getNumDefs() >= 1) {
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
.addReg(Op0, Op0IsKill * RegState::Kill)
@@ -610,6 +653,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
.addConstantPoolIndex(Idx));
else
// The extra immediate is for addrmode2.
+ DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(ARM::LDRcp), DestReg)
.addConstantPoolIndex(Idx)
@@ -685,6 +729,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
AddOptionalDefs(MIB);
} else {
// The extra immediate is for addrmode2.
+ DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
DestReg)
.addConstantPoolIndex(Idx)
@@ -855,13 +900,8 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
TmpOffset += CI->getSExtValue() * S;
break;
}
- if (isa<AddOperator>(Op) &&
- (!isa<Instruction>(Op) ||
- FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
- == FuncInfo.MBB) &&
- isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
- // An add (in the same block) with a constant operand. Fold the
- // constant.
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
ConstantInt *CI =
cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
TmpOffset += CI->getSExtValue() * S;
@@ -1139,6 +1179,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
(const TargetRegisterClass*)&ARM::tGPRRegClass :
(const TargetRegisterClass*)&ARM::GPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+ SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(Opc), Res)
.addReg(SrcReg).addImm(1));
@@ -1210,6 +1251,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
ARMSimplifyAddress(Addr, VT, useAM3);
// Create the base instruction, then add the operands.
+ SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(StrOpc))
.addReg(SrcReg);
@@ -1333,6 +1375,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
(isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
unsigned OpReg = getRegForValue(TI->getOperand(0));
+ OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(TstOpc))
.addReg(OpReg).addImm(1));
@@ -1370,6 +1413,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
// and it left a value for us in a virtual register. Ergo, we test
// the one-bit value left in the virtual register.
unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+ CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc))
.addReg(CmpReg).addImm(1));
@@ -1494,13 +1538,15 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
}
}
+ const MCInstrDesc &II = TII.get(CmpOpc);
+ SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
if (!UseImm) {
- AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
- TII.get(CmpOpc))
+ SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
.addReg(SrcReg1).addReg(SrcReg2));
} else {
MachineInstrBuilder MIB;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
.addReg(SrcReg1);
// Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
@@ -1699,6 +1745,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
}
unsigned CmpOpc = isThumb2 ? ARM::t2CMPri : ARM::CMPri;
+ CondReg = constrainOperandRegClass(TII.get(CmpOpc), CondReg, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
.addReg(CondReg).addImm(0));
@@ -1715,12 +1762,16 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
}
unsigned ResultReg = createResultReg(RC);
- if (!UseImm)
+ if (!UseImm) {
+ Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
+ Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
.addReg(Op2Reg).addReg(Op1Reg).addImm(ARMCC::NE).addReg(ARM::CPSR);
- else
+ } else {
+ Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
.addReg(Op1Reg).addImm(Imm).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+ }
UpdateValueMap(I, ResultReg);
return true;
}
@@ -1806,6 +1857,8 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
if (SrcReg2 == 0) return false;
unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+ SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
+ SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(Opc), ResultReg)
.addReg(SrcReg1).addReg(SrcReg2));
@@ -1933,7 +1986,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
!VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
return false;
} else {
- switch (static_cast<EVT>(ArgVT).getSimpleVT().SimpleTy) {
+ switch (ArgVT.SimpleTy) {
default:
return false;
case MVT::i1:
@@ -2410,15 +2463,22 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(CallOpc));
+ unsigned char OpFlags = 0;
+
+ // Add MO_PLT for global address or external symbol in the PIC relocation
+ // model.
+ if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_)
+ OpFlags = ARMII::MO_PLT;
+
// ARM calls don't take a predicate, but tBL / tBLX do.
if(isThumb2)
AddDefaultPred(MIB);
if (UseReg)
MIB.addReg(CalleeReg);
else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
+ MIB.addGlobalAddress(GV, 0, OpFlags);
else
- MIB.addExternalSymbol(IntrMemName, 0);
+ MIB.addExternalSymbol(IntrMemName, OpFlags);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -2731,6 +2791,7 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg);
if (setsCPSR)
MIB.addReg(ARM::CPSR, RegState::Define);
+ SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
if (hasS)
AddDefaultCC(MIB);
@@ -2965,12 +3026,14 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
// Load value.
if (isThumb2) {
+ DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
TII.get(ARM::t2LDRpci), DestReg1)
.addConstantPoolIndex(Idx));
Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
} else {
// The extra immediate is for addrmode2.
+ DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(ARM::LDRcp), DestReg1)
.addConstantPoolIndex(Idx).addImm(0));
@@ -2984,6 +3047,9 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
}
unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
+ DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0);
+ DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1);
+ GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(Opc), DestReg2)
.addReg(DestReg1)
@@ -3049,7 +3115,7 @@ bool ARMFastISel::FastLowerArguments() {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
- const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32);
+ const TargetRegisterClass *RC = &ARM::rGPRRegClass;
Idx = 0;
for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
I != E; ++I, ++Idx) {
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
new file mode 100644
index 0000000..dafc4b3
--- /dev/null
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -0,0 +1,93 @@
+//===-- ARMFeatures.h - Checks for ARM instruction features ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the code shared between ARM CodeGen and ARM MC
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_FEATURES_H
+#define TARGET_ARM_FEATURES_H
+
+#include "ARM.h"
+
+namespace llvm {
+
+template<typename InstrType> // could be MachineInstr or MCInst
+inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) {
+ switch (Instr->getOpcode()) {
+ default:
+ return false;
+ case ARM::tADC:
+ case ARM::tADDi3:
+ case ARM::tADDi8:
+ case ARM::tADDrSPi:
+ case ARM::tADDrr:
+ case ARM::tAND:
+ case ARM::tASRri:
+ case ARM::tASRrr:
+ case ARM::tBIC:
+ case ARM::tCMNz:
+ case ARM::tCMPi8:
+ case ARM::tCMPr:
+ case ARM::tEOR:
+ case ARM::tLDRBi:
+ case ARM::tLDRBr:
+ case ARM::tLDRHi:
+ case ARM::tLDRHr:
+ case ARM::tLDRSB:
+ case ARM::tLDRSH:
+ case ARM::tLDRi:
+ case ARM::tLDRr:
+ case ARM::tLDRspi:
+ case ARM::tLSLri:
+ case ARM::tLSLrr:
+ case ARM::tLSRri:
+ case ARM::tLSRrr:
+ case ARM::tMOVi8:
+ case ARM::tMUL:
+ case ARM::tMVN:
+ case ARM::tORR:
+ case ARM::tROR:
+ case ARM::tRSB:
+ case ARM::tSBC:
+ case ARM::tSTRBi:
+ case ARM::tSTRBr:
+ case ARM::tSTRHi:
+ case ARM::tSTRHr:
+ case ARM::tSTRi:
+ case ARM::tSTRr:
+ case ARM::tSTRspi:
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ case ARM::tSUBrr:
+ case ARM::tTST:
+ return true;
+// there are some "conditionally deprecated" opcodes
+ case ARM::tADDspr:
+ return Instr->getOperand(2).getReg() != ARM::PC;
+ // ADD PC, SP and BLX PC were always unpredictable,
+ // now on top of it they're deprecated
+ case ARM::tADDrSP:
+ case ARM::tBX:
+ return Instr->getOperand(0).getReg() != ARM::PC;
+ case ARM::tBLXr:
+ return Instr->getOperand(BLXOperandIndex).getReg() != ARM::PC;
+ case ARM::tADDhirr:
+ return Instr->getOperand(0).getReg() != ARM::PC &&
+ Instr->getOperand(2).getReg() != ARM::PC;
+ case ARM::tCMPhir:
+ case ARM::tMOVr:
+ return Instr->getOperand(0).getReg() != ARM::PC &&
+ Instr->getOperand(1).getReg() != ARM::PC;
+ }
+}
+
+}
+
+#endif
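A minimal sketch of how isV8EligibleForIT might be queried (the surrounding variables are assumed, not from this patch); both MachineInstr and MCInst callers can use it thanks to the template parameter:

// Hypothetical call site: skip predicating MI inside an IT block when
// targeting ARMv8, where the wider IT forms are deprecated.
if (Subtarget->hasV8Ops() && !isV8EligibleForIT(&MI))
  return false; // would form a deprecated IT block on v8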
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index c8637be..d32bdbc 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -82,22 +82,11 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
}
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
static bool isCSRestore(MachineInstr *MI,
const ARMBaseInstrInfo &TII,
const uint16_t *CSRegs) {
// Integer spill area is handled with "pop".
- if (MI->getOpcode() == ARM::LDMIA_RET ||
- MI->getOpcode() == ARM::t2LDMIA_RET ||
- MI->getOpcode() == ARM::LDMIA_UPD ||
- MI->getOpcode() == ARM::t2LDMIA_UPD ||
- MI->getOpcode() == ARM::VLDMDIA_UPD) {
+ if (isPopOpcode(MI->getOpcode())) {
// The first two operands are predicates. The last two are
// imp-def and imp-use of SP. Check everything in between.
for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
@@ -115,20 +104,31 @@ static bool isCSRestore(MachineInstr *MI,
return false;
}
-static void
-emitSPUpdate(bool isARM,
- MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- DebugLoc dl, const ARMBaseInstrInfo &TII,
- int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
- ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ const ARMBaseInstrInfo &TII, unsigned DestReg,
+ unsigned SrcReg, int NumBytes,
+ unsigned MIFlags = MachineInstr::NoFlags,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) {
if (isARM)
- emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+ emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
Pred, PredReg, TII, MIFlags);
else
- emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+ emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
Pred, PredReg, TII, MIFlags);
}
+static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+ const ARMBaseInstrInfo &TII, int NumBytes,
+ unsigned MIFlags = MachineInstr::NoFlags,
+ ARMCC::CondCodes Pred = ARMCC::AL,
+ unsigned PredReg = 0) {
+ emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes,
+ MIFlags, Pred, PredReg);
+}
+
void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock &MBB = MF.front();
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -175,6 +175,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned Reg = CSI[i].getReg();
int FI = CSI[i].getFrameIdx();
switch (Reg) {
+ case ARM::R0:
+ case ARM::R1:
+ case ARM::R2:
+ case ARM::R3:
case ARM::R4:
case ARM::R5:
case ARM::R6:
@@ -182,73 +186,61 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
case ARM::LR:
if (Reg == FramePtr)
FramePtrSpillFI = FI;
- AFI->addGPRCalleeSavedArea1Frame(FI);
GPRCS1Size += 4;
break;
case ARM::R8:
case ARM::R9:
case ARM::R10:
case ARM::R11:
+ case ARM::R12:
if (Reg == FramePtr)
FramePtrSpillFI = FI;
- if (STI.isTargetIOS()) {
- AFI->addGPRCalleeSavedArea2Frame(FI);
+ if (STI.isTargetIOS())
GPRCS2Size += 4;
- } else {
- AFI->addGPRCalleeSavedArea1Frame(FI);
+ else
GPRCS1Size += 4;
- }
break;
default:
// This is a DPR. Exclude the aligned DPRCS2 spills.
if (Reg == ARM::D8)
D8SpillFI = FI;
- if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) {
- AFI->addDPRCalleeSavedAreaFrame(FI);
+ if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
DPRCSSize += 8;
- }
}
}
// Move past area 1.
- if (GPRCS1Size > 0) MBBI++;
-
- // Set FP to point to the stack slot that contains the previous FP.
- // For iOS, FP is R7, which has now been stored in spill area 1.
- // Otherwise, if this is not iOS, all the callee-saved registers go
- // into spill area 1, including the FP in R11. In either case, it is
- // now safe to emit this assignment.
- bool HasFP = hasFP(MF);
- if (HasFP) {
- unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
- .addFrameIndex(FramePtrSpillFI).addImm(0)
- .setMIFlag(MachineInstr::FrameSetup);
- AddDefaultCC(AddDefaultPred(MIB));
- }
-
- // Move past area 2.
- if (GPRCS2Size > 0) MBBI++;
+ MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush;
+ if (GPRCS1Size > 0)
+ FramePtrPush = LastPush = MBBI++;
// Determine starting offsets of spill areas.
+ bool HasFP = hasFP(MF);
unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
- if (HasFP)
+ int FramePtrOffsetInPush = 0;
+ if (HasFP) {
+ FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
NumBytes);
+ }
AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+ // Move past area 2.
+ if (GPRCS2Size > 0) {
+ LastPush = MBBI++;
+ }
+
// Move past area 3.
if (DPRCSSize > 0) {
- MBBI++;
+ LastPush = MBBI++;
// Since vpush register list cannot have gaps, there may be multiple vpush
// instructions in the prologue.
while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
- MBBI++;
+ LastPush = MBBI++;
}
// Move past the aligned DPRCS2 area.
@@ -264,8 +256,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
if (NumBytes) {
// Adjust SP after all the callee-save spills.
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
- MachineInstr::FrameSetup);
+ if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes)) {
+ if (LastPush == FramePtrPush)
+ FramePtrOffsetInPush += NumBytes;
+ } else
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+ MachineInstr::FrameSetup);
+
if (HasFP && isARM)
// Restore from fp only in ARM mode: e.g. sub sp, r7, #24
// Note it's not safe to do this in Thumb2 mode because it would have
@@ -278,6 +275,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
AFI->setShouldRestoreSPFromFP(true);
}
+ // Set FP to point to the stack slot that contains the previous FP.
+ // For iOS, FP is R7, which has now been stored in spill area 1.
+ // Otherwise, if this is not iOS, all the callee-saved registers go
+ // into spill area 1, including the FP in R11. In either case, it
+ // is in area one and the adjustment needs to take place just after
+ // that push.
+ if (HasFP)
+ emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII,
+ FramePtr, ARM::SP, FramePtrOffsetInPush,
+ MachineInstr::FrameSetup);
+
+
if (STI.isTargetELF() && hasFP(MF))
MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
AFI->getFramePtrSpillOffset());
@@ -373,11 +382,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
- const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+ const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
if (MBBI != MBB.begin()) {
- do
+ do {
--MBBI;
- while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+ } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
if (!isCSRestore(MBBI, TII, CSRegs))
++MBBI;
}
@@ -421,8 +430,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
ARM::SP)
.addReg(FramePtr));
}
- } else if (NumBytes)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
// Increment past our save areas.
if (AFI->getDPRCalleeSavedAreaSize()) {
@@ -501,12 +510,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
FrameReg = ARM::SP;
Offset += SPAdj;
- if (AFI->isGPRCalleeSavedArea1Frame(FI))
- return Offset - AFI->getGPRCalleeSavedArea1Offset();
- else if (AFI->isGPRCalleeSavedArea2Frame(FI))
- return Offset - AFI->getGPRCalleeSavedArea2Offset();
- else if (AFI->isDPRCalleeSavedAreaFrame(FI))
- return Offset - AFI->getDPRCalleeSavedAreaOffset();
// SP can move around if there are allocas. We may also lose track of SP
// when emergency spilling inside a non-reserved call frame setup.
@@ -658,6 +661,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
unsigned RetOpcode = MI->getOpcode();
bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
RetOpcode == ARM::TCRETURNri);
+ bool isInterrupt =
+ RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
SmallVector<unsigned, 4> Regs;
unsigned i = CSI.size();
@@ -672,7 +677,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
continue;
- if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) {
+ if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
+ STI.hasV5TOps()) {
Reg = ARM::PC;
LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
// Fold the return instruction into the LDM.
@@ -1199,7 +1205,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Don't spill FP if the frame can be eliminated. This is determined
// by scanning the callee-save registers to see if any is used.
- const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+ const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
bool Spilled = false;
@@ -1226,6 +1232,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
case ARM::LR:
LRSpilled = true;
// Fallthrough
+ case ARM::R0: case ARM::R1:
+ case ARM::R2: case ARM::R3:
case ARM::R4: case ARM::R5:
case ARM::R6: case ARM::R7:
CS1Spilled = true;
@@ -1240,6 +1248,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
switch (Reg) {
+ case ARM::R0: case ARM::R1:
+ case ARM::R2: case ARM::R3:
case ARM::R4: case ARM::R5:
case ARM::R6: case ARM::R7:
case ARM::LR:
@@ -1295,8 +1305,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
if (!LRSpilled && CS1Spilled) {
MRI.setPhysRegUsed(ARM::LR);
NumGPRSpills++;
- UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
- UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+ SmallVectorImpl<unsigned>::iterator LRPos;
+ LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+ (unsigned)ARM::LR);
+ if (LRPos != UnspilledCS1GPRs.end())
+ UnspilledCS1GPRs.erase(LRPos);
+
ForceLRSpill = false;
ExtraCSSpill = true;
}
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 4ca3af6..87d1522 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -130,6 +130,13 @@ public:
return true;
}
+ bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) {
+ const ConstantSDNode *CN = cast<ConstantSDNode>(N);
+ Pred = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+ Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32);
+ return true;
+ }
+
bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
@@ -239,21 +246,6 @@ private:
/// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
- /// SelectCMOVOp - Select CMOV instructions for ARM.
- SDNode *SelectCMOVOp(SDNode *N);
- SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR,
- SDValue InFlag);
- SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR,
- SDValue InFlag);
- SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR,
- SDValue InFlag);
- SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR,
- SDValue InFlag);
-
// Select special operations if node forms integer ABS pattern
SDNode *SelectABSOp(SDNode *N);
@@ -261,7 +253,7 @@ private:
SDNode *SelectConcatVector(SDNode *N);
- SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+ SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, unsigned Op64);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
@@ -2321,204 +2313,6 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
return NULL;
}
-SDNode *ARMDAGToDAGISel::
-SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
- SDValue CPTmp0;
- SDValue CPTmp1;
- if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) {
- unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
- unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
- unsigned Opc = 0;
- switch (SOShOp) {
- case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
- case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
- case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
- case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
- default:
- llvm_unreachable("Unknown so_reg opcode!");
- }
- SDValue SOShImm =
- CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
- SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6);
- }
- return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
- SDValue CPTmp0;
- SDValue CPTmp1;
- SDValue CPTmp2;
- if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
- SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag };
- return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6);
- }
-
- if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
- SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
- return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7);
- }
- return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
- ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
- if (!T)
- return 0;
-
- unsigned Opc = 0;
- unsigned TrueImm = T->getZExtValue();
- if (is_t2_so_imm(TrueImm)) {
- Opc = ARM::t2MOVCCi;
- } else if (TrueImm <= 0xffff) {
- Opc = ARM::t2MOVCCi16;
- } else if (is_t2_so_imm_not(TrueImm)) {
- TrueImm = ~TrueImm;
- Opc = ARM::t2MVNCCi;
- } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) {
- // Large immediate.
- Opc = ARM::t2MOVCCi32imm;
- }
-
- if (Opc) {
- SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
- SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
- }
-
- return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
- ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
- ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
- if (!T)
- return 0;
-
- unsigned Opc = 0;
- unsigned TrueImm = T->getZExtValue();
- bool isSoImm = is_so_imm(TrueImm);
- if (isSoImm) {
- Opc = ARM::MOVCCi;
- } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) {
- Opc = ARM::MOVCCi16;
- } else if (is_so_imm_not(TrueImm)) {
- TrueImm = ~TrueImm;
- Opc = ARM::MVNCCi;
- } else if (TrueVal.getNode()->hasOneUse() &&
- (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) {
- // Large immediate.
- Opc = ARM::MOVCCi32imm;
- }
-
- if (Opc) {
- SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
- SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
- }
-
- return 0;
-}
-
-SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
- EVT VT = N->getValueType(0);
- SDValue FalseVal = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- SDValue CC = N->getOperand(2);
- SDValue CCR = N->getOperand(3);
- SDValue InFlag = N->getOperand(4);
- assert(CC.getOpcode() == ISD::Constant);
- assert(CCR.getOpcode() == ISD::Register);
- ARMCC::CondCodes CCVal =
- (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue();
-
- if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
- // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
- // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
- // Pattern complexity = 18 cost = 1 size = 0
- if (Subtarget->isThumb()) {
- SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal,
- CCVal, CCR, InFlag);
- if (!Res)
- Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal,
- ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
- if (Res)
- return Res;
- } else {
- SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal,
- CCVal, CCR, InFlag);
- if (!Res)
- Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal,
- ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
- if (Res)
- return Res;
- }
-
- // Pattern: (ARMcmov:i32 GPR:i32:$false,
- // (imm:i32)<<P:Pred_so_imm>>:$true,
- // (imm:i32):$cc)
- // Emits: (MOVCCi:i32 GPR:i32:$false,
- // (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
- // Pattern complexity = 10 cost = 1 size = 0
- if (Subtarget->isThumb()) {
- SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal,
- CCVal, CCR, InFlag);
- if (!Res)
- Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal,
- ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
- if (Res)
- return Res;
- } else {
- SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal,
- CCVal, CCR, InFlag);
- if (!Res)
- Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal,
- ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
- if (Res)
- return Res;
- }
- }
-
- // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
- // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
- // Pattern complexity = 6 cost = 1 size = 0
- //
- // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
- // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
- // Pattern complexity = 6 cost = 11 size = 0
- //
- // Also VMOVScc and VMOVDcc.
- SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag };
- unsigned Opc = 0;
- switch (VT.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Illegal conditional move type!");
- case MVT::i32:
- Opc = Subtarget->isThumb()
- ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
- : ARM::MOVCCr;
- break;
- case MVT::f32:
- Opc = ARM::VMOVScc;
- break;
- case MVT::f64:
- Opc = ARM::VMOVDcc;
- break;
- }
- return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
-}
-
/// Target-specific DAG combining for ISD::XOR.
/// Target-independent combining lowers SELECT_CC nodes of the form
/// select_cc setg[ge] X, 0, X, -X
@@ -2567,30 +2361,45 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
}
-SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+SDNode *ARMDAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
+ unsigned Op16, unsigned Op32,
+ unsigned Op64) {
+ // Mostly direct translation to the given operations, except that we preserve
+ // the AtomicOrdering for use later on.
+ AtomicSDNode *AN = cast<AtomicSDNode>(Node);
+ EVT VT = AN->getMemoryVT();
+
+ unsigned Op;
+ SDVTList VTs = CurDAG->getVTList(AN->getValueType(0), MVT::Other);
+ if (VT == MVT::i8)
+ Op = Op8;
+ else if (VT == MVT::i16)
+ Op = Op16;
+ else if (VT == MVT::i32)
+ Op = Op32;
+ else if (VT == MVT::i64) {
+ Op = Op64;
+ VTs = CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other);
+ } else
+ llvm_unreachable("Unexpected atomic operation");
+
SmallVector<SDValue, 6> Ops;
- Ops.push_back(Node->getOperand(1)); // Ptr
- Ops.push_back(Node->getOperand(2)); // Low part of Val1
- Ops.push_back(Node->getOperand(3)); // High part of Val1
- if (Opc == ARM::ATOMCMPXCHG6432) {
- Ops.push_back(Node->getOperand(4)); // Low part of Val2
- Ops.push_back(Node->getOperand(5)); // High part of Val2
- }
- Ops.push_back(Node->getOperand(0)); // Chain
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
- MVT::i32, MVT::i32, MVT::Other,
- Ops);
- cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
- return ResNode;
+ for (unsigned i = 1; i < AN->getNumOperands(); ++i)
+ Ops.push_back(AN->getOperand(i));
+
+ Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
+ Ops.push_back(AN->getOperand(0)); // Chain moves to the end
+
+ return CurDAG->SelectNodeTo(Node, Op, VTs, &Ops[0], Ops.size());
}
SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
- if (N->isMachineOpcode())
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
return NULL; // Already selected.
+ }
switch (N->getOpcode()) {
default: break;
@@ -2882,8 +2691,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
SDValue(Chain.getNode(), Chain.getResNo()));
return NULL;
}
- case ARMISD::CMOV:
- return SelectCMOVOp(N);
case ARMISD::VZIP: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
@@ -3457,31 +3264,90 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::CONCAT_VECTORS:
return SelectConcatVector(N);
- case ARMISD::ATOMOR64_DAG:
- return SelectAtomic64(N, ARM::ATOMOR6432);
- case ARMISD::ATOMXOR64_DAG:
- return SelectAtomic64(N, ARM::ATOMXOR6432);
- case ARMISD::ATOMADD64_DAG:
- return SelectAtomic64(N, ARM::ATOMADD6432);
- case ARMISD::ATOMSUB64_DAG:
- return SelectAtomic64(N, ARM::ATOMSUB6432);
- case ARMISD::ATOMNAND64_DAG:
- return SelectAtomic64(N, ARM::ATOMNAND6432);
- case ARMISD::ATOMAND64_DAG:
- return SelectAtomic64(N, ARM::ATOMAND6432);
- case ARMISD::ATOMSWAP64_DAG:
- return SelectAtomic64(N, ARM::ATOMSWAP6432);
- case ARMISD::ATOMCMPXCHG64_DAG:
- return SelectAtomic64(N, ARM::ATOMCMPXCHG6432);
-
- case ARMISD::ATOMMIN64_DAG:
- return SelectAtomic64(N, ARM::ATOMMIN6432);
- case ARMISD::ATOMUMIN64_DAG:
- return SelectAtomic64(N, ARM::ATOMUMIN6432);
- case ARMISD::ATOMMAX64_DAG:
- return SelectAtomic64(N, ARM::ATOMMAX6432);
- case ARMISD::ATOMUMAX64_DAG:
- return SelectAtomic64(N, ARM::ATOMUMAX6432);
+ case ISD::ATOMIC_LOAD:
+ if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
+ return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_LOAD_I64);
+ else
+ break;
+
+ case ISD::ATOMIC_STORE:
+ if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
+ return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_STORE_I64);
+ else
+ break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_ADD_I8,
+ ARM::ATOMIC_LOAD_ADD_I16,
+ ARM::ATOMIC_LOAD_ADD_I32,
+ ARM::ATOMIC_LOAD_ADD_I64);
+ case ISD::ATOMIC_LOAD_SUB:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_SUB_I8,
+ ARM::ATOMIC_LOAD_SUB_I16,
+ ARM::ATOMIC_LOAD_SUB_I32,
+ ARM::ATOMIC_LOAD_SUB_I64);
+ case ISD::ATOMIC_LOAD_AND:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_AND_I8,
+ ARM::ATOMIC_LOAD_AND_I16,
+ ARM::ATOMIC_LOAD_AND_I32,
+ ARM::ATOMIC_LOAD_AND_I64);
+ case ISD::ATOMIC_LOAD_OR:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_OR_I8,
+ ARM::ATOMIC_LOAD_OR_I16,
+ ARM::ATOMIC_LOAD_OR_I32,
+ ARM::ATOMIC_LOAD_OR_I64);
+ case ISD::ATOMIC_LOAD_XOR:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_XOR_I8,
+ ARM::ATOMIC_LOAD_XOR_I16,
+ ARM::ATOMIC_LOAD_XOR_I32,
+ ARM::ATOMIC_LOAD_XOR_I64);
+ case ISD::ATOMIC_LOAD_NAND:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_NAND_I8,
+ ARM::ATOMIC_LOAD_NAND_I16,
+ ARM::ATOMIC_LOAD_NAND_I32,
+ ARM::ATOMIC_LOAD_NAND_I64);
+ case ISD::ATOMIC_LOAD_MIN:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_MIN_I8,
+ ARM::ATOMIC_LOAD_MIN_I16,
+ ARM::ATOMIC_LOAD_MIN_I32,
+ ARM::ATOMIC_LOAD_MIN_I64);
+ case ISD::ATOMIC_LOAD_MAX:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_MAX_I8,
+ ARM::ATOMIC_LOAD_MAX_I16,
+ ARM::ATOMIC_LOAD_MAX_I32,
+ ARM::ATOMIC_LOAD_MAX_I64);
+ case ISD::ATOMIC_LOAD_UMIN:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_UMIN_I8,
+ ARM::ATOMIC_LOAD_UMIN_I16,
+ ARM::ATOMIC_LOAD_UMIN_I32,
+ ARM::ATOMIC_LOAD_UMIN_I64);
+ case ISD::ATOMIC_LOAD_UMAX:
+ return SelectAtomic(N,
+ ARM::ATOMIC_LOAD_UMAX_I8,
+ ARM::ATOMIC_LOAD_UMAX_I16,
+ ARM::ATOMIC_LOAD_UMAX_I32,
+ ARM::ATOMIC_LOAD_UMAX_I64);
+ case ISD::ATOMIC_SWAP:
+ return SelectAtomic(N,
+ ARM::ATOMIC_SWAP_I8,
+ ARM::ATOMIC_SWAP_I16,
+ ARM::ATOMIC_SWAP_I32,
+ ARM::ATOMIC_SWAP_I64);
+ case ISD::ATOMIC_CMP_SWAP:
+ return SelectAtomic(N,
+ ARM::ATOMIC_CMP_SWAP_I8,
+ ARM::ATOMIC_CMP_SWAP_I16,
+ ARM::ATOMIC_CMP_SWAP_I32,
+ ARM::ATOMIC_CMP_SWAP_I64);
}
return SelectCode(N);
@@ -3614,7 +3480,10 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
if(PairedReg.getNode()) {
OpChanged[OpChanged.size() -1 ] = true;
Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
- Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+ if (IsTiedToChangedOp)
+ Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+ else
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
// Replace the current flag.
AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
Flag, MVT::i32);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index caec11e..76a0a83 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -48,6 +48,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
+#include <utility>
using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
@@ -174,9 +175,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- if (Subtarget->isTargetDarwin()) {
+ if (Subtarget->isTargetIOS()) {
// Uses VFP for Thumb libfuncs if available.
- if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+ if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+ Subtarget->hasARMOps()) {
// Single-precision floating-point arithmetic.
setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
@@ -421,7 +423,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
- if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
+ if (Subtarget->getTargetTriple().isiOS() &&
!Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
@@ -452,6 +454,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
@@ -564,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
- // Custom expand long extensions to vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
-
// NEON does not have single instruction CTPOP for vectors with element
// types wider than 8-bits. However, custom lowering can leverage the
// v8i8/v16i8 vcnt instruction.
@@ -750,12 +743,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
- // FIXME: This should be checking for v6k, not just v6.
- if (Subtarget->hasDataBarrier() ||
- (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
- // membarrier needs custom lowering; the rest are legal and handled
- // normally.
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+ // ATOMIC_FENCE needs custom lowering; the other 32-bit ones are legal and
+ // handled normally.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
// Custom lowering for 64-bit ops
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
@@ -768,11 +759,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
- // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
- setInsertFencesForAtomic(true);
+ // On v8, we have particularly efficient implementations of atomic fences
+ // if they can be combined with nearby atomic loads and stores.
+ if (!Subtarget->hasV8Ops()) {
+ // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+ setInsertFencesForAtomic(true);
+ }
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
} else {
+ // If there's anything we can use as a barrier, go through custom lowering
+ // for ATOMIC_FENCE.
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
+ Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
// Set them all for expansion, which will force libcalls.
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
@@ -869,6 +869,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
}
}
+
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+      // For iOS, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
// We have target-specific dag combine patterns for the following nodes:
// ARMISD::VMOVRRD - No need to call setTargetDAGCombine
@@ -908,6 +920,44 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
+static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
+ bool isThumb2, unsigned &LdrOpc,
+ unsigned &StrOpc) {
+ static const unsigned LoadBares[4][2] = {{ARM::LDREXB, ARM::t2LDREXB},
+ {ARM::LDREXH, ARM::t2LDREXH},
+ {ARM::LDREX, ARM::t2LDREX},
+ {ARM::LDREXD, ARM::t2LDREXD}};
+ static const unsigned LoadAcqs[4][2] = {{ARM::LDAEXB, ARM::t2LDAEXB},
+ {ARM::LDAEXH, ARM::t2LDAEXH},
+ {ARM::LDAEX, ARM::t2LDAEX},
+ {ARM::LDAEXD, ARM::t2LDAEXD}};
+ static const unsigned StoreBares[4][2] = {{ARM::STREXB, ARM::t2STREXB},
+ {ARM::STREXH, ARM::t2STREXH},
+ {ARM::STREX, ARM::t2STREX},
+ {ARM::STREXD, ARM::t2STREXD}};
+ static const unsigned StoreRels[4][2] = {{ARM::STLEXB, ARM::t2STLEXB},
+ {ARM::STLEXH, ARM::t2STLEXH},
+ {ARM::STLEX, ARM::t2STLEX},
+ {ARM::STLEXD, ARM::t2STLEXD}};
+
+ const unsigned (*LoadOps)[2], (*StoreOps)[2];
+ if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ LoadOps = LoadAcqs;
+ else
+ LoadOps = LoadBares;
+
+ if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+ StoreOps = StoreRels;
+ else
+ StoreOps = StoreBares;
+
+ assert(isPowerOf2_32(Size) && Size <= 8 &&
+ "unsupported size for atomic binary op!");
+
+ LdrOpc = LoadOps[Log2_32(Size)][isThumb2];
+ StrOpc = StoreOps[Log2_32(Size)][isThumb2];
+}
+
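
getExclusiveOperation above encodes two independent choices: acquire-flavoured orderings (acquire, acq_rel, seq_cst) select the LDAEX-family loads, release-flavoured ones select the STLEX-family stores, and the byte size indexes the B/H/word/D variant. Below is a standalone sketch of that classification with strings standing in for the opcode enums; it is an illustration, not the LLVM helper itself.

    #include <cstdio>
    #include <string>

    enum Ordering { Monotonic, Acquire, Release, AcquireRelease, SeqCst };

    static void pickExclusiveOps(unsigned Size, Ordering Ord, std::string &Ld,
                                 std::string &St) {
      bool AcqLoad  = Ord == Acquire || Ord == AcquireRelease || Ord == SeqCst;
      bool RelStore = Ord == Release || Ord == AcquireRelease || Ord == SeqCst;
      // Size is 1, 2, 4 or 8 bytes; suffixes mirror LDREXB/LDREXH/LDREX/LDREXD.
      const char *Suffix[] = {"B", "H", "", "D"};
      unsigned Idx = Size == 1 ? 0 : Size == 2 ? 1 : Size == 4 ? 2 : 3;
      Ld = std::string(AcqLoad ? "LDAEX" : "LDREX") + Suffix[Idx];
      St = std::string(RelStore ? "STLEX" : "STREX") + Suffix[Idx];
    }

    int main() {
      std::string Ld, St;
      pickExclusiveOps(4, SeqCst, Ld, St);              // a 32-bit seq_cst atomic
      std::printf("%s / %s\n", Ld.c_str(), St.c_str()); // prints "LDAEX / STLEX"
      return 0;
    }
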
// FIXME: It might make sense to define the representative register class as the
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
@@ -970,6 +1020,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::BR_JT: return "ARMISD::BR_JT";
case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+ case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
case ARMISD::CMN: return "ARMISD::CMN";
@@ -1009,7 +1060,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC";
- case ARMISD::MEMBARRIER: return "ARMISD::MEMBARRIER";
case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
@@ -1068,6 +1118,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::FMAX: return "ARMISD::FMAX";
case ARMISD::FMIN: return "ARMISD::FMIN";
+  case ARMISD::VMAXNM: return "ARMISD::VMAXNM";
+  case ARMISD::VMINNM: return "ARMISD::VMINNM";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
case ARMISD::VBICIMM: return "ARMISD::VBICIMM";
@@ -1092,19 +1144,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
-
- case ARMISD::ATOMADD64_DAG: return "ATOMADD64_DAG";
- case ARMISD::ATOMSUB64_DAG: return "ATOMSUB64_DAG";
- case ARMISD::ATOMOR64_DAG: return "ATOMOR64_DAG";
- case ARMISD::ATOMXOR64_DAG: return "ATOMXOR64_DAG";
- case ARMISD::ATOMAND64_DAG: return "ATOMAND64_DAG";
- case ARMISD::ATOMNAND64_DAG: return "ATOMNAND64_DAG";
- case ARMISD::ATOMSWAP64_DAG: return "ATOMSWAP64_DAG";
- case ARMISD::ATOMCMPXCHG64_DAG: return "ATOMCMPXCHG64_DAG";
- case ARMISD::ATOMMIN64_DAG: return "ATOMMIN64_DAG";
- case ARMISD::ATOMUMIN64_DAG: return "ATOMUMIN64_DAG";
- case ARMISD::ATOMMAX64_DAG: return "ATOMMAX64_DAG";
- case ARMISD::ATOMUMAX64_DAG: return "ATOMUMAX64_DAG";
}
}
@@ -1536,7 +1575,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
MachinePointerInfo(),
- false, false, false, 0);
+ false, false, false,
+ DAG.InferPtrAlignment(AddArg));
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
}
@@ -1745,24 +1785,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const uint32_t *Mask;
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
- const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
- if (isThisReturn) {
- // For 'this' returns, use the R0-preserving mask if applicable
- Mask = ARI->getThisReturnPreservedMask(CallConv);
- if (!Mask) {
- // Set isThisReturn to false if the calling convention is not one that
- // allows 'returned' to be modeled in this way, so LowerCallResult does
- // not try to pass 'this' straight through
- isThisReturn = false;
+ if (!isTailCall) {
+ const uint32_t *Mask;
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+ if (isThisReturn) {
+ // For 'this' returns, use the R0-preserving mask if applicable
+ Mask = ARI->getThisReturnPreservedMask(CallConv);
+ if (!Mask) {
+ // Set isThisReturn to false if the calling convention is not one that
+ // allows 'returned' to be modeled in this way, so LowerCallResult does
+ // not try to pass 'this' straight through
+ isThisReturn = false;
+ Mask = ARI->getCallPreservedMask(CallConv);
+ }
+ } else
Mask = ARI->getCallPreservedMask(CallConv);
- }
- } else
- Mask = ARI->getCallPreservedMask(CallConv);
- assert(Mask && "Missing call preserved mask for calling convention");
- Ops.push_back(DAG.getRegisterMask(Mask));
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
if (InFlag.getNode())
Ops.push_back(InFlag);
@@ -1933,6 +1975,12 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
if (isVarArg && !Outs.empty())
return false;
+ // Exception-handling functions need a special set of instructions to indicate
+ // a return to the hardware. Tail-calling another function would probably
+ // break this.
+ if (CallerF->hasFnAttribute("interrupt"))
+ return false;
+
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
if (isCalleeStructRet || isCallerStructRet)
@@ -2061,6 +2109,39 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
isVarArg));
}
+static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+ SDLoc DL, SelectionDAG &DAG) {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+
+ StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
+
+ // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
+ // version of the "preferred return address". These offsets affect the return
+ // instruction if this is a return from PL1 without hypervisor extensions.
+ // IRQ/FIQ: +4 "subs pc, lr, #4"
+ // SWI: 0 "subs pc, lr, #0"
+ // ABORT: +4 "subs pc, lr, #4"
+ // UNDEF: +4/+2 "subs pc, lr, #0"
+  // UNDEF varies depending on whether the exception came from ARM or Thumb
+ // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
+
+ int64_t LROffset;
+ if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
+ IntKind == "ABORT")
+ LROffset = 4;
+ else if (IntKind == "SWI" || IntKind == "UNDEF")
+ LROffset = 0;
+ else
+ report_fatal_error("Unsupported interrupt attribute. If present, value "
+ "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
+
+ RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false));
+
+ return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other,
+ RetOps.data(), RetOps.size());
+}
+
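
LowerInterruptReturn bakes the ARM ARM return-offset table straight into the code: IRQ, FIQ, ABORT and an unannotated handler get "subs pc, lr, #4", while SWI and UNDEF get "#0" (with UNDEF deliberately approximated). A tiny standalone restatement of that mapping, using plain strings instead of the function-attribute plumbing:

    #include <cstdio>
    #include <string>

    // Map the "interrupt" attribute value to the immediate used in
    // "subs pc, lr, #N"; -1 stands for the unsupported-value case, where the
    // real lowering calls report_fatal_error instead.
    static int interruptLROffset(const std::string &Kind) {
      if (Kind.empty() || Kind == "IRQ" || Kind == "FIQ" || Kind == "ABORT")
        return 4;
      if (Kind == "SWI" || Kind == "UNDEF")
        return 0;
      return -1;
    }

    int main() {
      std::printf("IRQ   -> subs pc, lr, #%d\n", interruptLROffset("IRQ"));
      std::printf("SWI   -> subs pc, lr, #%d\n", interruptLROffset("SWI"));
      std::printf("UNDEF -> subs pc, lr, #%d\n", interruptLROffset("UNDEF"));
      return 0;
    }
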
SDValue
ARMTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -2146,6 +2227,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
+ // CPUs which aren't M-class use a special sequence to return from
+ // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
+ // though we use "subs pc, lr, #N").
+ //
+ // M-class CPUs actually use a normal return sequence with a special
+ // (hardware-provided) value in LR, so the normal code path works.
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
+ !Subtarget->isMClass()) {
+ if (Subtarget->isThumb1Only())
+ report_fatal_error("interrupt attribute is not supported in Thumb1");
+ return LowerInterruptReturn(RetOps, dl, DAG);
+ }
+
return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
RetOps.data(), RetOps.size());
}
@@ -2202,7 +2296,8 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
bool HasRet = false;
for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
UI != UE; ++UI) {
- if (UI->getOpcode() != ARMISD::RET_FLAG)
+ if (UI->getOpcode() != ARMISD::RET_FLAG &&
+ UI->getOpcode() != ARMISD::INTRET_FLAG)
return false;
HasRet = true;
}
@@ -2589,7 +2684,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
// here.
assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
- "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+ "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
DAG.getConstant(0, MVT::i32));
}
@@ -2597,14 +2692,18 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
unsigned Domain = ARM_MB::ISH;
- if (Subtarget->isSwift() && Ord == Release) {
+ if (Subtarget->isMClass()) {
+ // Only a full system barrier exists in the M-class architectures.
+ Domain = ARM_MB::SY;
+ } else if (Subtarget->isSwift() && Ord == Release) {
// Swift happens to implement ISHST barriers in a way that's compatible with
// Release semantics but weaker than ISH so we'd be fools not to use
// it. Beware: other processors probably don't!
Domain = ARM_MB::ISHST;
}
- return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+ return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(Intrinsic::arm_dmb, MVT::i32),
DAG.getConstant(Domain, MVT::i32));
}
@@ -3177,6 +3276,61 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SelectTrue, SelectFalse, ISD::SETNE);
}
+static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
+ if (CC == ISD::SETNE)
+ return ISD::SETEQ;
+ return ISD::getSetCCSwappedOperands(CC);
+}
+
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+ bool &swpCmpOps, bool &swpVselOps) {
+ // Start by selecting the GE condition code for opcodes that return true for
+ // 'equality'
+ if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
+ CC == ISD::SETULE)
+ CondCode = ARMCC::GE;
+
+ // and GT for opcodes that return false for 'equality'.
+ else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
+ CC == ISD::SETULT)
+ CondCode = ARMCC::GT;
+
+ // Since we are constrained to GE/GT, if the opcode contains 'less', we need
+ // to swap the compare operands.
+ if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
+ CC == ISD::SETULT)
+ swpCmpOps = true;
+
+ // Both GT and GE are ordered comparisons, and return false for 'unordered'.
+ // If we have an unordered opcode, we need to swap the operands to the VSEL
+ // instruction (effectively negating the condition).
+ //
+ // This also has the effect of swapping which one of 'less' or 'greater'
+ // returns true, so we also swap the compare operands. It also switches
+ // whether we return true for 'equality', so we compensate by picking the
+ // opposite condition code to our original choice.
+ if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
+ CC == ISD::SETUGT) {
+ swpCmpOps = !swpCmpOps;
+ swpVselOps = !swpVselOps;
+ CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
+ }
+
+ // 'ordered' is 'anything but unordered', so use the VS condition code and
+ // swap the VSEL operands.
+ if (CC == ISD::SETO) {
+ CondCode = ARMCC::VS;
+ swpVselOps = true;
+ }
+
+ // 'unordered or not equal' is 'anything but equal', so use the EQ condition
+ // code and swap the VSEL operands.
+ if (CC == ISD::SETUNE) {
+ CondCode = ARMCC::EQ;
+ swpVselOps = true;
+ }
+}
+
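
One way to sanity-check the table logic in checkVSELConstraints: for SETULE the code first picks GE and marks the compare operands for swapping, then the 'unordered' clause flips both swap flags and bumps GE to GT, leaving GT with only the VSEL operands swapped. The standalone check below (plain doubles, no SelectionDAG) verifies that this rewrite preserves the select semantics, NaNs included; it illustrates the reasoning and is not the LLVM code.

    #include <cassert>
    #include <cmath>

    // "x ule y ? t : f", written directly.
    static double viaULE(double X, double Y, double T, double F) {
      return (std::isnan(X) || std::isnan(Y) || X <= Y) ? T : F;
    }

    // The same select rewritten the way the VSEL path does it for SETULE:
    // an ordered greater-than compare (false on NaN) with the select operands
    // swapped and the compare operands left alone.
    static double viaGTSwapped(double X, double Y, double T, double F) {
      bool GT = !(std::isnan(X) || std::isnan(Y)) && X > Y;
      return GT ? F : T;
    }

    int main() {
      const double NaN = std::nan("");
      const double Cases[][2] = {{1, 2}, {2, 1}, {1, 1}, {NaN, 1}, {1, NaN}};
      for (const auto &C : Cases)
        assert(viaULE(C[0], C[1], 10, 20) == viaGTSwapped(C[0], C[1], 10, 20));
      return 0;
    }
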
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
@@ -3187,15 +3341,66 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (LHS.getValueType() == MVT::i32) {
+ // Try to generate VSEL on ARMv8.
+ // The VSEL instruction can't use all the usual ARM condition
+ // codes: it only has two bits to select the condition code, so it's
+ // constrained to use only GE, GT, VS and EQ.
+ //
+ // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+ // swap the operands of the previous compare instruction (effectively
+ // inverting the compare condition, swapping 'less' and 'greater') and
+ // sometimes need to swap the operands to the VSEL (which inverts the
+ // condition in the sense of firing whenever the previous condition didn't)
+ if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
+ ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+ if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+ CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+ CC = getInverseCCForVSEL(CC);
+ std::swap(TrueVal, FalseVal);
+ }
+ }
+
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
- return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
+ return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+ Cmp);
}
ARMCC::CondCodes CondCode, CondCode2;
FPCCToARMCC(CC, CondCode, CondCode2);
+ // Try to generate VSEL on ARMv8.
+ if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
+ // We can select VMAXNM/VMINNM from a compare followed by a select with the
+ // same operands, as follows:
+ // c = fcmp [ogt, olt, ugt, ult] a, b
+ // select c, a, b
+ // We only do this in unsafe-fp-math, because signed zeros and NaNs are
+ // handled differently than the original code sequence.
+ if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal &&
+ RHS == FalseVal) {
+ if (CC == ISD::SETOGT || CC == ISD::SETUGT)
+ return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
+ if (CC == ISD::SETOLT || CC == ISD::SETULT)
+ return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+ }
+
+ bool swpCmpOps = false;
+ bool swpVselOps = false;
+ checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+ if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+ CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+ if (swpCmpOps)
+ std::swap(LHS, RHS);
+ if (swpVselOps)
+ std::swap(TrueVal, FalseVal);
+ }
+ }
+
SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
@@ -3627,47 +3832,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
return FrameAddr;
}
-/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
-/// and size(DestVec) > 128-bits.
-/// This is achieved by doing the one extension from the SrcVec, splitting the
-/// result, extending these parts, and then concatenating these into the
-/// destination.
-static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
- SDValue Op = N->getOperand(0);
- EVT SrcVT = Op.getValueType();
- EVT DestVT = N->getValueType(0);
-
- assert(DestVT.getSizeInBits() > 128 &&
- "Custom sext/zext expansion needs >128-bit vector.");
- // If this is a normal length extension, use the default expansion.
- if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
- SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
- return SDValue();
-
- SDLoc dl(N);
- unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
- unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
- unsigned NumElts = SrcVT.getVectorNumElements();
- LLVMContext &Ctx = *DAG.getContext();
- SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
-
- EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
- NumElts);
- EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
- NumElts/2);
- EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
- NumElts/2);
-
- Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
- SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
- DAG.getIntPtrConstant(0));
- SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
- DAG.getIntPtrConstant(NumElts/2));
- ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
- ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
-}
-
/// ExpandBITCAST - If the target supports VFP, this function is called to
/// expand a bit convert where either the source or destination type is i64 to
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
@@ -4271,17 +4435,25 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
- if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
+ if (!ST->hasVFP3())
return SDValue();
+ bool IsDouble = Op.getValueType() == MVT::f64;
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
- assert(Op.getValueType() == MVT::f32 &&
- "ConstantFP custom lowering should only occur for f32.");
// Try splatting with a VMOV.f32...
APFloat FPVal = CFP->getValueAPF();
- int ImmVal = ARM_AM::getFP32Imm(FPVal);
+ int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
+
if (ImmVal != -1) {
+ if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
+ // We have code in place to select a valid ConstantFP already, no need to
+ // do any mangling.
+ return Op;
+ }
+
+ // It's a float and we are trying to use NEON operations where
+ // possible. Lower it to a splat followed by an extract.
SDLoc DL(Op);
SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
@@ -4290,15 +4462,31 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(0, MVT::i32));
}
- // If that fails, try a VMOV.i32
+  // The rest of our options are NEON only, so make sure that's allowed before
+  // proceeding.
+ if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
+ return SDValue();
+
EVT VMovVT;
- unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
- SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
- VMOVModImm);
+ uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
+
+ // It wouldn't really be worth bothering for doubles except for one very
+ // important value, which does happen to match: 0.0. So make sure we don't do
+ // anything stupid.
+ if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
+ return SDValue();
+
+ // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
+ SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+ false, VMOVModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
NewVal);
+ if (IsDouble)
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+ // It's a float: cast and extract a vector element.
SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
VecConstant);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
@@ -4306,11 +4494,16 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
}
// Finally, try a VMVN.i32
- NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
- VMVNModImm);
+ NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+ false, VMVNModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+
+ if (IsDouble)
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+ // It's a float: cast and extract a vector element.
SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
VecConstant);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
@@ -5769,6 +5962,70 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
+SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin());
+
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // where the return values are passed via sret.
+ SDLoc dl(Op);
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Pair of floats / doubles used to pass the result.
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+
+ // Create stack object for sret.
+ const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
+ const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy);
+ int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+ SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy());
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+
+ Entry.Node = SRet;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Entry.isSRet = true;
+ Args.push_back(Entry);
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ const char *LibcallName = (ArgVT == MVT::f64)
+ ? "__sincos_stret" : "__sincosf_stret";
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+ TargetLowering::
+ CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()),
+ false, false, false, false, 0,
+                       CallingConv::C, /*isTailCall=*/false,
+                       /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+ Callee, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+ SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
+ MachinePointerInfo(), false, false, false, 0);
+
+ // Address of cos field.
+ SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet,
+ DAG.getIntPtrConstant(ArgVT.getStoreSize()));
+ SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
+ MachinePointerInfo(), false, false, false, 0);
+
+ SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+ LoadSin.getValue(0), LoadCos.getValue(0));
+}
+
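
The sret layout LowerFSINCOS assumes is simply {sin, cos} in that order: the first load reads the sin value at offset 0 and the second reads cos at offset ArgVT.getStoreSize(). The sketch below illustrates that contract; __sincos_stret itself is the Darwin runtime entry point named above, so the local stand-in is illustrative only, not the real symbol.

    #include <cmath>
    #include <cstdio>

    // The two-field layout assumed for the sret result: sin at offset 0 (the
    // first load in LowerFSINCOS), cos at offset sizeof(double) (the second
    // load, ArgVT.getStoreSize() bytes past the sret pointer).
    struct SinCosResult { double Sin; double Cos; };

    // Local stand-in for the Darwin __sincos_stret entry point.
    static void sincosStretLike(SinCosResult *Ret, double X) {
      Ret->Sin = std::sin(X);
      Ret->Cos = std::cos(X);
    }

    int main() {
      SinCosResult R;
      sincosStretLike(&R, 0.5);
      std::printf("sin=%f cos=%f\n", R.Sin, R.Cos);
      return 0;
    }
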
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
// Monotonic load/store is legal for all targets
if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
@@ -5781,32 +6038,28 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
static void
ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
- SelectionDAG &DAG, unsigned NewOp) {
+ SelectionDAG &DAG) {
SDLoc dl(Node);
assert (Node->getValueType(0) == MVT::i64 &&
"Only know how to expand i64 atomics");
+ AtomicSDNode *AN = cast<AtomicSDNode>(Node);
SmallVector<SDValue, 6> Ops;
Ops.push_back(Node->getOperand(0)); // Chain
Ops.push_back(Node->getOperand(1)); // Ptr
- // Low part of Val1
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(2), DAG.getIntPtrConstant(0)));
- // High part of Val1
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(2), DAG.getIntPtrConstant(1)));
- if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
- // High part of Val1
+  for (unsigned i = 2; i < Node->getNumOperands(); ++i) {
+ // Low part
Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(3), DAG.getIntPtrConstant(0)));
- // High part of Val2
+ Node->getOperand(i), DAG.getIntPtrConstant(0)));
+ // High part
Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Node->getOperand(3), DAG.getIntPtrConstant(1)));
+ Node->getOperand(i), DAG.getIntPtrConstant(1)));
}
SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue Result =
- DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
- cast<MemSDNode>(Node)->getMemOperand());
+ DAG.getAtomic(Node->getOpcode(), dl, MVT::i64, Tys, Ops.data(), Ops.size(),
+ cast<MemSDNode>(Node)->getMemOperand(), AN->getOrdering(),
+ AN->getSynchScope());
SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
Results.push_back(Result.getValue(2));
@@ -5904,6 +6157,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
}
@@ -5921,10 +6175,6 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::BITCAST:
Res = ExpandBITCAST(N, DAG);
break;
- case ISD::SIGN_EXTEND:
- case ISD::ZERO_EXTEND:
- Res = ExpandVectorExtension(N, DAG);
- break;
case ISD::SRL:
case ISD::SRA:
Res = Expand64BitShift(N, DAG, Subtarget);
@@ -5932,41 +6182,21 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::READCYCLECOUNTER:
ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
return;
+ case ISD::ATOMIC_STORE:
+ case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_LOAD_ADD:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
- return;
case ISD::ATOMIC_LOAD_AND:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
- return;
case ISD::ATOMIC_LOAD_NAND:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
- return;
case ISD::ATOMIC_LOAD_OR:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
- return;
case ISD::ATOMIC_LOAD_SUB:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
- return;
case ISD::ATOMIC_LOAD_XOR:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
- return;
case ISD::ATOMIC_SWAP:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
- return;
case ISD::ATOMIC_CMP_SWAP:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
- return;
case ISD::ATOMIC_LOAD_MIN:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
- return;
case ISD::ATOMIC_LOAD_UMIN:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
- return;
case ISD::ATOMIC_LOAD_MAX:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
- return;
case ISD::ATOMIC_LOAD_UMAX:
- ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+ ReplaceATOMIC_OP_64(N, Results, DAG);
return;
}
if (Res.getNode())
@@ -5986,6 +6216,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
unsigned oldval = MI->getOperand(2).getReg();
unsigned newval = MI->getOperand(3).getReg();
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
DebugLoc dl = MI->getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
@@ -6001,21 +6232,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
}
unsigned ldrOpc, strOpc;
- switch (Size) {
- default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
- case 1:
- ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
- strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
- break;
- case 2:
- ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
- strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
- break;
- case 4:
- ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
- strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
- break;
- }
+ getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
MachineFunction *MF = BB->getParent();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -6095,6 +6312,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned dest = MI->getOperand(0).getReg();
unsigned ptr = MI->getOperand(1).getReg();
unsigned incr = MI->getOperand(2).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
DebugLoc dl = MI->getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
@@ -6102,24 +6320,11 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
if (isThumb2) {
MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
}
unsigned ldrOpc, strOpc;
- switch (Size) {
- default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
- case 1:
- ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
- strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
- break;
- case 2:
- ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
- strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
- break;
- case 4:
- ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
- strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
- break;
- }
+ getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -6203,6 +6408,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
unsigned ptr = MI->getOperand(1).getReg();
unsigned incr = MI->getOperand(2).getReg();
unsigned oldval = dest;
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
DebugLoc dl = MI->getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
@@ -6210,24 +6416,20 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
if (isThumb2) {
MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
}
unsigned ldrOpc, strOpc, extendOpc;
+ getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
switch (Size) {
- default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+ default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
case 1:
- ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
- strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
break;
case 2:
- ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
- strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
break;
case 4:
- ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
- strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
extendOpc = 0;
break;
}
@@ -6271,7 +6473,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
// Sign extend the value, if necessary.
if (signExtend && extendOpc) {
- oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ oldval = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass
+ : &ARM::GPRnopcRegClass);
+ if (!isThumb2)
+ MRI.constrainRegClass(dest, &ARM::GPRnopcRegClass);
AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
.addReg(dest)
.addImm(0));
@@ -6309,7 +6514,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
unsigned Op1, unsigned Op2,
bool NeedsCarry, bool IsCmpxchg,
bool IsMinMax, ARMCC::CondCodes CC) const {
- // This also handles ATOMIC_SWAP, indicated by Op1==0.
+ // This also handles ATOMIC_SWAP and ATOMIC_STORE, indicated by Op1==0.
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -6317,11 +6522,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
MachineFunction::iterator It = BB;
++It;
+ bool isStore = (MI->getOpcode() == ARM::ATOMIC_STORE_I64);
+ unsigned offset = (isStore ? -2 : 0);
unsigned destlo = MI->getOperand(0).getReg();
unsigned desthi = MI->getOperand(1).getReg();
- unsigned ptr = MI->getOperand(2).getReg();
- unsigned vallo = MI->getOperand(3).getReg();
- unsigned valhi = MI->getOperand(4).getReg();
+ unsigned ptr = MI->getOperand(offset+2).getReg();
+ unsigned vallo = MI->getOperand(offset+3).getReg();
+ unsigned valhi = MI->getOperand(offset+4).getReg();
+ unsigned OrdIdx = offset + (IsCmpxchg ? 7 : 5);
+  AtomicOrdering Ord =
+      static_cast<AtomicOrdering>(MI->getOperand(OrdIdx).getImm());
DebugLoc dl = MI->getDebugLoc();
bool isThumb2 = Subtarget->isThumb2();
@@ -6330,8 +6539,13 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(vallo, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(valhi, &ARM::rGPRRegClass);
}
+ unsigned ldrOpc, strOpc;
+ getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
+
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *contBB = 0, *cont2BB = 0;
if (IsCmpxchg || IsMinMax)
@@ -6371,21 +6585,23 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
// fallthrough --> exitMBB
BB = loopMBB;
- // Load
- if (isThumb2) {
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
- .addReg(destlo, RegState::Define)
- .addReg(desthi, RegState::Define)
- .addReg(ptr));
- } else {
- unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
- .addReg(GPRPair0, RegState::Define).addReg(ptr));
- // Copy r2/r3 into dest. (This copy will normally be coalesced.)
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
- .addReg(GPRPair0, 0, ARM::gsub_0);
- BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
- .addReg(GPRPair0, 0, ARM::gsub_1);
+ if (!isStore) {
+ // Load
+ if (isThumb2) {
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+ .addReg(destlo, RegState::Define)
+ .addReg(desthi, RegState::Define)
+ .addReg(ptr));
+ } else {
+ unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+ .addReg(GPRPair0, RegState::Define).addReg(ptr));
+ // Copy r2/r3 into dest. (This copy will normally be coalesced.)
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
+ .addReg(GPRPair0, 0, ARM::gsub_0);
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
+ .addReg(GPRPair0, 0, ARM::gsub_1);
+ }
}
unsigned StoreLo, StoreHi;
@@ -6437,7 +6653,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
// Store
if (isThumb2) {
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
+ MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass);
+ AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
.addReg(StoreLo).addReg(StoreHi).addReg(ptr));
} else {
// Marshal a pair...
@@ -6455,7 +6673,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
.addImm(ARM::gsub_1);
// ...and store it
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
+ AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
.addReg(StorePair).addReg(ptr));
}
// Cmp+jump
@@ -6476,6 +6694,51 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
return BB;
}
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicLoad64(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ unsigned destlo = MI->getOperand(0).getReg();
+ unsigned desthi = MI->getOperand(1).getReg();
+ unsigned ptr = MI->getOperand(2).getReg();
+ AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
+ DebugLoc dl = MI->getDebugLoc();
+ bool isThumb2 = Subtarget->isThumb2();
+
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ if (isThumb2) {
+ MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
+ MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+ }
+ unsigned ldrOpc, strOpc;
+ getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(ldrOpc));
+
+ if (isThumb2) {
+ MIB.addReg(destlo, RegState::Define)
+ .addReg(desthi, RegState::Define)
+ .addReg(ptr);
+
+ } else {
+ unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+ MIB.addReg(GPRPair0, RegState::Define).addReg(ptr);
+
+ // Copy GPRPair0 into dest. (This copy will normally be coalesced.)
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), destlo)
+ .addReg(GPRPair0, 0, ARM::gsub_0);
+ BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), desthi)
+ .addReg(GPRPair0, 0, ARM::gsub_1);
+ }
+ AddDefaultPred(MIB);
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
/// registers the function context.
void ARMTargetLowering::
@@ -7007,8 +7270,109 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
llvm_unreachable("Expecting a BB with two successors!");
}
-MachineBasicBlock *ARMTargetLowering::
-EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+/// Return the load opcode for a given load size. If load size >= 8,
+/// a NEON opcode will be returned.
+static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
+ if (LdSize >= 8)
+ return LdSize == 16 ? ARM::VLD1q32wb_fixed
+ : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
+ if (IsThumb1)
+ return LdSize == 4 ? ARM::tLDRi
+ : LdSize == 2 ? ARM::tLDRHi
+ : LdSize == 1 ? ARM::tLDRBi : 0;
+ if (IsThumb2)
+ return LdSize == 4 ? ARM::t2LDR_POST
+ : LdSize == 2 ? ARM::t2LDRH_POST
+ : LdSize == 1 ? ARM::t2LDRB_POST : 0;
+ return LdSize == 4 ? ARM::LDR_POST_IMM
+ : LdSize == 2 ? ARM::LDRH_POST
+ : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
+}
+
+/// Return the store opcode for a given store size. If store size >= 8,
+/// a NEON opcode will be returned.
+static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
+ if (StSize >= 8)
+ return StSize == 16 ? ARM::VST1q32wb_fixed
+ : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
+ if (IsThumb1)
+ return StSize == 4 ? ARM::tSTRi
+ : StSize == 2 ? ARM::tSTRHi
+ : StSize == 1 ? ARM::tSTRBi : 0;
+ if (IsThumb2)
+ return StSize == 4 ? ARM::t2STR_POST
+ : StSize == 2 ? ARM::t2STRH_POST
+ : StSize == 1 ? ARM::t2STRB_POST : 0;
+ return StSize == 4 ? ARM::STR_POST_IMM
+ : StSize == 2 ? ARM::STRH_POST
+ : StSize == 1 ? ARM::STRB_POST_IMM : 0;
+}
+
+/// Emit a post-increment load operation with the given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
+ const TargetInstrInfo *TII, DebugLoc dl,
+ unsigned LdSize, unsigned Data, unsigned AddrIn,
+ unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+ unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
+ assert(LdOpc != 0 && "Should have a load opcode");
+ if (LdSize >= 8) {
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+ .addImm(0));
+ } else if (IsThumb1) {
+ // load + update AddrIn
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrIn).addImm(0));
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(AddrIn).addImm(LdSize);
+ AddDefaultPred(MIB);
+ } else if (IsThumb2) {
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+ .addImm(LdSize));
+ } else { // arm
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+ .addReg(0).addImm(LdSize));
+ }
+}
+
+/// Emit a post-increment store operation with the given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
+ const TargetInstrInfo *TII, DebugLoc dl,
+ unsigned StSize, unsigned Data, unsigned AddrIn,
+ unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+ unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
+ assert(StOpc != 0 && "Should have a store opcode");
+ if (StSize >= 8) {
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(AddrIn).addImm(0).addReg(Data));
+ } else if (IsThumb1) {
+ // store + update AddrIn
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
+ .addReg(AddrIn).addImm(0));
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(AddrIn).addImm(StSize);
+ AddDefaultPred(MIB);
+ } else if (IsThumb2) {
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(Data).addReg(AddrIn).addImm(StSize));
+ } else { // arm
+ AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(Data).addReg(AddrIn).addReg(0)
+ .addImm(StSize));
+ }
+}
+
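
The emitPostLd/emitPostSt helpers above are combined by EmitStructByval (below) into a simple shape: the bulk of the copy advances UnitSize bytes per post-increment load/store pair, and the SizeVal % UnitSize leftover bytes are moved one at a time with the byte-sized variants. A plain C++ stand-in for that structure (no MachineInstr building):

    #include <cstdio>
    #include <cstring>

    // Bulk copy in UnitSize-byte chunks (one emitPostLd/emitPostSt pair per
    // chunk), then move the SizeVal % UnitSize leftover bytes one at a time,
    // matching the LDRB_POST/STRB_POST epilogue.
    static void byvalCopy(char *Dst, const char *Src, unsigned SizeVal,
                          unsigned UnitSize) {
      unsigned BytesLeft = SizeVal % UnitSize;
      unsigned LoopSize = SizeVal - BytesLeft;
      for (unsigned I = 0; I < LoopSize; I += UnitSize)
        std::memcpy(Dst + I, Src + I, UnitSize);
      for (unsigned I = LoopSize; I < SizeVal; ++I)
        Dst[I] = Src[I];
    }

    int main() {
      char Src[11] = "0123456789", Dst[11] = {0};
      byvalCopy(Dst, Src, 11, 4); // LoopSize = 8, BytesLeft = 3
      std::printf("%s\n", Dst);
      return 0;
    }
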
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
// This pseudo instruction has 3 operands: dst, src, size
// We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
// Otherwise, we will generate unrolled scalar copies.
@@ -7023,23 +7387,18 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
unsigned Align = MI->getOperand(3).getImm();
DebugLoc dl = MI->getDebugLoc();
- bool isThumb2 = Subtarget->isThumb2();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned ldrOpc, strOpc, UnitSize = 0;
+ unsigned UnitSize = 0;
+ const TargetRegisterClass *TRC = 0;
+ const TargetRegisterClass *VecTRC = 0;
- const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::tGPRRegClass :
- (const TargetRegisterClass*)&ARM::GPRRegClass;
- const TargetRegisterClass *TRC_Vec = 0;
+ bool IsThumb1 = Subtarget->isThumb1Only();
+ bool IsThumb2 = Subtarget->isThumb2();
if (Align & 1) {
- ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
- strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
UnitSize = 1;
} else if (Align & 2) {
- ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
- strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
UnitSize = 2;
} else {
// Check whether we can use NEON instructions.
@@ -7047,27 +7406,27 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
hasAttribute(AttributeSet::FunctionIndex,
Attribute::NoImplicitFloat) &&
Subtarget->hasNEON()) {
- if ((Align % 16 == 0) && SizeVal >= 16) {
- ldrOpc = ARM::VLD1q32wb_fixed;
- strOpc = ARM::VST1q32wb_fixed;
+ if ((Align % 16 == 0) && SizeVal >= 16)
UnitSize = 16;
- TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
- }
- else if ((Align % 8 == 0) && SizeVal >= 8) {
- ldrOpc = ARM::VLD1d32wb_fixed;
- strOpc = ARM::VST1d32wb_fixed;
+ else if ((Align % 8 == 0) && SizeVal >= 8)
UnitSize = 8;
- TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
- }
}
// Can't use NEON instructions.
- if (UnitSize == 0) {
- ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
- strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+ if (UnitSize == 0)
UnitSize = 4;
- }
}
+ // Select the correct opcode and register class for unit size load/store
+ bool IsNeon = UnitSize >= 8;
+ TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass
+ : (const TargetRegisterClass *)&ARM::GPRRegClass;
+ if (IsNeon)
+ VecTRC = UnitSize == 16
+ ? (const TargetRegisterClass *)&ARM::DPairRegClass
+ : UnitSize == 8
+ ? (const TargetRegisterClass *)&ARM::DPRRegClass
+ : 0;
+
unsigned BytesLeft = SizeVal % UnitSize;
unsigned LoopSize = SizeVal - BytesLeft;
@@ -7078,34 +7437,13 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
unsigned srcIn = src;
unsigned destIn = dest;
for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
- unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
unsigned srcOut = MRI.createVirtualRegister(TRC);
unsigned destOut = MRI.createVirtualRegister(TRC);
- if (UnitSize >= 8) {
- AddDefaultPred(BuildMI(*BB, MI, dl,
- TII->get(ldrOpc), scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
-
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
- .addReg(destIn).addImm(0).addReg(scratch));
- } else if (isThumb2) {
- AddDefaultPred(BuildMI(*BB, MI, dl,
- TII->get(ldrOpc), scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
-
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
- .addReg(scratch).addReg(destIn)
- .addImm(UnitSize));
- } else {
- AddDefaultPred(BuildMI(*BB, MI, dl,
- TII->get(ldrOpc), scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
- .addImm(UnitSize));
-
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
- .addReg(scratch).addReg(destIn)
- .addReg(0).addImm(UnitSize));
- }
+ unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
+ IsThumb1, IsThumb2);
srcIn = srcOut;
destIn = destOut;
}
@@ -7113,30 +7451,14 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
// Handle the leftover bytes with LDRB and STRB.
// [scratch, srcOut] = LDRB_POST(srcIn, 1)
// [destOut] = STRB_POST(scratch, destIn, 1)
- ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
- strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
for (unsigned i = 0; i < BytesLeft; i++) {
- unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned srcOut = MRI.createVirtualRegister(TRC);
unsigned destOut = MRI.createVirtualRegister(TRC);
- if (isThumb2) {
- AddDefaultPred(BuildMI(*BB, MI, dl,
- TII->get(ldrOpc),scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
- .addReg(scratch).addReg(destIn)
- .addReg(0).addImm(1));
- } else {
- AddDefaultPred(BuildMI(*BB, MI, dl,
- TII->get(ldrOpc),scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn)
- .addReg(0).addImm(1));
-
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
- .addReg(scratch).addReg(destIn)
- .addReg(0).addImm(1));
- }
+ unsigned scratch = MRI.createVirtualRegister(TRC);
+ emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
+ IsThumb1, IsThumb2);
srcIn = srcOut;
destIn = destOut;
}
@@ -7177,17 +7499,16 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
// Load an immediate to varEnd.
unsigned varEnd = MRI.createVirtualRegister(TRC);
- if (isThumb2) {
- unsigned VReg1 = varEnd;
+ if (IsThumb2) {
+ unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
- VReg1 = MRI.createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
- .addImm(LoopSize & 0xFFFF));
+ Vtmp = MRI.createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
+ .addImm(LoopSize & 0xFFFF));
if ((LoopSize & 0xFFFF0000) != 0)
AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
- .addReg(VReg1)
- .addImm(LoopSize >> 16));
+ .addReg(Vtmp).addImm(LoopSize >> 16));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -7199,10 +7520,12 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
- .addReg(varEnd, RegState::Define)
- .addConstantPoolIndex(Idx)
- .addImm(0));
+ if (IsThumb1)
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+ varEnd, RegState::Define).addConstantPoolIndex(Idx));
+ else
+ AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+ varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
}
BB->addSuccessor(loopMBB);
@@ -7231,39 +7554,30 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
// [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
// [destLoop] = STR_POST(scratch, destPhi, UnitSize)
- unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
- if (UnitSize >= 8) {
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
- .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
-
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
- .addReg(destPhi).addImm(0).addReg(scratch));
- } else if (isThumb2) {
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
- .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
-
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
- .addReg(scratch).addReg(destPhi)
- .addImm(UnitSize));
- } else {
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
- .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
- .addImm(UnitSize));
-
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
- .addReg(scratch).addReg(destPhi)
- .addReg(0).addImm(UnitSize));
- }
+ unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+ emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+ IsThumb1, IsThumb2);
// Decrement loop variable by UnitSize.
- MachineInstrBuilder MIB = BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
- AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
- MIB->getOperand(5).setReg(ARM::CPSR);
- MIB->getOperand(5).setIsDef(true);
-
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
- .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+ if (IsThumb1) {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+ MIB = AddDefaultT1CC(MIB);
+ MIB.addReg(varPhi).addImm(UnitSize);
+ AddDefaultPred(MIB);
+ } else {
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, BB->end(), dl,
+ TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+ AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+ MIB->getOperand(5).setReg(ARM::CPSR);
+ MIB->getOperand(5).setIsDef(true);
+ }
+ BuildMI(*BB, BB->end(), dl,
+ TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
// loopMBB can loop back to loopMBB or fall through to exitMBB.
BB->addSuccessor(loopMBB);
@@ -7272,34 +7586,19 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
// Add epilogue to handle BytesLeft.
BB = exitMBB;
MachineInstr *StartOfExit = exitMBB->begin();
- ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
- strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
// [scratch, srcOut] = LDRB_POST(srcLoop, 1)
// [destOut] = STRB_POST(scratch, destLoop, 1)
unsigned srcIn = srcLoop;
unsigned destIn = destLoop;
for (unsigned i = 0; i < BytesLeft; i++) {
- unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned srcOut = MRI.createVirtualRegister(TRC);
unsigned destOut = MRI.createVirtualRegister(TRC);
- if (isThumb2) {
- AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
- TII->get(ldrOpc),scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
- AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
- .addReg(scratch).addReg(destIn)
- .addImm(1));
- } else {
- AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
- TII->get(ldrOpc),scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
-
- AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
- .addReg(scratch).addReg(destIn)
- .addReg(0).addImm(1));
- }
+ unsigned scratch = MRI.createVirtualRegister(TRC);
+ emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+ IsThumb1, IsThumb2);
+ emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+ IsThumb1, IsThumb2);
srcIn = srcOut;
destIn = destOut;
}
@@ -7449,46 +7748,49 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+ case ARM::ATOMIC_LOAD_I64:
+ return EmitAtomicLoad64(MI, BB);
- case ARM::ATOMADD6432:
+ case ARM::ATOMIC_LOAD_ADD_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
/*NeedsCarry*/ true);
- case ARM::ATOMSUB6432:
+ case ARM::ATOMIC_LOAD_SUB_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ true);
- case ARM::ATOMOR6432:
+ case ARM::ATOMIC_LOAD_OR_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
- case ARM::ATOMXOR6432:
+ case ARM::ATOMIC_LOAD_XOR_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
isThumb2 ? ARM::t2EORrr : ARM::EORrr);
- case ARM::ATOMAND6432:
+ case ARM::ATOMIC_LOAD_AND_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
- case ARM::ATOMSWAP6432:
+ case ARM::ATOMIC_STORE_I64:
+ case ARM::ATOMIC_SWAP_I64:
return EmitAtomicBinary64(MI, BB, 0, 0, false);
- case ARM::ATOMCMPXCHG6432:
+ case ARM::ATOMIC_CMP_SWAP_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ false, /*IsCmpxchg*/true);
- case ARM::ATOMMIN6432:
+ case ARM::ATOMIC_LOAD_MIN_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ true, /*IsCmpxchg*/false,
/*IsMinMax*/ true, ARMCC::LT);
- case ARM::ATOMMAX6432:
+ case ARM::ATOMIC_LOAD_MAX_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ true, /*IsCmpxchg*/false,
/*IsMinMax*/ true, ARMCC::GE);
- case ARM::ATOMUMIN6432:
+ case ARM::ATOMIC_LOAD_UMIN_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ true, /*IsCmpxchg*/false,
/*IsMinMax*/ true, ARMCC::LO);
- case ARM::ATOMUMAX6432:
+ case ARM::ATOMIC_LOAD_UMAX_I64:
return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
/*NeedsCarry*/ true, /*IsCmpxchg*/false,
@@ -8197,6 +8499,13 @@ static SDValue PerformSUBCombine(SDNode *N,
/// is faster than
/// vadd d3, d0, d1
/// vmul d3, d3, d2
+// However, for (A + B) * (A + B),
+// vadd d2, d0, d1
+// vmul d3, d0, d2
+// vmla d3, d1, d2
+// is slower than
+// vadd d2, d0, d1
+// vmul d3, d2, d2
static SDValue PerformVMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -8216,6 +8525,9 @@ static SDValue PerformVMULCombine(SDNode *N,
std::swap(N0, N1);
}
+ if (N0 == N1)
+ return SDValue();
+
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue N00 = N0->getOperand(0);
@@ -10548,6 +10860,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
case 'r':
return RCPair(0U, &ARM::GPRRegClass);
case 'w':
+ if (VT == MVT::Other)
+ break;
if (VT == MVT::f32)
return RCPair(0U, &ARM::SPRRegClass);
if (VT.getSizeInBits() == 64)
@@ -10556,6 +10870,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return RCPair(0U, &ARM::QPRRegClass);
break;
case 'x':
+ if (VT == MVT::Other)
+ break;
if (VT == MVT::f32)
return RCPair(0U, &ARM::SPR_8RegClass);
if (VT.getSizeInBits() == 64)
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 44c769f..90facdd 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -52,6 +52,7 @@ namespace llvm {
BR_JT, // Jumptable branch.
BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
RET_FLAG, // Return with a flag operand.
+ INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand.
PIC_ADD, // Add with a PC operand and a PIC label.
@@ -94,7 +95,6 @@ namespace llvm {
DYN_ALLOC, // Dynamic allocation on the stack.
- MEMBARRIER, // Memory barrier (DMB)
MEMBARRIER_MCR, // Memory barrier (MCR)
PRELOAD, // Preload
@@ -186,6 +186,8 @@ namespace llvm {
// Floating-point max and min:
FMAX,
FMIN,
+ VMAXNM,
+ VMINNM,
// Bit-field insert
BFI,
@@ -222,21 +224,7 @@ namespace llvm {
VST4_UPD,
VST2LN_UPD,
VST3LN_UPD,
- VST4LN_UPD,
-
- // 64-bit atomic ops (value split into two registers)
- ATOMADD64_DAG,
- ATOMSUB64_DAG,
- ATOMOR64_DAG,
- ATOMXOR64_DAG,
- ATOMAND64_DAG,
- ATOMNAND64_DAG,
- ATOMSWAP64_DAG,
- ATOMCMPXCHG64_DAG,
- ATOMMIN64_DAG,
- ATOMUMIN64_DAG,
- ATOMMAX64_DAG,
- ATOMUMAX64_DAG
+ VST4LN_UPD
};
}
@@ -375,6 +363,12 @@ namespace llvm {
/// be used for loads / stores from the global.
virtual unsigned getMaximalGlobalOffset() const;
+ /// Returns true if a cast between SrcAS and DestAS is a noop.
+ virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
+ // Addrspacecasts are always noops.
+ return true;
+ }
+
/// createFastISel - This method returns a target specific FastISel object,
/// or null if the target does not support "fast" ISel.
virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -460,6 +454,7 @@ namespace llvm {
const ARMSubtarget *ST) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
@@ -573,6 +568,8 @@ namespace llvm {
unsigned Size,
bool signExtend,
ARMCC::CondCodes Cond) const;
+ MachineBasicBlock *EmitAtomicLoad64(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
void SetupEntryBlockForSjLj(MachineInstr *MI,
MachineBasicBlock *MBB,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 1349476..f93504f 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -155,6 +155,16 @@ def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
let DecoderMethod = "DecodePredicateOperand";
}
+// Selectable predicate operand for CMOV instructions. We can't use a normal
+// predicate because the default values interfere with instruction selection. In
+// all other respects it is identical though: pseudo-instruction expansion
+// relies on the MachineOperands being compatible.
+def cmovpred : Operand<i32>, PredicateOp,
+ ComplexPattern<i32, 2, "SelectCMOVPred"> {
+ let MIOperandInfo = (ops i32imm, i32imm);
+ let PrintMethod = "printPredicateOperand";
+}
+
// Conditional code result for instructions whose 's' bit is set, e.g. subs.
def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
@@ -237,6 +247,8 @@ class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>;
class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
: InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
@@ -490,8 +502,7 @@ class JTI<dag oops, dag iops, InstrItinClass itin,
: XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
asm, "", pattern>;
-// Atomic load/store instructions
-class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
opc, asm, "", pattern> {
@@ -502,23 +513,52 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
let Inst{20} = 1;
let Inst{19-16} = addr;
let Inst{15-12} = Rt;
- let Inst{11-0} = 0b111110011111;
+ let Inst{11-10} = 0b11;
+ let Inst{9-8} = opcod2;
+ let Inst{7-0} = 0b10011111;
}
-class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
opc, asm, "", pattern> {
- bits<4> Rd;
bits<4> Rt;
bits<4> addr;
let Inst{27-23} = 0b00011;
let Inst{22-21} = opcod;
let Inst{20} = 0;
let Inst{19-16} = addr;
- let Inst{15-12} = Rd;
- let Inst{11-4} = 0b11111001;
+ let Inst{11-10} = 0b11;
+ let Inst{9-8} = opcod2;
+ let Inst{7-4} = 0b1001;
let Inst{3-0} = Rt;
}
+// Atomic load/store instructions
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>;
+
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> {
+ bits<4> Rd;
+ let Inst{15-12} = Rd;
+}
+
+// Exclusive load/store instructions
+
+class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasV8]>;
+
+class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasV8]> {
+ bits<4> Rd;
+ let Inst{15-12} = Rd;
+}
+
class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
: AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
bits<4> Rt;
@@ -535,6 +575,18 @@ class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
let Unpredictable{11-8} = 0b1111;
let DecoderMethod = "DecodeSwap";
}
+// Acquire/Release load/store instructions
+class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasV8]>;
+
+class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+ Requires<[IsARM, HasV8]> {
+ let Inst{15-12} = 0b1111;
+}
// addrmode1 instructions
class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
@@ -1520,6 +1572,8 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
let Inst{8} = 1; // Double precision
let Inst{7-6} = opcod4;
let Inst{4} = opcod5;
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
// Double precision, unary, not-predicated
@@ -1572,6 +1626,8 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
let Inst{8} = 1; // Double precision
let Inst{6} = op6;
let Inst{4} = op4;
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
// FP, binary, not predicated
@@ -1601,6 +1657,8 @@ class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
let Inst{8} = 1; // double precision
let Inst{6} = opcod3;
let Inst{4} = 0;
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
// Single precision, unary, predicated
@@ -1965,7 +2023,7 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
}
// Same as N2V but not predicated.
-class N2Vnp<bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
string Dt, ValueType ResTy, ValueType OpTy, list<dag> pattern>
: NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
@@ -1982,7 +2040,7 @@ class N2Vnp<bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
// Encode constant bits
let Inst{27-23} = 0b00111;
let Inst{21-20} = 0b11;
- let Inst{19-18} = 0b10;
+ let Inst{19-18} = op19_18;
let Inst{17-16} = op17_16;
let Inst{11} = 0;
let Inst{10-8} = op10_8;
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 8cdb853..df867b4 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -119,17 +120,29 @@ namespace {
MachineBasicBlock &FirstMBB = MF.front();
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
- unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+ unsigned TempReg =
+ MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
ARM::t2LDRpci : ARM::LDRcp;
const TargetInstrInfo &TII = *TM->getInstrInfo();
MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
- TII.get(Opc), GlobalBaseReg)
+ TII.get(Opc), TempReg)
.addConstantPoolIndex(Idx);
if (Opc == ARM::LDRcp)
MIB.addImm(0);
AddDefaultPred(MIB);
+ // Fix the GOT address by adding pc.
+ unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+ Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ? ARM::tPICADD
+ : ARM::PICADD;
+ MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg)
+ .addReg(TempReg)
+ .addImm(ARMPCLabelIndex);
+ if (Opc == ARM::PICADD)
+ AddDefaultPred(MIB);
+
+
return true;
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c243402..2042c04 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -71,6 +71,9 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+
def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
[SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>,
@@ -118,7 +121,8 @@ def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
+def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
[SDNPInGlue]>;
@@ -162,8 +166,6 @@ def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
SDT_ARMEH_SJLJ_Longjmp,
[SDNPHasChain, SDNPSideEffect]>;
-def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
- [SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
[SDNPHasChain, SDNPSideEffect]>;
def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
@@ -174,9 +176,11 @@ def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
+def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
+def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
+
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
//
@@ -189,6 +193,9 @@ def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
AssemblerPredicate<"HasV6Ops", "armv6">;
def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
+ AssemblerPredicate<"HasV6MOps",
+ "armv6m or armv6t2">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
@@ -196,6 +203,8 @@ def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
AssemblerPredicate<"HasV7Ops", "armv7">;
def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
+ AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
def NoVFP : Predicate<"!Subtarget->hasVFP2()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">,
AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -203,16 +212,23 @@ def HasVFP3 : Predicate<"Subtarget->hasVFP3()">,
AssemblerPredicate<"FeatureVFP3", "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4()">,
AssemblerPredicate<"FeatureVFP4", "VFP4">;
-def HasV8FP : Predicate<"Subtarget->hasV8FP()">,
- AssemblerPredicate<"FeatureV8FP", "V8FP">;
+def HasDPVFP : Predicate<"!Subtarget->isFPOnlySP()">,
+ AssemblerPredicate<"!FeatureVFPOnlySP",
+ "double precision VFP">;
+def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
+ AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
+ AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC : Predicate<"Subtarget->hasCRC()">,
+ AssemblerPredicate<"FeatureCRC", "crc">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float">;
def HasDivide : Predicate<"Subtarget->hasDivide()">,
- AssemblerPredicate<"FeatureHWDiv", "divide">;
+ AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
- AssemblerPredicate<"FeatureHWDivARM">;
+ AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
AssemblerPredicate<"FeatureT2XtPk",
"pack/extract">;
@@ -237,10 +253,10 @@ def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
AssemblerPredicate<"ModeThumb,FeatureThumb2",
"thumb2">;
def IsMClass : Predicate<"Subtarget->isMClass()">,
- AssemblerPredicate<"FeatureMClass", "armv7m">;
-def IsARClass : Predicate<"!Subtarget->isMClass()">,
+ AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
AssemblerPredicate<"!FeatureMClass",
- "armv7a/r">;
+ "!armv*m">;
def IsARM : Predicate<"!Subtarget->isThumb()">,
AssemblerPredicate<"!ModeThumb", "arm-mode">;
def IsIOS : Predicate<"Subtarget->isTargetIOS()">;
@@ -587,17 +603,6 @@ def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
-/// imm0_4 predicate - Immediate in the range [0,4].
-def Imm0_4AsmOperand : ImmAsmOperand
-{
- let Name = "Imm0_4";
- let DiagnosticType = "ImmRange0_4";
-}
-def imm0_4 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 5; }]> {
- let ParserMatchClass = Imm0_4AsmOperand;
- let DecoderMethod = "DecodeImm0_4";
-}
-
/// imm0_7 predicate - Immediate in the range [0,7].
def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
@@ -677,6 +682,15 @@ def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm0_63AsmOperand;
}
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+ let Name = "Imm0_239";
+ let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+ let ParserMatchClass = Imm0_239AsmOperand;
+}
+
/// imm0_255 predicate - Immediate in the range [0,255].
def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
@@ -708,6 +722,11 @@ def imm0_65535_expr : Operand<i32> {
let ParserMatchClass = Imm0_65535ExprAsmOperand;
}
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+ let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
def imm24b : Operand<i32>, ImmLeaf<i32, [{
@@ -1665,53 +1684,11 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
[(ARMcallseq_start timm:$amt)]>;
}
-// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
-// (These pseudos use a hand-written selection code).
-let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in {
-def ATOMOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMXOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMADD6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMSUB6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
- GPR:$set1, GPR:$set2),
- NoItinerary, []>;
-def ATOMMIN6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMUMIN6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMMAX6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-def ATOMUMAX6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
- (ins GPR:$addr, GPR:$src1, GPR:$src2),
- NoItinerary, []>;
-}
-
-def HINT : AI<(outs), (ins imm0_4:$imm), MiscFrm, NoItinerary,
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
"hint", "\t$imm", []>, Requires<[IsARM, HasV6]> {
- bits<3> imm;
- let Inst{27-3} = 0b0011001000001111000000000;
- let Inst{2-0} = imm;
+ bits<8> imm;
+ let Inst{27-8} = 0b00110010000011110000;
+ let Inst{7-0} = imm;
}
def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>;
@@ -1719,6 +1696,9 @@ def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>;
def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>;
def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+
+def : Pat<(int_arm_sevl), (HINT 5)>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
"\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
@@ -1746,6 +1726,16 @@ def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
let Inst{7-4} = 0b0111;
}
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+ "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+ bits<16> val;
+ let Inst{3-0} = val{3-0};
+ let Inst{19-8} = val{15-4};
+ let Inst{27-20} = 0b00010000;
+ let Inst{31-28} = 0xe; // AL
+ let Inst{7-4} = 0b0111;
+}
+
// Change Processor State
// FIXME: We should use InstAlias to handle the optional operands.
class CPS<dag iops, string asm_ops>
@@ -1820,7 +1810,7 @@ defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>;
def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
- "setend\t$end", []>, Requires<[IsARM]> {
+ "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
bits<1> end;
let Inst{31-10} = 0b1111000100000001000000;
let Inst{9} = end;
@@ -1953,6 +1943,12 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
let Inst{27-0} = 0b0001101000001111000000001110;
}
+
+ // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
+ // the user-space one).
+ def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
+ 4, IIC_Br,
+ [(ARMintretflag imm:$offset)]>;
}
// Indirect branches
@@ -2283,6 +2279,13 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
[]>, Requires<[IsARM, HasV5TE]>;
}
+def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "lda", "\t$Rt, $addr", []>;
+def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldab", "\t$Rt, $addr", []>;
+def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldah", "\t$Rt, $addr", []>;
+
// Indexed loads
multiclass AI2_ldridx<bit isByte, string opc,
InstrItinClass iii, InstrItinClass iir> {
@@ -2825,6 +2828,12 @@ multiclass AI3strT<bits<4> op, string opc> {
defm STRHT : AI3strT<0b1011, "strht">;
+def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stl", "\t$Rt, $addr", []>;
+def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlb", "\t$Rt, $addr", []>;
+def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlh", "\t$Rt, $addr", []>;
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
@@ -4014,6 +4023,45 @@ def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
(PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
//===----------------------------------------------------------------------===//
+// CRC Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W} 0x04C11DB7
+// + CRC32C{B,H,W} 0x1EDC6F41
+//
+
+class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+ : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
+ !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
+ [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
+ Requires<[IsARM, HasV8, HasCRC]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{31-28} = 0b1110;
+ let Inst{27-23} = 0b00010;
+ let Inst{22-21} = sz;
+ let Inst{20} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-10} = 0b00;
+ let Inst{9} = C;
+ let Inst{8} = 0;
+ let Inst{7-4} = 0b0100;
+ let Inst{3-0} = Rm;
+
+ let Unpredictable{11-8} = 0b1101;
+}
+
+def CRC32B : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
+def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def CRC32H : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
+def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def CRC32W : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
+def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
// Comparison Instructions...
//
@@ -4139,56 +4187,65 @@ def BCCZi64 : PseudoInst<(outs),
// Conditional moves
-// FIXME: should be able to write a pattern for ARMcmov, but can't use
-// a two-value operand where a dag node expects two operands. :(
let neverHasSideEffects = 1 in {
let isCommutable = 1, isSelect = 1 in
-def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, GPR:$Rm, cmovpred:$p),
4, IIC_iCMOVr,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+ [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, so_reg_imm:$shift, pred:$p),
- 4, IIC_iCMOVsr,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
- imm:$cc, CCR:$ccr))*/]>,
+ (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
+ 4, IIC_iCMOVsr,
+ [(set GPR:$Rd,
+ (ARMcmov GPR:$false, so_reg_imm:$shift,
+ cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, so_reg_reg:$shift, pred:$p),
+ (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
4, IIC_iCMOVsr,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
- imm:$cc, CCR:$ccr))*/]>,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+ cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
let isMoveImm = 1 in
-def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
- 4, IIC_iMOVi,
- []>,
+def MOVCCi16
+ : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+ 4, IIC_iMOVi,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
+ cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
Sched<[WriteALU]>;
let isMoveImm = 1 in
def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, so_imm:$imm, pred:$p),
+ (ins GPR:$false, so_imm:$imm, cmovpred:$p),
4, IIC_iCMOVi,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm,
+ cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
// Two instruction predicate mov immediate.
let isMoveImm = 1 in
-def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, i32imm:$src, pred:$p),
- 8, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">;
+def MOVCCi32imm
+ : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, i32imm:$src, cmovpred:$p),
+ 8, IIC_iCMOVix2,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
let isMoveImm = 1 in
def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, so_imm:$imm, pred:$p),
+ (ins GPR:$false, so_imm:$imm, cmovpred:$p),
4, IIC_iCMOVi,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
+ [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm,
+ cmovpred:$p))]>,
RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
} // neverHasSideEffects
@@ -4221,7 +4278,7 @@ def instsyncb_opt : Operand<i32> {
// memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
- "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+ "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf57ff05;
@@ -4230,7 +4287,7 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
}
def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
- "dsb", "\t$opt", []>,
+ "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
Requires<[IsARM, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf57ff04;
@@ -4246,124 +4303,219 @@ def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
let Inst{3-0} = opt;
}
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in
-def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+ def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
-let usesCustomInserter = 1 in {
- let Defs = [CPSR] in {
+// Atomic pseudo-insts which will be lowered to ldrex/strex loops.
+// (64-bit pseudos use a hand-written selection code).
+ let mayLoad = 1, mayStore = 1 in {
def ATOMIC_LOAD_ADD_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_SUB_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_AND_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_OR_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_XOR_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_NAND_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_MIN_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_MAX_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_UMIN_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_umin_8 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_UMAX_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_umax_8 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_SWAP_I8 : PseudoInst<
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_CMP_SWAP_I8 : PseudoInst<
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_ADD_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_SUB_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_AND_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_OR_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_XOR_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_NAND_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_MIN_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_MAX_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_UMIN_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_umin_16 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_UMAX_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_umax_16 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_SWAP_I16 : PseudoInst<
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_CMP_SWAP_I16 : PseudoInst<
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_ADD_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_SUB_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_AND_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_OR_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_XOR_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_NAND_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
- [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_MIN_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_MAX_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_UMIN_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_LOAD_UMAX_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
- [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>;
-
- def ATOMIC_SWAP_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
- [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
- def ATOMIC_SWAP_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
- [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_SWAP_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
- [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
-
- def ATOMIC_CMP_SWAP_I8 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
- [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
- def ATOMIC_CMP_SWAP_I16 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
- [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+ NoItinerary, []>;
def ATOMIC_CMP_SWAP_I32 : PseudoInst<
- (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
- [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
-}
+ (outs GPR:$dst),
+ (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_ADD_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_SUB_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_AND_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_OR_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_XOR_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_NAND_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_MIN_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_MAX_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_UMIN_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_LOAD_UMAX_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_SWAP_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
+ def ATOMIC_CMP_SWAP_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
+ GPR:$set1, GPR:$set2, i32imm:$ordering),
+ NoItinerary, []>;
+ }
+ let mayLoad = 1 in
+ def ATOMIC_LOAD_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, i32imm:$ordering),
+ NoItinerary, []>;
+ let mayStore = 1 in
+ def ATOMIC_STORE_I64 : PseudoInst<
+ (outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+ NoItinerary, []>;
}
let usesCustomInserter = 1 in {
@@ -4402,8 +4554,7 @@ def strex_4 : PatFrag<(ops node:$val, node:$ptr),
let mayLoad = 1 in {
def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
- NoItinerary,
- "ldrexb", "\t$Rt, $addr",
+ NoItinerary, "ldrexb", "\t$Rt, $addr",
[(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldrexh", "\t$Rt, $addr",
@@ -4412,10 +4563,22 @@ def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "ldrex", "\t$Rt, $addr",
[(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
let hasExtraDefRegAllocReq = 1 in
-def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
let DecoderMethod = "DecodeDoubleRegLoad";
}
+
+def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldaexb", "\t$Rt, $addr", []>;
+def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldaexh", "\t$Rt, $addr", []>;
+def LDAEX : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldaex", "\t$Rt, $addr", []>;
+let hasExtraDefRegAllocReq = 1 in
+def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+ NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
+ let DecoderMethod = "DecodeDoubleRegLoad";
+}
}
let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
@@ -4434,8 +4597,22 @@ def STREXD : AIstrex<0b01, (outs GPR:$Rd),
NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
let DecoderMethod = "DecodeDoubleRegStore";
}
+def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
+ []>;
+def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
+ []>;
+def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
+ []>;
+let hasExtraSrcRegAllocReq = 1 in
+def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
+ (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+ NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> {
+ let DecoderMethod = "DecodeDoubleRegStore";
+}
}
-
def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
[(int_arm_clrex)]>,
@@ -4452,12 +4629,43 @@ def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
(STREXH GPR:$Rt, addr_offset_none:$addr)>;
+class acquiring_load<PatFrag base>
+ : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_load_acquire_8 : acquiring_load<atomic_load_8>;
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
+
+class releasing_store<PatFrag base>
+ : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+ return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_store_release_8 : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+
+let AddedComplexity = 8 in {
+ def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr), (LDAB addr_offset_none:$addr)>;
+ def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>;
+ def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA addr_offset_none:$addr)>;
+ def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (STLB GPR:$val, addr_offset_none:$addr)>;
+ def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>;
+ def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>;
+}
+
// SWP/SWPB are deprecated in V6/V7.
let mayLoad = 1, mayStore = 1 in {
def SWP : AIswp<0, (outs GPRnopc:$Rt),
- (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>;
+ (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
+ Requires<[PreV8]>;
def SWPB: AIswp<1, (outs GPRnopc:$Rt),
- (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>;
+ (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
+ Requires<[PreV8]>;
}
//===----------------------------------------------------------------------===//
@@ -4468,7 +4676,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ imm:$CRm, imm:$opc2)]>,
+ Requires<[PreV8]> {
bits<4> opc1;
bits<4> CRn;
bits<4> CRd;
@@ -4489,7 +4698,8 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
- imm:$CRm, imm:$opc2)]> {
+ imm:$CRm, imm:$opc2)]>,
+ Requires<[PreV8]> {
let Inst{31-28} = 0b1111;
bits<4> opc1;
bits<4> CRn;
@@ -4667,10 +4877,10 @@ defm LDC : LdStCop <1, 0, "ldc">;
defm LDCL : LdStCop <1, 1, "ldcl">;
defm STC : LdStCop <0, 0, "stc">;
defm STCL : LdStCop <0, 1, "stcl">;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2">;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l">;
-defm STC2 : LdSt2Cop<0, 0, "stc2">;
-defm STC2L : LdSt2Cop<0, 1, "stc2l">;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>;
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register.
@@ -4703,7 +4913,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>;
+ imm:$CRm, imm:$opc2)]>,
+ ComplexDeprecationPredicate<"MCR">;
def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
@@ -4746,14 +4957,16 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>;
+ imm:$CRm, imm:$opc2)]>,
+ Requires<[PreV8]>;
def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0)>;
def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
- imm0_7:$opc2), []>;
+ imm0_7:$opc2), []>,
+ Requires<[PreV8]>;
def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm",
(MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
@@ -4790,7 +5003,8 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
: ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
- !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+ !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
+ Requires<[PreV8]> {
let Inst{31-28} = 0b1111;
let Inst{23-21} = 0b010;
let Inst{20} = direction;
@@ -5341,4 +5555,5 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
// 'it' blocks in ARM mode just validate the predicates. The IT itself
// is discarded.
-def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>;
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
+ ComplexDeprecationPredicate<"IT">;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index af4f4d1..43bd4c2 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -2355,17 +2355,36 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
- : N2Vnp<op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+ : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm),
itin, OpcodeStr, Dt, ResTy, OpTy,
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
- : N2Vnp<op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm),
+ : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm),
itin, OpcodeStr, Dt, ResTy, OpTy,
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+// Similar to NV2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+ bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2Vnp<op19_18, op17_16, op10_8, op7, op6, (outs QPR:$Vd), (ins QPR:$Vm),
+ itin, OpcodeStr, Dt, ResTy, OpTy,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+ bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+ ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+ : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+ (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+ itin, OpcodeStr, Dt, ResTy, OpTy,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+ let Constraints = "$src = $Vd";
+}
+
// Narrow 2-register operations.
class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
@@ -2534,7 +2553,7 @@ class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
string Dt, ValueType ResTy, ValueType OpTy,
SDPatternOperator IntOp, bit Commutable>
: N3Vnp<op27_23, op21_20, op11_8, op6, op4,
- (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt,
+ (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
ResTy, OpTy, IntOp, Commutable,
[(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
@@ -2592,6 +2611,19 @@ class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
ResTy, OpTy, IntOp, Commutable,
[(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+// Same as N3VQIntnp but with Vd as a src register.
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp, bit Commutable>
+ : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+ (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr,
+ Dt, ResTy, OpTy, IntOp, Commutable,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
+ (OpTy QPR:$Vm))))]> {
+ let Constraints = "$src = $Vd";
+}
+
class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -2842,6 +2874,7 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
let isCommutable = Commutable;
}
+
class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode>
@@ -2897,6 +2930,17 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
let isCommutable = Commutable;
}
+
+// Same as above, but not predicated.
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+ bit op4, InstrItinClass itin, string OpcodeStr,
+ string Dt, ValueType ResTy, ValueType OpTy,
+ SDPatternOperator IntOp, bit Commutable>
+ : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+ (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+ ResTy, OpTy, IntOp, Commutable,
+ [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -3973,12 +4017,18 @@ defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
"vqadd", "u", int_arm_neon_vqaddu, 1>;
// VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q)
-defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i",
- int_arm_neon_vaddhn, 1>;
+defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
// VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
defm VRADDHN : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
int_arm_neon_vraddhn, 1>;
+def : Pat<(v8i8 (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+ (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+ (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+ (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
// Vector Multiply Operations.
// VMUL : Vector Multiply (integer, polynomial and floating-point)
@@ -4016,6 +4066,17 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+ (VMULslfd DPR:$Rn,
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+ (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+ (VMULslfq QPR:$Rn,
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+ (i32 0))>;
+
+
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
@@ -4061,12 +4122,18 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
(SubReg_i32_lane imm:$lane)))>;
// VMULL : Vector Multiply Long (integer and polynomial) (Q = D * D)
-defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "s", NEONvmulls, 1>;
-defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "u", NEONvmullu, 1>;
-def VMULLp : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
- v8i16, v8i8, int_arm_neon_vmullp, 1>;
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+ DecoderNamespace = "NEONData" in {
+ defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vmull", "s", NEONvmulls, 1>;
+ defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+ "vmull", "u", NEONvmullu, 1>;
+ def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+ v8i16, v8i8, int_arm_neon_vmullp, 1>;
+ def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
+ "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
+ Requires<[HasV8, HasCrypto]>;
+}
defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
@@ -4133,8 +4200,27 @@ defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
// VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
- "vqdmlal", "s", int_arm_neon_vqdmlal>;
-defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
+ "vqdmlal", "s", null_frag>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))))),
+ (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))))),
+ (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ imm:$lane)))))),
+ (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ imm:$lane)))))),
+ (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
// VMLS : Vector Multiply Subtract (integer and floating-point)
defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -4190,25 +4276,44 @@ defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
- "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
-defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
+ "vqdmlsl", "s", null_frag>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))))),
+ (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))))),
+ (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+ (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+ (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ imm:$lane)))))),
+ (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+ (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+ (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ imm:$lane)))))),
+ (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
v2f32, fmul_su, fadd_mlx>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
v4f32, fmul_su, fadd_mlx>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
// Fused Vector Multiply Subtract (floating-point)
def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
v2f32, fmul_su, fsub_mlx>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
v4f32, fmul_su, fsub_mlx>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
@@ -4256,12 +4361,18 @@ defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
"vqsub", "u", int_arm_neon_vqsubu, 0>;
// VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q)
-defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
- int_arm_neon_vsubhn, 0>;
+defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
// VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
defm VRSUBHN : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
int_arm_neon_vrsubhn, 0>;
+def : Pat<(v8i8 (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+ (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+ (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+ (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
// Vector Comparisons.
// VCEQ : Vector Compare Equal
@@ -5047,10 +5158,10 @@ def VSWPq : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
// Vector Move Operations.
// VMOV : Vector Move (Register)
-def : InstAlias<"vmov${p} $Vd, $Vm",
- (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
-def : InstAlias<"vmov${p} $Vd, $Vm",
- (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+ (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+ (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
// VMOV : Vector Move (Immediate)
@@ -5461,6 +5572,25 @@ def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
}
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0",
+ (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0",
+ (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0",
+ (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0",
+ (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0",
+ (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0",
+ (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0",
+ (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0",
+ (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+
// VCVT : Vector Convert Between Half-Precision and Single-Precision.
def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
IIC_VUNAQ, "vcvt", "f16.f32",
@@ -5725,9 +5855,9 @@ multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
}
}
- def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+ def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
(!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
- def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+ def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
(!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
}
@@ -5738,6 +5868,49 @@ defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+// Cryptography instructions
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+ DecoderNamespace = "v8Crypto" in {
+ class AES<string op, bit op7, bit op6, SDPatternOperator Int>
+ : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+ !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
+ : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+ !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+ SDPatternOperator Int>
+ : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+ SDPatternOperator Int>
+ : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+ !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+ Requires<[HasV8, HasCrypto]>;
+ class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
+ : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
+ !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
+ Requires<[HasV8, HasCrypto]>;
+}
+
+def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
+def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
+def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
+def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, int_arm_neon_sha1h>;
+def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
+def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, int_arm_neon_sha1c>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, int_arm_neon_sha1m>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, int_arm_neon_sha1p>;
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
+def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
+def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
+def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+
//===----------------------------------------------------------------------===//
// NEON instructions for single-precision FP math
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index e7218c6..af5ef53 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -269,25 +269,26 @@ class T1SystemEncoding<bits<8> opc>
let Inst{7-0} = opc;
}
-def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
- T1SystemEncoding<0x00>, // A8.6.110
- Requires<[IsThumb2]>;
-
-def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
- T1SystemEncoding<0x10>, // A8.6.410
- Requires<[IsThumb2]>;
-
-def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
- T1SystemEncoding<0x20>, // A8.6.408
- Requires<[IsThumb2]>;
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", []>,
+ T1SystemEncoding<0x00>,
+ Requires<[IsThumb, HasV6M]> {
+ bits<4> imm;
+ let Inst{7-4} = imm;
+}
-def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
- T1SystemEncoding<0x30>, // A8.6.409
- Requires<[IsThumb2]>;
+class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> {
+ let Predicates = [IsThumb, HasV6M];
+}
-def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
- T1SystemEncoding<0x40>, // A8.6.157
- Requires<[IsThumb2]>;
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
+ let Predicates = [IsThumb2, HasV8];
+}
+def : T2Pat<(int_arm_sevl), (tHINT 5)>;
// The imm operand $val can be used by a debugger to store more information
// about the breakpoint.
@@ -300,8 +301,15 @@ def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
let Inst{7-0} = val;
}
+def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
+ []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
+ let Inst{9-6} = 0b1010;
+ bits<6> val;
+ let Inst{5-0} = val;
+}
+
def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
- []>, T1Encoding<0b101101> {
+ []>, T1Encoding<0b101101>, Deprecated<HasV8Ops> {
bits<1> end;
// A8.6.156
let Inst{9-5} = 0b10010;
@@ -491,7 +499,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
bits<11> target;
let Inst{10-0} = target;
- }
+ let AsmMatchConverter = "cvtThumbBranches";
+ }
// Far jump
// Just a pseudo for a tBL instruction. Needed to let regalloc know about
@@ -521,6 +530,7 @@ let isBranch = 1, isTerminator = 1 in
bits<8> target;
let Inst{11-8} = p;
let Inst{7-0} = target;
+ let AsmMatchConverter = "cvtThumbBranches";
}
@@ -660,9 +670,6 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
let Inst{7-0} = addr;
}
-def : tInstAlias<"ldr${p}.n $Rt, $addr",
- (tLDRpci tGPR:$Rt, t_addrmode_pc:$addr, pred:$p), 0>;
-
// A8.6.194 & A8.6.192
defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
t_addrmode_is4, AddrModeT1_4,
@@ -1205,9 +1212,9 @@ def tUXTH : // A8.6.264
// Expanded after instruction selection into a branch sequence.
let usesCustomInserter = 1 in // Expanded after instruction selection.
def tMOVCCr_pseudo :
- PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
- NoItinerary,
- [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+ PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
+ NoItinerary,
+ [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
// tLEApcrel - Load a pc-relative address into a register without offending the
// assembler.
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 84086a5..48acffd 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -465,6 +465,18 @@ class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
let Inst{3-0} = Rm;
}
+class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin,
+ string asm, list<dag> pattern>
+ : T2XI<oops, iops, itin, asm, pattern> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{11-8} = Rd;
+ let Inst{19-16} = Rn;
+ let Inst{3-0} = Rm;
+}
+
class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T2sI<oops, iops, itin, opc, asm, pattern> {
@@ -1396,6 +1408,32 @@ def t2LDRHT : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
+class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
+ opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+ bits<4> Rt;
+ bits<4> addr;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-24} = 0b000;
+ let Inst{23-20} = bits23_20;
+ let Inst{11-6} = 0b111110;
+ let Inst{5-4} = bit54;
+ let Inst{3-0} = 0b1111;
+
+ // Encode instruction operands
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+}
+
+def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
+ (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
+ (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
+ (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+
// Store
defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
BinOpFrag<(store node:$LHS, node:$RHS)>>;
@@ -1539,6 +1577,31 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
"$addr.base = $wb", []>;
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+ string opc, string asm, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+ asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+ bits<4> Rt;
+ bits<4> addr;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0001100;
+ let Inst{11-6} = 0b111110;
+ let Inst{5-4} = bit54;
+ let Inst{3-0} = 0b1111;
+
+ // Encode instruction operands
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+}
+
+def t2STL : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+ "stl", "\t$Rt, $addr", []>;
+def t2STLB : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+ "stlb", "\t$Rt, $addr", []>;
+def t2STLH : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+ "stlh", "\t$Rt, $addr", []>;
+
// T2Ipl (Preload Data/Instruction) signals the memory system of possible future
// data/instruction access.
// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
@@ -1855,6 +1918,9 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
let DecoderMethod = "DecodeT2MOVTWInstruction";
}
+def : t2InstAlias<"mov${p} $Rd, $imm",
+ (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
+
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
@@ -2950,6 +3016,34 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
Requires<[HasT2ExtractPack, IsThumb2]>;
//===----------------------------------------------------------------------===//
+// CRC32 Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W} 0x04C11DB7
+// + CRC32C{B,H,W} 0x1EDC6F41
+//
+
+class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+ : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary,
+ !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"),
+ [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>,
+ Requires<[IsThumb2, HasV8, HasCRC]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-21} = 0b010110;
+ let Inst{20} = C;
+ let Inst{15-12} = 0b1111;
+ let Inst{7-6} = 0b10;
+ let Inst{5-4} = sz;
+}
+
+def t2CRC32B : T2I_crc32<0, 0b00, "b", int_arm_crc32b>;
+def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def t2CRC32H : T2I_crc32<0, 0b01, "h", int_arm_crc32h>;
+def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def t2CRC32W : T2I_crc32<0, 0b10, "w", int_arm_crc32w>;
+def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
// Comparison Instructions...
//
defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
@@ -3029,93 +3123,67 @@ defm t2TEQ : T2I_cmp_irs<0b0100, "teq",
BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
// Conditional moves
-// FIXME: should be able to write a pattern for ARMcmov, but can't use
-// a two-value operand where a dag node expects two operands. :(
let neverHasSideEffects = 1 in {
let isCommutable = 1, isSelect = 1 in
def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
- (ins rGPR:$false, rGPR:$Rm, pred:$p),
+ (ins rGPR:$false, rGPR:$Rm, cmovpred:$p),
4, IIC_iCMOVr,
- [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">,
- Sched<[WriteALU]>;
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
let isMoveImm = 1 in
-def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
- (ins rGPR:$false, t2_so_imm:$imm, pred:$p),
+def t2MOVCCi
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
4, IIC_iCMOVi,
-[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
-// FIXME: Pseudo-ize these. For now, just mark codegen only.
let isCodeGenOnly = 1 in {
let isMoveImm = 1 in
-def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
- IIC_iCMOVi,
- "movw", "\t$Rd, $imm", []>,
- RegConstraint<"$false = $Rd">, Sched<[WriteALU]> {
- let Inst{31-27} = 0b11110;
- let Inst{25} = 1;
- let Inst{24-21} = 0b0010;
- let Inst{20} = 0; // The S bit.
- let Inst{15} = 0;
-
- bits<4> Rd;
- bits<16> imm;
-
- let Inst{11-8} = Rd;
- let Inst{19-16} = imm{15-12};
- let Inst{26} = imm{11};
- let Inst{14-12} = imm{10-8};
- let Inst{7-0} = imm{7-0};
-}
+def t2MOVCCi16
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
let isMoveImm = 1 in
-def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst),
- (ins rGPR:$false, i32imm:$src, pred:$p),
- IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">;
+def t2MVNCCi
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVi,
+ [(set rGPR:$Rd,
+ (ARMcmov rGPR:$false, t2_so_imm_not:$imm,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+class MOVCCShPseudo<SDPatternOperator opnode, Operand ty>
+ : t2PseudoInst<(outs rGPR:$Rd),
+ (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p),
+ 4, IIC_iCMOVsi,
+ [(set rGPR:$Rd, (ARMcmov rGPR:$false,
+ (opnode rGPR:$Rm, (i32 ty:$imm)),
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def t2MOVCClsl : MOVCCShPseudo<shl, imm0_31>;
+def t2MOVCClsr : MOVCCShPseudo<srl, imm_sr>;
+def t2MOVCCasr : MOVCCShPseudo<sra, imm_sr>;
+def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>;
let isMoveImm = 1 in
-def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
- IIC_iCMOVi, "mvn", "\t$Rd, $imm",
-[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm,
- imm:$cc, CCR:$ccr))*/]>,
- RegConstraint<"$false = $Rd">, Sched<[WriteALU]> {
- let Inst{31-27} = 0b11110;
- let Inst{25} = 0;
- let Inst{24-21} = 0b0011;
- let Inst{20} = 0; // The S bit.
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15} = 0;
-}
-
-class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern>, Sched<[WriteALU]> {
- let Inst{31-27} = 0b11101;
- let Inst{26-25} = 0b01;
- let Inst{24-21} = 0b0010;
- let Inst{20} = 0; // The S bit.
- let Inst{19-16} = 0b1111; // Rn
- let Inst{5-4} = opcod; // Shift type.
-}
-def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd),
- (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
- IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>,
- RegConstraint<"$false = $Rd">;
-def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd),
- (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
- IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>,
- RegConstraint<"$false = $Rd">;
-def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd),
- (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
- IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>,
- RegConstraint<"$false = $Rd">;
-def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
- (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
- IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>,
- RegConstraint<"$false = $Rd">;
+def t2MOVCCi32imm
+ : t2PseudoInst<(outs rGPR:$dst),
+ (ins rGPR:$false, i32imm:$src, cmovpred:$p),
+ 8, IIC_iCMOVix2,
+ [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src,
+ cmovpred:$p))]>,
+ RegConstraint<"$false = $dst">;
} // isCodeGenOnly = 1
} // neverHasSideEffects
@@ -3127,7 +3195,7 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
// memory barriers protect the atomic sequences
let hasSideEffects = 1 in {
def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
- "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+ "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
Requires<[HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f5;
@@ -3136,7 +3204,8 @@ def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
}
def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
- "dsb", "\t$opt", []>, Requires<[HasDB]> {
+ "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+ Requires<[HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f4;
let Inst{3-0} = opt;
@@ -3149,15 +3218,14 @@ def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
let Inst{3-0} = opt;
}
-class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
InstrItinClass itin, string opc, string asm, string cstr,
list<dag> pattern, bits<4> rt2 = 0b1111>
: Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
let Inst{31-27} = 0b11101;
let Inst{26-20} = 0b0001101;
let Inst{11-8} = rt2;
- let Inst{7-6} = 0b01;
- let Inst{5-4} = opcod;
+ let Inst{7-4} = opcod;
let Inst{3-0} = 0b1111;
bits<4> addr;
@@ -3165,15 +3233,14 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
let Inst{19-16} = addr;
let Inst{15-12} = Rt;
}
-class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
InstrItinClass itin, string opc, string asm, string cstr,
list<dag> pattern, bits<4> rt2 = 0b1111>
: Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
let Inst{31-27} = 0b11101;
let Inst{26-20} = 0b0001100;
let Inst{11-8} = rt2;
- let Inst{7-6} = 0b01;
- let Inst{5-4} = opcod;
+ let Inst{7-4} = opcod;
bits<4> Rd;
bits<4> addr;
@@ -3184,11 +3251,11 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
}
let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexb", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexh", "\t$Rt, $addr", "",
[(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
@@ -3206,7 +3273,7 @@ def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
let Inst{7-0} = addr{7-0};
}
let hasExtraDefRegAllocReq = 1 in
-def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
+def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
(ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexd", "\t$Rt, $Rt2, $addr", "",
@@ -3214,16 +3281,48 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
+def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "ldaexb", "\t$Rt, $addr", "",
+ []>, Requires<[IsThumb, HasV8]>;
+def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "ldaexh", "\t$Rt, $addr", "",
+ []>, Requires<[IsThumb, HasV8]>;
+def t2LDAEX : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "ldaex", "\t$Rt, $addr", "",
+ []>, Requires<[IsThumb, HasV8]> {
+ bits<4> Rt;
+ bits<4> addr;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0001101;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+ let Inst{11-8} = 0b1111;
+ let Inst{7-0} = 0b11101111;
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
+ (ins addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "ldaexd", "\t$Rt, $Rt2, $addr", "",
+ [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+ bits<4> Rt2;
+ let Inst{11-8} = Rt2;
+
+ let Inst{7} = 1;
+}
}
let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
-def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
+def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexb", "\t$Rd, $Rt, $addr", "",
[(set rGPR:$Rd, (strex_1 rGPR:$Rt,
addr_offset_none:$addr))]>;
-def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
+def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
(ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexh", "\t$Rd, $Rt, $addr", "",
@@ -3247,7 +3346,7 @@ def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
let Inst{7-0} = addr{7-0};
}
let hasExtraSrcRegAllocReq = 1 in
-def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
+def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
@@ -3255,6 +3354,42 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
bits<4> Rt2;
let Inst{11-8} = Rt2;
}
+def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlexb", "\t$Rd, $Rt, $addr", "",
+ []>, Requires<[IsThumb, HasV8]>;
+
+def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlexh", "\t$Rd, $Rt, $addr", "",
+ []>, Requires<[IsThumb, HasV8]>;
+
+def t2STLEX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+ addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlex", "\t$Rd, $Rt, $addr", "",
+ []>, Requires<[IsThumb, HasV8]> {
+ bits<4> Rd;
+ bits<4> Rt;
+ bits<4> addr;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0001100;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt;
+ let Inst{11-4} = 0b11111110;
+ let Inst{3-0} = Rd;
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
+ (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+ AddrModeNone, 4, NoItinerary,
+ "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+ {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+ bits<4> Rt2;
+ let Inst{11-8} = Rt2;
+}
}
def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
@@ -3336,13 +3471,14 @@ def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
let Inst{12} = 1;
bits<24> target;
- let Inst{26} = target{19};
- let Inst{11} = target{18};
- let Inst{13} = target{17};
+ let Inst{26} = target{23};
+ let Inst{13} = target{22};
+ let Inst{11} = target{21};
let Inst{25-16} = target{20-11};
let Inst{10-0} = target{10-0};
let DecoderMethod = "DecodeT2BInstruction";
-}
+ let AsmMatchConverter = "cvtThumbBranches";
+}
let isNotDuplicable = 1, isIndirectBranch = 1 in {
def t2BR_JT : t2PseudoInst<(outs),
@@ -3410,6 +3546,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
let Inst{10-0} = target{11-1};
let DecoderMethod = "DecodeThumb2BCCInstruction";
+ let AsmMatchConverter = "cvtThumbBranches";
}
// Tail calls. The IOS version of thumb tail calls uses a t2 branch, so
@@ -3428,7 +3565,8 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
let Defs = [ITSTATE] in
def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
AddrModeNone, 2, IIC_iALUx,
- "it$mask\t$cc", "", []> {
+ "it$mask\t$cc", "", []>,
+ ComplexDeprecationPredicate<"IT"> {
// 16-bit instruction.
let Inst{31-16} = 0x0000;
let Inst{15-8} = 0b10111111;
@@ -3502,27 +3640,34 @@ class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
let M = 1 in
def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
- "$imod.w\t$iflags, $mode">;
+ "$imod\t$iflags, $mode">;
let mode = 0, M = 0 in
def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
"$imod.w\t$iflags">;
let imod = 0, iflags = 0, M = 1 in
def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
+def : t2InstAlias<"cps$imod.w $iflags, $mode",
+ (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>;
+def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
+
// A6.3.4 Branches and miscellaneous control
// Table A6-14 Change Processor State, and hint instructions
-def t2HINT : T2I<(outs), (ins imm0_4:$imm), NoItinerary, "hint", "\t$imm",[]> {
- bits<3> imm;
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",[]> {
+ bits<8> imm;
let Inst{31-3} = 0b11110011101011111000000000000;
- let Inst{2-0} = imm;
+ let Inst{7-0} = imm;
}
-def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_4:$imm, pred:$p)>;
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>;
def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> {
+ let Predicates = [IsThumb2, HasV8];
+}
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
bits<4> opt;
@@ -3545,6 +3690,20 @@ def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
let Inst{19-16} = opt;
}
+class T2DCPS<bits<2> opt, string opc>
+ : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> {
+ let Inst{31-27} = 0b11110;
+ let Inst{26-20} = 0b1111000;
+ let Inst{19-16} = 0b1111;
+ let Inst{15-12} = 0b1000;
+ let Inst{11-2} = 0b0000000000;
+ let Inst{1-0} = opt;
+}
+
+def t2DCPS1 : T2DCPS<0b01, "dcps1">;
+def t2DCPS2 : T2DCPS<0b10, "dcps2">;
+def t2DCPS3 : T2DCPS<0b11, "dcps3">;
+
class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T2I<oops, iops, itin, opc, asm, pattern> {
@@ -3600,9 +3759,12 @@ def t2RFEIA : T2RFE<0b111010011001,
[/* For disassembly only; pattern left blank */]>;
// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
-let Defs = [PC], Uses = [LR] in
+// Exception return instruction is "subs pc, lr, #imm".
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
- "subs", "\tpc, lr, $imm", []>, Requires<[IsThumb2]> {
+ "subs", "\tpc, lr, $imm",
+ [(ARMintretflag imm0_255:$imm)]>,
+ Requires<[IsThumb2]> {
let Inst{31-8} = 0b111100111101111010001111;
bits<8> imm;
@@ -3752,10 +3914,10 @@ defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc">;
defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl">;
defm t2STC : t2LdStCop<0b1110, 0, 0, "stc">;
defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl">;
-defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2">;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">;
-defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2">;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8]>;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8]>;
//===----------------------------------------------------------------------===//
@@ -3767,7 +3929,7 @@ defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
//
// A/R class can only move from CPSR or SPSR.
def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
- []>, Requires<[IsThumb2,IsARClass]> {
+ []>, Requires<[IsThumb2,IsNotMClass]> {
bits<4> Rd;
let Inst{31-12} = 0b11110011111011111000;
let Inst{11-8} = Rd;
@@ -3777,7 +3939,7 @@ def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
- []>, Requires<[IsThumb2,IsARClass]> {
+ []>, Requires<[IsThumb2,IsNotMClass]> {
bits<4> Rd;
let Inst{31-12} = 0b11110011111111111000;
let Inst{11-8} = Rd;
@@ -3810,7 +3972,7 @@ def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
// the mask with the fields to be accessed in the special register.
def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
NoItinerary, "msr", "\t$mask, $Rn", []>,
- Requires<[IsThumb2,IsARClass]> {
+ Requires<[IsThumb2,IsNotMClass]> {
bits<5> mask;
bits<4> Rn;
let Inst{31-21} = 0b11110011100;
@@ -3892,7 +4054,8 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
(ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>;
+ imm:$CRm, imm:$opc2)]>,
+ ComplexDeprecationPredicate<"MCR">;
def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
@@ -3900,7 +4063,9 @@ def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
(outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
- imm:$CRm, imm:$opc2)]>;
+ imm:$CRm, imm:$opc2)]> {
+ let Predicates = [IsThumb2, PreV8];
+}
def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
@@ -3915,7 +4080,9 @@ def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
(outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
- c_imm:$CRm, imm0_7:$opc2), []>;
+ c_imm:$CRm, imm0_7:$opc2), []> {
+ let Predicates = [IsThumb2, PreV8];
+}
def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
(t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
@@ -3933,17 +4100,22 @@ def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0,
imm:$CRm)]>;
def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0,
[(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
- GPR:$Rt2, imm:$CRm)]>;
+ GPR:$Rt2, imm:$CRm)]> {
+ let Predicates = [IsThumb2, PreV8];
+}
+
/* from coprocessor to ARM core register */
def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>;
-def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1>;
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1> {
+ let Predicates = [IsThumb2, PreV8];
+}
//===----------------------------------------------------------------------===//
// Other Coprocessor Instructions.
//
-def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
"cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
@@ -3964,6 +4136,8 @@ def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
let Inst{15-12} = CRd;
let Inst{19-16} = CRn;
let Inst{23-20} = opc1;
+
+ let Predicates = [IsThumb2, PreV8];
}
def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
@@ -3987,6 +4161,8 @@ def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
let Inst{15-12} = CRd;
let Inst{19-16} = CRn;
let Inst{23-20} = opc1;
+
+ let Predicates = [IsThumb2, PreV8];
}
@@ -4060,6 +4236,15 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
(t2STRs GPR:$val, t2addrmode_so_reg:$addr)>;
+let AddedComplexity = 8 in {
+ def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr), (t2LDAB addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val), (t2STLB GPR:$val, addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+ def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL GPR:$val, addr_offset_none:$addr)>;
+}
+
//===----------------------------------------------------------------------===//
// Assembler aliases
@@ -4081,7 +4266,8 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
// Aliases for ADD without the ".w" optional width specifier.
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
- (t2ADDri rGPR:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+ (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
+ cc_out:$s)>;
def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
(t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
@@ -4156,9 +4342,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
(t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
-def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
-def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
-def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>;
// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
// width specifier.
@@ -4185,7 +4371,7 @@ def : t2InstAlias<"ldrsh${p} $Rt, $addr",
(t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
def : t2InstAlias<"ldr${p} $Rt, $addr",
- (t2LDRpci GPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+ (t2LDRpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
def : t2InstAlias<"ldrb${p} $Rt, $addr",
(t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
def : t2InstAlias<"ldrh${p} $Rt, $addr",
@@ -4347,16 +4533,16 @@ def : t2InstAlias<"mvn${p} $Rd, $imm",
(t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
// Same for AND <--> BIC
def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
- (t2ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+ (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
- (t2ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+ (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
- (t2BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+ (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
def : t2InstAlias<"and${s}${p} $Rdn, $imm",
- (t2BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+ (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, t2_so_imm_neg" -> sub
def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
@@ -4398,7 +4584,7 @@ def : t2InstAlias<"adr${p} $Rd, $addr",
// LDR(literal) w/ alternate [pc, #imm] syntax.
def t2LDRpcrel : t2AsmPseudo<"ldr${p} $Rt, $addr",
- (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+ (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
def t2LDRBpcrel : t2AsmPseudo<"ldrb${p} $Rt, $addr",
(ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
def t2LDRHpcrel : t2AsmPseudo<"ldrh${p} $Rt, $addr",
@@ -4409,7 +4595,7 @@ def t2LDRSHpcrel : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
(ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
// Version w/ the .w suffix.
def : t2InstAlias<"ldr${p}.w $Rt, $addr",
- (t2LDRpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
+ (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
(t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index f9cfa15..a8cdc5c 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -333,45 +333,52 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
let D = VFPNeonA8Domain;
}
-multiclass vsel_inst<string op, bits<2> opc> {
- let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+ let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+ Uses = [CPSR], AddedComplexity = 4 in {
def S : ASbInp<0b11100, opc, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
- []>, Requires<[HasV8FP]>;
+ [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+ Requires<[HasFPARMv8]>;
def D : ADbInp<0b11100, opc, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
- []>, Requires<[HasV8FP]>;
+ [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
}
}
-defm VSELGT : vsel_inst<"gt", 0b11>;
-defm VSELGE : vsel_inst<"ge", 0b10>;
-defm VSELEQ : vsel_inst<"eq", 0b00>;
-defm VSELVS : vsel_inst<"vs", 0b01>;
+// The CC constants here match ARMCC::CondCodes.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
-multiclass vmaxmin_inst<string op, bit opc> {
+multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
def S : ASbInp<0b11101, 0b00, opc,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
- []>, Requires<[HasV8FP]>;
+ [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>,
+ Requires<[HasFPARMv8]>;
def D : ADbInp<0b11101, 0b00, opc,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
- []>, Requires<[HasV8FP]>;
+ [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>,
+ Requires<[HasFPARMv8, HasDPVFP]>;
}
}
-defm VMAXNM : vmaxmin_inst<"vmaxnm", 0>;
-defm VMINNM : vmaxmin_inst<"vminnm", 1>;
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>;
// Match reassociated forms only if not sign dependent rounding.
def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
- (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+ (VNMULD DPR:$a, DPR:$b)>,
+ Requires<[NoHonorSignDependentRounding,HasDPVFP]>;
def : Pat<(fmul (fneg SPR:$a), SPR:$b),
(VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
@@ -502,6 +509,8 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
let Inst{11-8} = 0b1011;
let Inst{7-6} = 0b11;
let Inst{4} = 0;
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
// Between half, single and double-precision. For disassembly only.
@@ -532,7 +541,7 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
// Instruction operands.
bits<5> Sm;
@@ -544,7 +553,7 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
// Instruction operands.
bits<5> Sd;
bits<5> Dm;
@@ -559,7 +568,7 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
// Instruction operands.
bits<5> Sm;
@@ -571,7 +580,7 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
// Instruction operands.
bits<5> Sd;
bits<5> Dm;
@@ -588,21 +597,21 @@ multiclass vcvt_inst<string opc, bits<2> rm> {
def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8]> {
let Inst{17-16} = rm;
}
def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8]> {
let Inst{17-16} = rm;
}
def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
bits<5> Dm;
let Inst{17-16} = rm;
@@ -616,7 +625,7 @@ multiclass vcvt_inst<string opc, bits<2> rm> {
def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
(outs SPR:$Sd), (ins DPR:$Dm),
NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
bits<5> Dm;
let Inst{17-16} = rm;
@@ -652,17 +661,24 @@ multiclass vrint_inst_zrx<string opc, bit op, bit op2> {
def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8]> {
let Inst{7} = op2;
let Inst{16} = op;
}
def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
let Inst{7} = op2;
let Inst{16} = op;
}
+
+ def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>,
+ Requires<[HasFPARMv8]>;
+ def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>,
+ Requires<[HasFPARMv8,HasDPVFP]>;
}
defm VRINTZ : vrint_inst_zrx<"z", 0, 1>;
@@ -674,21 +690,23 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm> {
def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8]> {
let Inst{17-16} = rm;
}
def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
- []>, Requires<[HasV8FP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]> {
let Inst{17-16} = rm;
}
}
def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
- (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>;
+ (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>,
+ Requires<[HasFPARMv8]>;
def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
- (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+ (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>,
+ Requires<[HasFPARMv8,HasDPVFP]>;
}
defm VRINTA : vrint_inst_anpm<"a", 0b00>;
@@ -885,6 +903,8 @@ class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{5} = Sm{0};
let Inst{15-12} = Dd{3-0};
let Inst{22} = Dd{4};
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -956,6 +976,8 @@ class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{5} = Dm{4};
let Inst{15-12} = Sd{4-1};
let Inst{22} = Sd{0};
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1077,6 +1099,8 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{4};
let Inst{15-12} = dst{3-0};
+
+ let Predicates = [HasVFP2, HasDPVFP];
}
def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
@@ -1189,7 +1213,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1205,7 +1229,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
@@ -1216,7 +1240,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1232,7 +1256,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1243,7 +1267,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1259,7 +1283,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1270,7 +1294,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1285,7 +1309,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1299,7 +1323,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1314,7 +1338,7 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1323,7 +1347,7 @@ def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
// (fma x, y, z) -> (vfms z, x, y)
def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
(VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
(VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1334,7 +1358,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1349,7 +1373,7 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1358,14 +1382,14 @@ def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
// (fma (fneg x), y, z) -> (vfms z, x, y)
def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
(VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
// (fma x, (fneg y), z) -> (vfms z, x, y)
def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
(VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1376,7 +1400,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1391,7 +1415,7 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1400,14 +1424,14 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
// (fneg (fma x, y, z)) -> (vfnma z, x, y)
def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
(VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1418,7 +1442,7 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1432,7 +1456,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
- Requires<[HasVFP4,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1442,21 +1466,21 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
// (fma x, y, (fneg z)) -> (vfnms z, x, y)
def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
// (fneg (fma x, (fneg y), z)) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
(VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
- Requires<[HasVFP4]>;
+ Requires<[HasVFP4,HasDPVFP]>;
def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
@@ -1466,15 +1490,17 @@ def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
//
let neverHasSideEffects = 1 in {
-def VMOVDcc : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p),
- 4, IIC_fpUNA64,
- [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>,
- RegConstraint<"$Dn = $Dd">;
-
-def VMOVScc : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p),
- 4, IIC_fpUNA32,
- [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
- RegConstraint<"$Sn = $Sd">;
+def VMOVDcc : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+ IIC_fpUNA64,
+ [(set (f64 DPR:$Dd),
+ (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+ RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+ IIC_fpUNA32,
+ [(set (f32 SPR:$Sd),
+ (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+ RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
} // neverHasSideEffects
//===----------------------------------------------------------------------===//
@@ -1520,6 +1546,8 @@ let Uses = [FPSCR] in {
"vmrs", "\t$Rt, mvfr0", []>;
def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
"vmrs", "\t$Rt, mvfr1", []>;
+ def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+ "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
"vmrs", "\t$Rt, fpinst", []>;
def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
@@ -1573,7 +1601,8 @@ let isReMaterializable = 1 in {
def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
VFPMiscFrm, IIC_fpUNA64,
"vmov", ".f64\t$Dd, $imm",
- [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+ [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+ Requires<[HasVFP3,HasDPVFP]> {
bits<5> Dd;
bits<8> imm;
@@ -1655,23 +1684,23 @@ def : VFP2MnemonicAlias<"fmrx", "vmrs">;
def : VFP2MnemonicAlias<"fmxr", "vmsr">;
// Be friendly and accept the old form of zero-compare
-def : VFP2InstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
(VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"faddd${p} $Dd, $Dn, $Dm",
- (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+ (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
(VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"fsubd${p} $Dd, $Dn, $Dm",
- (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+ (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
// No need for the size suffix on VSQRT. It's implied by the register classes.
def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
// VLDR/VSTR accept an optional type suffix.
def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 1803a8a..61596d5 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -90,6 +90,10 @@ namespace {
typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
typedef MemOpQueue::iterator MemOpQueueIter;
+ void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
+ const MemOpQueue &MemOps, unsigned DefReg,
+ unsigned RangeBegin, unsigned RangeEnd);
+
bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
int Offset, unsigned Base, bool BaseKill, int Opcode,
ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
@@ -360,6 +364,62 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
return true;
}
+/// \brief Find all instructions using a given imp-def within a range.
+///
+/// We are trying to combine a range of instructions, one of which (located at
+/// position RangeBegin) implicitly defines a register. The final LDM/STM will
+/// be placed at RangeEnd, and so any uses of this definition between RangeStart
+/// and RangeEnd must be modified to use an undefined value.
+///
+/// The live range continues until we find a second definition or one of the
+/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
+/// we must consider all uses and decide which are relevant in a second pass.
+void ARMLoadStoreOpt::findUsesOfImpDef(
+ SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
+ unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
+ std::map<unsigned, MachineOperand *> Uses;
+ unsigned LastLivePos = RangeEnd;
+
+ // First we find all uses of this register with Position between RangeBegin
+ // and RangeEnd; any or all of these could be uses of a definition at
+ // RangeBegin. We also record the latest position a definition at RangeBegin
+ // would be considered live.
+ for (unsigned i = 0; i < MemOps.size(); ++i) {
+ MachineInstr &MI = *MemOps[i].MBBI;
+ unsigned MIPosition = MemOps[i].Position;
+ if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
+ continue;
+
+ // If this instruction defines the register, then any later use will be of
+ // that definition rather than ours.
+ if (MI.definesRegister(DefReg))
+ LastLivePos = std::min(LastLivePos, MIPosition);
+
+ MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
+ if (!UseOp)
+ continue;
+
+ // If this instruction kills the register then (assuming liveness is
+ // correct when we start) we don't need to think about anything after here.
+ if (UseOp->isKill())
+ LastLivePos = std::min(LastLivePos, MIPosition);
+
+ Uses[MIPosition] = UseOp;
+ }
+
+ // Now we traverse the list of all uses, and append the ones that actually use
+ // our definition to the requested list.
+ for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
+ E = Uses.end();
+ I != E; ++I) {
+ // List is sorted by position so once we've found one out of range there
+ // will be no more to consider.
+ if (I->first > LastLivePos)
+ break;
+ UsesOfImpDefs.push_back(I->second);
+ }
+}
+
// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
// success.
void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
@@ -392,6 +452,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
SmallVector<std::pair<unsigned, bool>, 8> Regs;
SmallVector<unsigned, 8> ImpDefs;
+ SmallVector<MachineOperand *, 8> UsesOfImpDefs;
for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
unsigned Reg = memOps[i].Reg;
// If we are inserting the merged operation after an operation that
@@ -406,6 +467,12 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
unsigned DefReg = MO->getReg();
if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
ImpDefs.push_back(DefReg);
+
+ // There may be other uses of the definition between this instruction and
+ // the eventual LDM/STM position. These should be marked undef if the
+ // merge takes place.
+ findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
+ insertPos);
}
}
@@ -418,6 +485,16 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
// Merge succeeded, update records.
Merges.push_back(prior(Loc));
+
+ // In gathering loads together, we may have moved the imp-def of a register
+ // past one of its uses. This is OK, since we know better than the rest of
+ // LLVM what's OK with ARM loads and stores; but we still have to adjust the
+ // affected uses.
+ for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
+ E = UsesOfImpDefs.end();
+ I != E; ++I)
+ (*I)->setIsUndef();
+
for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
// Remove kill flags from any memops that come before insertPos.
if (Regs[i-memOpsBegin].second) {
@@ -489,7 +566,10 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
if (Reg != ARM::SP &&
NewOffset == Offset + (int)Size &&
((isNotVFP && RegNum > PRegNum) ||
- ((Count < Limit) && RegNum == PRegNum+1))) {
+ ((Count < Limit) && RegNum == PRegNum+1)) &&
+ // On Swift we don't want vldm/vstm to start with an odd register number
+ // because unaligned Q-register vldm/vstm need more uops.
+ (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
Offset += Size;
PRegNum = RegNum;
++Count;
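For context, a minimal sketch of the Swift-specific guard added in this hunk; the helper name canGrowRun and its boolean parameters are hypothetical stand-ins for the subtarget query and local variables used in the condition above.

// Hypothetical restatement of the new condition (not part of the patch):
// on Swift, a run of VFP registers may only keep growing if it did not
// start at an odd register number, since, as the comment above notes,
// unaligned Q-register vldm/vstm need more uops.
static bool canGrowRun(bool IsSwift, bool IsNotVFP, unsigned Count,
                       unsigned PRegNum) {
  if (!IsSwift || IsNotVFP)
    return true;               // restriction only applies to VFP regs on Swift
  if (Count != 1)
    return true;               // only checked when the run has one register
  return (PRegNum & 0x1) == 0; // first register must be even (d0, d2, ...)
}

In the patch itself this check is folded directly into the existing if condition rather than split out into a helper.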
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index b641483..e12c9c6 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -82,7 +82,7 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
MO.getMBB()->getSymbol(), OutContext));
break;
case MachineOperand::MO_GlobalAddress:
- MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+ MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
break;
case MachineOperand::MO_ExternalSymbol:
MCOp = GetSymbolRef(MO,
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index d9ec4fd..010edf3 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -84,12 +84,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
unsigned GPRCS2Size;
unsigned DPRCSSize;
- /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
- /// which belong to these spill areas.
- BitVector GPRCS1Frames;
- BitVector GPRCS2Frames;
- BitVector DPRCSFrames;
-
/// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
/// the aligned portion of the stack frame. This is always a contiguous
/// sequence of D-registers starting from d8.
@@ -128,7 +122,6 @@ public:
LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
- GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
NumAlignedDPRCS2Regs(0),
JumpTableUId(0), PICLabelUId(0),
VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
@@ -141,7 +134,6 @@ public:
LRSpilledForFarJump(false),
FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
- GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
JumpTableUId(0), PICLabelUId(0),
VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
@@ -190,59 +182,6 @@ public:
void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
void setDPRCalleeSavedAreaSize(unsigned s) { DPRCSSize = s; }
- bool isGPRCalleeSavedArea1Frame(int fi) const {
- if (fi < 0 || fi >= (int)GPRCS1Frames.size())
- return false;
- return GPRCS1Frames[fi];
- }
- bool isGPRCalleeSavedArea2Frame(int fi) const {
- if (fi < 0 || fi >= (int)GPRCS2Frames.size())
- return false;
- return GPRCS2Frames[fi];
- }
- bool isDPRCalleeSavedAreaFrame(int fi) const {
- if (fi < 0 || fi >= (int)DPRCSFrames.size())
- return false;
- return DPRCSFrames[fi];
- }
-
- void addGPRCalleeSavedArea1Frame(int fi) {
- if (fi >= 0) {
- int Size = GPRCS1Frames.size();
- if (fi >= Size) {
- Size *= 2;
- if (fi >= Size)
- Size = fi+1;
- GPRCS1Frames.resize(Size);
- }
- GPRCS1Frames[fi] = true;
- }
- }
- void addGPRCalleeSavedArea2Frame(int fi) {
- if (fi >= 0) {
- int Size = GPRCS2Frames.size();
- if (fi >= Size) {
- Size *= 2;
- if (fi >= Size)
- Size = fi+1;
- GPRCS2Frames.resize(Size);
- }
- GPRCS2Frames[fi] = true;
- }
- }
- void addDPRCalleeSavedAreaFrame(int fi) {
- if (fi >= 0) {
- int Size = DPRCSFrames.size();
- if (fi >= Size) {
- Size *= 2;
- if (fi >= Size)
- Size = fi+1;
- DPRCSFrames.resize(Size);
- }
- DPRCSFrames[fi] = true;
- }
- }
-
unsigned createJumpTableUId() {
return JumpTableUId++;
}
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index bb7d358..d045761 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -172,6 +172,7 @@ def ITSTATE : ARMReg<4, "itstate">;
// Special Registers - only available in privileged mode.
def FPSID : ARMReg<0, "fpsid">;
+def MVFR2 : ARMReg<5, "mvfr2">;
def MVFR1 : ARMReg<6, "mvfr1">;
def MVFR0 : ARMReg<7, "mvfr0">;
def FPEXC : ARMReg<8, "fpexc">;
@@ -251,7 +252,7 @@ def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
// to the saved value before the tail call, which would clobber a call address.
// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
// this class and the preceding one(!) This is what we want.
-def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> {
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
let AltOrders = [(and tcGPR, tGPR)];
let AltOrderSelect = [{
return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 74ee50b..603e775 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1879,6 +1879,10 @@ def CortexA9Itineraries : ProcessorItineraries<
// The following definitions describe the simpler per-operand machine model.
// This works with MachineScheduler and will eventually replace itineraries.
+class A9WriteLMOpsListType<list<WriteSequence> writes> {
+ list <WriteSequence> Writes = writes;
+ SchedMachineModel SchedModel = ?;
+}
// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
def CortexA9Model : SchedMachineModel {
@@ -2011,7 +2015,7 @@ def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
// Define a predicate to select the LDM based on number of memory addresses.
def A9LMAdr#NumAddr#Pred :
- SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+ SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>;
} // foreach NumAddr
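A quick arithmetic check of the revised predicate (illustrative only; the bucketing into pairs is inferred from the A9WriteLM comment below about one pair of writes per 64-bit value loaded):

// (getNumLDMAddresses(MI) + 1) / 2 rounds the address count up to pairs,
// so an LDM with 1 or 2 addresses now matches A9LMAdr1Pred, 3 or 4 match
// A9LMAdr2Pred, and so on up to 15 or 16 matching A9LMAdr8Pred.
static_assert((1 + 1) / 2 == 1 && (2 + 1) / 2 == 1, "1-2 addresses -> 1");
static_assert((3 + 1) / 2 == 2 && (4 + 1) / 2 == 2, "3-4 addresses -> 2");
static_assert((15 + 1) / 2 == 8 && (16 + 1) / 2 == 8, "15-16 addresses -> 8");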
@@ -2054,48 +2058,30 @@ def A9WriteL#NumAddr#Hi : WriteSequence<
//===----------------------------------------------------------------------===//
// LDM: Load multiple into 32-bit integer registers.
+def A9WriteLMOpsList : A9WriteLMOpsListType<
+ [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi,
+ A9WriteL7, A9WriteL7Hi,
+ A9WriteL8, A9WriteL8Hi]>;
+
// A9WriteLM variants expand into a pair of writes for each 64-bit
// value loaded. When the number of registers is odd, the last
// A9WriteLnHi is naturally ignored because the instruction has no
// following def operands. These variants take no issue resource, so
// they may need to be part of a WriteSequence that includes A9WriteIssue.
def A9WriteLM : SchedWriteVariant<[
- SchedVar<A9LMAdr1Pred, [A9WriteL1, A9WriteL1Hi]>,
- SchedVar<A9LMAdr2Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi]>,
- SchedVar<A9LMAdr3Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi,
- A9WriteL3, A9WriteL3Hi]>,
- SchedVar<A9LMAdr4Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi,
- A9WriteL3, A9WriteL3Hi,
- A9WriteL4, A9WriteL4Hi]>,
- SchedVar<A9LMAdr5Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi,
- A9WriteL3, A9WriteL3Hi,
- A9WriteL4, A9WriteL4Hi,
- A9WriteL5, A9WriteL5Hi]>,
- SchedVar<A9LMAdr6Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi,
- A9WriteL3, A9WriteL3Hi,
- A9WriteL4, A9WriteL4Hi,
- A9WriteL5, A9WriteL5Hi,
- A9WriteL6, A9WriteL6Hi]>,
- SchedVar<A9LMAdr7Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi,
- A9WriteL3, A9WriteL3Hi,
- A9WriteL4, A9WriteL4Hi,
- A9WriteL5, A9WriteL5Hi,
- A9WriteL6, A9WriteL6Hi,
- A9WriteL7, A9WriteL7Hi]>,
- SchedVar<A9LMAdr8Pred, [A9WriteL1, A9WriteL1Hi,
- A9WriteL2, A9WriteL2Hi,
- A9WriteL3, A9WriteL3Hi,
- A9WriteL4, A9WriteL4Hi,
- A9WriteL5, A9WriteL5Hi,
- A9WriteL6, A9WriteL6Hi,
- A9WriteL7, A9WriteL7Hi,
- A9WriteL8, A9WriteL8Hi]>,
+ SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
+ SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+ SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+ SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+ SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+ SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+ SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+ SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
// For unknown LDMs, define the maximum number of writes, but only
// make the first two consume resources.
SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
@@ -2177,49 +2163,39 @@ def A9WriteLMfp#NumAddr#Hi : WriteSequence<
// pair of writes for each 64-bit data loaded. When the number of
// registers is odd, the last WriteLMfpnHi is naturally ignored because
// the instruction has no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
+ [A9WriteLMfp1, A9WriteLMfp2, // 0-1
+ A9WriteLMfp3, A9WriteLMfp4, // 2-3
+ A9WriteLMfp5, A9WriteLMfp6, // 4-5
+ A9WriteLMfp7, A9WriteLMfp8, // 6-7
+ A9WriteLMfp1Hi, // 8-8
+ A9WriteLMfp2Hi, A9WriteLMfp2Hi, // 9-10
+ A9WriteLMfp3Hi, A9WriteLMfp3Hi, // 11-12
+ A9WriteLMfp4Hi, A9WriteLMfp4Hi, // 13-14
+ A9WriteLMfp5Hi, A9WriteLMfp5Hi, // 15-16
+ A9WriteLMfp6Hi, A9WriteLMfp6Hi, // 17-18
+ A9WriteLMfp7Hi, A9WriteLMfp7Hi, // 19-20
+ A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
def A9WriteLMfpPostRA : SchedWriteVariant<[
- SchedVar<A9LMAdr1Pred, [A9WriteLMfp1, A9WriteLMfp1Hi]>,
- SchedVar<A9LMAdr2Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi]>,
- SchedVar<A9LMAdr3Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3, A9WriteLMfp3Hi]>,
- SchedVar<A9LMAdr4Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3, A9WriteLMfp3Hi,
- A9WriteLMfp4, A9WriteLMfp4Hi]>,
- SchedVar<A9LMAdr5Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3, A9WriteLMfp3Hi,
- A9WriteLMfp4, A9WriteLMfp4Hi,
- A9WriteLMfp5, A9WriteLMfp5Hi]>,
- SchedVar<A9LMAdr6Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3, A9WriteLMfp3Hi,
- A9WriteLMfp4, A9WriteLMfp4Hi,
- A9WriteLMfp5, A9WriteLMfp5Hi,
- A9WriteLMfp6, A9WriteLMfp6Hi]>,
- SchedVar<A9LMAdr7Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3, A9WriteLMfp3Hi,
- A9WriteLMfp4, A9WriteLMfp4Hi,
- A9WriteLMfp5, A9WriteLMfp5Hi,
- A9WriteLMfp6, A9WriteLMfp6Hi,
- A9WriteLMfp7, A9WriteLMfp7Hi]>,
- SchedVar<A9LMAdr8Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3, A9WriteLMfp3Hi,
- A9WriteLMfp4, A9WriteLMfp4Hi,
- A9WriteLMfp5, A9WriteLMfp5Hi,
- A9WriteLMfp6, A9WriteLMfp6Hi,
- A9WriteLMfp7, A9WriteLMfp7Hi,
- A9WriteLMfp8, A9WriteLMfp8Hi]>,
+ SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
+ SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+ SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+ SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+ SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+ SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+ SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+ SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
// For unknown LDMs, define the maximum number of writes, but only
- // make the first two consume resources.
- SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp1Hi,
- A9WriteLMfp2, A9WriteLMfp2Hi,
- A9WriteLMfp3Hi, A9WriteLMfp3Hi,
- A9WriteLMfp4Hi, A9WriteLMfp4Hi,
+ // make the first two consume resources. We are optimizing for the case
+ // where the operands are DPRs, and this determines the first eight
+ // types. The remaining eight types are filled to cover the case
+ // where the operands are SPRs.
+ SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+ A9WriteLMfp3Hi, A9WriteLMfp4Hi,
+ A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+ A9WriteLMfp7Hi, A9WriteLMfp8Hi,
A9WriteLMfp5Hi, A9WriteLMfp5Hi,
A9WriteLMfp6Hi, A9WriteLMfp6Hi,
A9WriteLMfp7Hi, A9WriteLMfp7Hi,
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 2a41616..8d7dbc2 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1345,20 +1345,25 @@ let SchedModel = SwiftModel in {
// 4.2.20 Integer Load Signextended
def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
let Latency = 3;
+ let NumMicroOps = 2;
}
def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
let Latency = 4;
+ let NumMicroOps = 2;
}
def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01,
SwiftUnitP01]> {
let Latency = 4;
+ let NumMicroOps = 3;
}
def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
let Latency = 3;
+ let NumMicroOps = 2;
}
def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2,
- SwiftUnitP01]> {
+ SwiftUnitP01]> {
let Latency = 3;
+ let NumMicroOps = 3;
}
def SwiftWrBackOne : SchedWriteRes<[]> {
let Latency = 1;
@@ -1399,7 +1404,10 @@ let SchedModel = SwiftModel in {
def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
let Latency = Lat;
}
- def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> { let Latency = Lat; }
+ def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> {
+ let Latency = Lat;
+ let NumMicroOps = 0;
+ }
}
// Predicate.
foreach NumAddr = 1-16 in {
@@ -1520,6 +1528,7 @@ let SchedModel = SwiftModel in {
// 4.2.25 Integer Store, Multiple
def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
let Latency = 0;
+ let NumMicroOps = 2;
}
foreach NumAddr = 1-16 in {
def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 0a0f30c..8351c63 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -32,7 +32,7 @@ ReserveR9("arm-reserve-r9", cl::Hidden,
cl::desc("Reserve R9, making it unavailable as GPR"));
static cl::opt<bool>
-DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
+ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden);
static cl::opt<bool>
UseFusedMulOps("arm-use-mulops",
@@ -57,10 +57,28 @@ Align(cl::desc("Load/store alignment support"),
"Allow unaligned memory accesses"),
clEnumValEnd));
+enum ITMode {
+ DefaultIT,
+ RestrictedIT,
+ NoRestrictedIT
+};
+
+static cl::opt<ITMode>
+IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
+ cl::ZeroOrMore,
+ cl::values(clEnumValN(DefaultIT, "arm-default-it",
+ "Generate IT block based on arch"),
+ clEnumValN(RestrictedIT, "arm-restrict-it",
+ "Disallow deprecated IT based on ARMv8"),
+ clEnumValN(NoRestrictedIT, "arm-no-restrict-it",
+ "Allow IT blocks based on ARMv7"),
+ clEnumValEnd));
+
ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, const TargetOptions &Options)
: ARMGenSubtargetInfo(TT, CPU, FS)
, ARMProcFamily(Others)
+ , ARMProcClass(None)
, stackAlignment(4)
, CPUString(CPU)
, TargetTriple(TT)
@@ -75,13 +93,14 @@ void ARMSubtarget::initializeEnvironment() {
HasV5TOps = false;
HasV5TEOps = false;
HasV6Ops = false;
+ HasV6MOps = false;
HasV6T2Ops = false;
HasV7Ops = false;
HasV8Ops = false;
HasVFPv2 = false;
HasVFPv3 = false;
HasVFPv4 = false;
- HasV8FP = false;
+ HasFPARMv8 = false;
HasNEON = false;
UseNEONForSinglePrecisionFP = false;
UseMulOps = UseFusedMulOps;
@@ -90,7 +109,6 @@ void ARMSubtarget::initializeEnvironment() {
SlowFPBrcc = false;
InThumbMode = false;
HasThumb2 = false;
- IsMClass = false;
NoARM = false;
PostRAScheduler = false;
IsR9Reserved = ReserveR9;
@@ -107,9 +125,12 @@ void ARMSubtarget::initializeEnvironment() {
AvoidMOVsShifterOperand = false;
HasRAS = false;
HasMPExtension = false;
+ HasVirtualization = false;
FPOnlySP = false;
HasPerfMon = false;
HasTrustZone = false;
+ HasCrypto = false;
+ HasCRC = false;
AllowsUnalignedMem = false;
Thumb2DSP = false;
UseNaClTrap = false;
@@ -133,8 +154,13 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
}
void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
- if (CPUString.empty())
- CPUString = "generic";
+ if (CPUString.empty()) {
+ if (isTargetIOS() && TargetTriple.getArchName().endswith("v7s"))
+ // Default to the Swift CPU when targeting armv7s/thumbv7s.
+ CPUString = "swift";
+ else
+ CPUString = "generic";
+ }
// Insert the architecture feature derived from the target triple into the
// feature string. This is important for setting features that are implied
@@ -152,7 +178,7 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// Thumb2 implies at least V6T2. FIXME: Fix tests to explicitly specify a
// ARM version or CPU and then remove this.
if (!HasV6T2Ops && hasThumb2())
- HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6T2Ops = true;
+ HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6MOps = HasV6T2Ops = true;
// Keep a pointer to static instruction cost data for the specified CPU.
SchedModel = getSchedModelForCPU(CPUString);
@@ -169,11 +195,12 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isAAPCS_ABI())
stackAlignment = 8;
- if (!isTargetIOS())
- UseMovt = hasV6T2Ops();
- else {
+ UseMovt = hasV6T2Ops() && ArmUseMOVT;
+
+ if (!isTargetIOS()) {
+ IsR9Reserved = ReserveR9;
+ } else {
IsR9Reserved = ReserveR9 | !HasV6Ops;
- UseMovt = DarwinUseMOVT && hasV6T2Ops();
SupportsTailCall = !getTargetTriple().isOSVersionLT(5, 0);
}
@@ -207,6 +234,18 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
break;
}
+ switch (IT) {
+ case DefaultIT:
+ RestrictIT = hasV8Ops() ? true : false;
+ break;
+ case RestrictedIT:
+ RestrictIT = true;
+ break;
+ case NoRestrictedIT:
+ RestrictIT = false;
+ break;
+ }
+
// NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
uint64_t Bits = getFeatureBits();
if ((Bits & ARM::ProcA5 || Bits & ARM::ProcA8) && // Where this matters
@@ -271,12 +310,15 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
return SchedModel->MispredictPenalty;
}
+bool ARMSubtarget::hasSinCos() const {
+ return getTargetTriple().getOS() == Triple::IOS &&
+ !getTargetTriple().isOSVersionLT(7, 0);
+}
+
bool ARMSubtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const {
- Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
- CriticalPathRCs.clear();
- CriticalPathRCs.push_back(&ARM::GPRRegClass);
+ Mode = TargetSubtargetInfo::ANTIDEP_NONE;
return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index ad7f1b3..5276901 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -31,29 +31,36 @@ class TargetOptions;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift
+ Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift, CortexA53, CortexA57
+ };
+ enum ARMProcClassEnum {
+ None, AClass, RClass, MClass
};
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
ARMProcFamilyEnum ARMProcFamily;
+ /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+ ARMProcClassEnum ARMProcClass;
+
/// HasV4TOps, HasV5TOps, HasV5TEOps,
- /// HasV6Ops, HasV6T2Ops, HasV7Ops, HasV8Ops -
+ /// HasV6Ops, HasV6MOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
/// Specify whether target support specific ARM ISA variants.
bool HasV4TOps;
bool HasV5TOps;
bool HasV5TEOps;
bool HasV6Ops;
+ bool HasV6MOps;
bool HasV6T2Ops;
bool HasV7Ops;
bool HasV8Ops;
- /// HasVFPv2, HasVFPv3, HasVFPv4, HasV8FP, HasNEON - Specify what
+ /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
bool HasVFPv2;
bool HasVFPv3;
bool HasVFPv4;
- bool HasV8FP;
+ bool HasFPARMv8;
bool HasNEON;
/// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
@@ -82,10 +89,6 @@ protected:
/// HasThumb2 - True if Thumb2 instructions are supported.
bool HasThumb2;
- /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs -
- /// v6m, v7m for example.
- bool IsMClass;
-
/// NoARM - True if subtarget does not support ARM mode execution.
bool NoARM;
@@ -147,6 +150,10 @@ protected:
/// extension (ARMv7 only).
bool HasMPExtension;
+ /// HasVirtualization - True if the subtarget supports the Virtualization
+ /// extension.
+ bool HasVirtualization;
+
/// FPOnlySP - If true, the floating point unit only supports single
/// precision.
bool FPOnlySP;
@@ -159,11 +166,21 @@ protected:
/// HasTrustZone - if true, processor supports TrustZone security extensions
bool HasTrustZone;
+ /// HasCrypto - if true, processor supports Cryptography extensions
+ bool HasCrypto;
+
+ /// HasCRC - if true, processor supports CRC instructions
+ bool HasCRC;
+
/// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
/// accesses for some types. For details, see
/// ARMTargetLowering::allowsUnalignedMemoryAccesses().
bool AllowsUnalignedMem;
+ /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
+ /// blocks to conform to ARMv8 rule.
+ bool RestrictIT;
+
/// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith
/// and such) instructions in Thumb2 code.
bool Thumb2DSP;
@@ -228,6 +245,7 @@ public:
bool hasV5TOps() const { return HasV5TOps; }
bool hasV5TEOps() const { return HasV5TEOps; }
bool hasV6Ops() const { return HasV6Ops; }
+ bool hasV6MOps() const { return HasV6MOps; }
bool hasV6T2Ops() const { return HasV6T2Ops; }
bool hasV7Ops() const { return HasV7Ops; }
bool hasV8Ops() const { return HasV8Ops; }
@@ -246,8 +264,11 @@ public:
bool hasVFP2() const { return HasVFPv2; }
bool hasVFP3() const { return HasVFPv3; }
bool hasVFP4() const { return HasVFPv4; }
- bool hasV8FP() const { return HasV8FP; }
+ bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
+ bool hasCrypto() const { return HasCrypto; }
+ bool hasCRC() const { return HasCRC; }
+ bool hasVirtualization() const { return HasVirtualization; }
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP; }
@@ -255,6 +276,9 @@ public:
bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
+ bool hasAnyDataBarrier() const {
+ return HasDataBarrier || (hasV6Ops() && !isThumb());
+ }
bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
@@ -275,10 +299,10 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
- bool isTargetIOS() const { return TargetTriple.getOS() == Triple::IOS; }
+ bool isTargetIOS() const { return TargetTriple.isiOS(); }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
- bool isTargetNaCl() const { return TargetTriple.getOS() == Triple::NaCl; }
- bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetELF() const { return !isTargetDarwin(); }
// ARM EABI is the bare-metal EABI described in ARM ABI documents and
// can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
@@ -296,8 +320,9 @@ public:
bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
bool isThumb2() const { return InThumbMode && HasThumb2; }
bool hasThumb2() const { return HasThumb2; }
- bool isMClass() const { return IsMClass; }
- bool isARClass() const { return !IsMClass; }
+ bool isMClass() const { return ARMProcClass == MClass; }
+ bool isRClass() const { return ARMProcClass == RClass; }
+ bool isAClass() const { return ARMProcClass == AClass; }
bool isR9Reserved() const { return IsR9Reserved; }
@@ -306,9 +331,15 @@ public:
bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
+ bool restrictIT() const { return RestrictIT; }
+
const std::string & getCPUString() const { return CPUString; }
unsigned getMispredictionPenalty() const;
+
+ /// This function returns true if the target has sincos() routine in its
+ /// compiler runtime or math libraries.
+ bool hasSinCos() const;
/// enablePostRAScheduler - True at 'More' optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index e6dbcb6..be84bf6 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -196,8 +196,13 @@ bool ARMPassConfig::addPreSched2() {
addPass(createARMExpandPseudoPass());
if (getOptLevel() != CodeGenOpt::None) {
- if (!getARMSubtarget().isThumb1Only())
+ if (!getARMSubtarget().isThumb1Only()) {
+ // in v8, IfConversion depends on Thumb instruction widths
+ if (getARMSubtarget().restrictIT() &&
+ !getARMSubtarget().prefers32BitThumb())
+ addPass(createThumb2SizeReductionPass());
addPass(&IfConverterID);
+ }
}
if (getARMSubtarget().isThumb2())
addPass(createThumb2ITBlockPass());
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index dfdf6ab..7ec71b2 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -47,7 +47,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
MCStreamer &Streamer) const {
assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
- return MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+ return MCSymbolRefExpr::Create(getSymbol(*Mang, GV),
MCSymbolRefExpr::VK_ARM_TARGET2,
getContext());
}
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 34576ba..6bbb38f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -129,6 +129,9 @@ public:
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
OperandValueKind Op1Info = OK_AnyValue,
OperandValueKind Op2Info = OK_AnyValue) const;
+
+ unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const;
/// @}
};
@@ -182,7 +185,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
assert(ISD && "Invalid opcode");
// Single to/from double precision conversions.
- static const CostTblEntry<MVT> NEONFltDblTbl[] = {
+ static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = {
// Vector fptrunc/fpext conversions.
{ ISD::FP_ROUND, MVT::v2f64, 2 },
{ ISD::FP_EXTEND, MVT::v2f32, 2 },
@@ -192,8 +195,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
ISD == ISD::FP_EXTEND)) {
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
- int Idx = CostTableLookup<MVT>(NEONFltDblTbl, array_lengthof(NEONFltDblTbl),
- ISD, LT.second);
+ int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
if (Idx != -1)
return LT.first * NEONFltDblTbl[Idx].Cost;
}
@@ -207,7 +209,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
// Some arithmetic, load and store operations have specific instructions
// to cast up/down their types automatically at no extra cost.
// TODO: Get these tables to know at least what the related operations are.
- static const TypeConversionCostTblEntry<MVT> NEONVectorConversionTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ NEONVectorConversionTbl[] = {
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
@@ -283,15 +286,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
};
if (SrcTy.isVector() && ST->hasNEON()) {
- int Idx = ConvertCostTableLookup<MVT>(NEONVectorConversionTbl,
- array_lengthof(NEONVectorConversionTbl),
- ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT());
if (Idx != -1)
return NEONVectorConversionTbl[Idx].Cost;
}
// Scalar float to integer conversions.
- static const TypeConversionCostTblEntry<MVT> NEONFloatConversionTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ NEONFloatConversionTbl[] = {
{ ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
{ ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
{ ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
@@ -314,16 +317,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
};
if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
- int Idx = ConvertCostTableLookup<MVT>(NEONFloatConversionTbl,
- array_lengthof(NEONFloatConversionTbl),
- ISD, DstTy.getSimpleVT(),
- SrcTy.getSimpleVT());
+ int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT());
if (Idx != -1)
return NEONFloatConversionTbl[Idx].Cost;
}
// Scalar integer to float conversions.
- static const TypeConversionCostTblEntry<MVT> NEONIntegerConversionTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ NEONIntegerConversionTbl[] = {
{ ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
{ ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
@@ -347,16 +349,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
};
if (SrcTy.isInteger() && ST->hasNEON()) {
- int Idx = ConvertCostTableLookup<MVT>(NEONIntegerConversionTbl,
- array_lengthof(NEONIntegerConversionTbl),
- ISD, DstTy.getSimpleVT(),
- SrcTy.getSimpleVT());
+ int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT());
if (Idx != -1)
return NEONIntegerConversionTbl[Idx].Cost;
}
// Scalar integer conversion costs.
- static const TypeConversionCostTblEntry<MVT> ARMIntegerConversionTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ ARMIntegerConversionTbl[] = {
// i16 -> i64 requires two dependent operations.
{ ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
@@ -368,11 +369,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
};
if (SrcTy.isInteger()) {
- int Idx =
- ConvertCostTableLookup<MVT>(ARMIntegerConversionTbl,
- array_lengthof(ARMIntegerConversionTbl),
- ISD, DstTy.getSimpleVT(),
- SrcTy.getSimpleVT());
+ int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT());
if (Idx != -1)
return ARMIntegerConversionTbl[Idx].Cost;
}
@@ -400,7 +398,8 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
// On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
// Lowering of some vector selects is currently far from perfect.
- static const TypeConversionCostTblEntry<MVT> NEONVectorSelectTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ NEONVectorSelectTbl[] = {
{ ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
{ ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
{ ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
@@ -412,10 +411,9 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
EVT SelCondTy = TLI->getValueType(CondTy);
EVT SelValTy = TLI->getValueType(ValTy);
if (SelCondTy.isSimple() && SelValTy.isSimple()) {
- int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
- array_lengthof(NEONVectorSelectTbl),
- ISD, SelCondTy.getSimpleVT(),
- SelValTy.getSimpleVT());
+ int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
+ SelCondTy.getSimpleVT(),
+ SelValTy.getSimpleVT());
if (Idx != -1)
return NEONVectorSelectTbl[Idx].Cost;
}
@@ -448,7 +446,7 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
if (Kind != SK_Reverse)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
- static const CostTblEntry<MVT> NEONShuffleTbl[] = {
+ static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
// Reverse shuffle cost one instruction if we are shuffling within a double
// word (vrev) or two if we shuffle a quad word (vrev, vext).
{ ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
@@ -464,8 +462,7 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
- int Idx = CostTableLookup<MVT>(NEONShuffleTbl, array_lengthof(NEONShuffleTbl),
- ISD::VECTOR_SHUFFLE, LT.second);
+ int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
if (Idx == -1)
return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -480,7 +477,7 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
const unsigned FunctionCallDivCost = 20;
const unsigned ReciprocalDivCost = 10;
- static const CostTblEntry<MVT> CostTbl[] = {
+ static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = {
// Division.
// These costs are somewhat random. Choose a cost of 20 to indicate that
// vectorizing division (added function call) is going to be very expensive.
@@ -524,14 +521,37 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
int Idx = -1;
if (ST->hasNEON())
- Idx = CostTableLookup<MVT>(CostTbl, array_lengthof(CostTbl), ISDOpcode,
- LT.second);
+ Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second);
if (Idx != -1)
return LT.first * CostTbl[Idx].Cost;
-
- return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
- Op2Info);
+ unsigned Cost =
+ TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+
+ // This is somewhat of a hack. The problem that we are facing is that SROA
+ // creates a sequence of shift, and, or instructions to construct values.
+ // These sequences are recognized by the ISel and have zero-cost. Not so for
+ // the vectorized code. Because we have support for v2i64 but not i64 those
+ // sequences look particularly beneficial to vectorize.
+ // To work around this we increase the cost of v2i64 operations to make them
+ // seem less beneficial.
+ if (LT.second == MVT::v2i64 &&
+ Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+ Cost += 4;
+
+ return Cost;
}
+unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const {
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+ if (Src->isVectorTy() && Alignment != 16 &&
+ Src->getVectorElementType()->isDoubleTy()) {
+ // Unaligned loads/stores are extremely inefficient.
+ // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+ return LT.first * 4;
+ }
+ return LT.first;
+}
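To make the new ARM-specific cost hook concrete, a standalone restatement with a few worked data points; memoryOpCost here is a hypothetical helper mirroring the logic above, and the baseline of 1 assumes the type legalizes to a single part (LT.first == 1).

// Hypothetical mirror of ARMTTI::getMemoryOpCost (sketch, not the real API):
unsigned memoryOpCost(unsigned LegalizedParts, bool IsVector,
                      bool ElementIsDouble, unsigned Alignment) {
  if (IsVector && Alignment != 16 && ElementIsDouble)
    return LegalizedParts * 4; // vst1/vld1 path: ~4 uops vs. 1 for vstr/vldr
  return LegalizedParts;
}
// memoryOpCost(1, true, true, 8)   == 4   store <2 x double>, align 8
// memoryOpCost(1, true, true, 16)  == 1   store <2 x double>, align 16
// memoryOpCost(1, true, false, 8)  == 1   store <4 x float>,  align 8

The earlier v2i64 adjustment works the same way: a v2i64 operation whose second operand is a uniform constant simply gets 4 added to whatever the generic cost was.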
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 80e5c6e..e3f9e0d 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -7,6 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#include "ARMBuildAttrs.h"
+#include "ARMFPUName.h"
+#include "ARMFeatures.h"
#include "llvm/MC/MCTargetAsmParser.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
@@ -24,6 +27,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -47,8 +51,14 @@ enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
class ARMAsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+ const MCInstrInfo &MII;
const MCRegisterInfo *MRI;
+ ARMTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = getParser().getStreamer().getTargetStreamer();
+ return static_cast<ARMTargetStreamer &>(TS);
+ }
+
// Unwind directives state
SMLoc FnStartLoc;
SMLoc CantUnwindLoc;
@@ -66,6 +76,8 @@ class ARMAsmParser : public MCTargetAsmParser {
// Map of register aliases registers via the .req directive.
StringMap<unsigned> RegisterReqs;
+ bool NextSymbolIsThumb;
+
struct {
ARMCC::CondCodes Cond; // Condition for IT block.
unsigned Mask:4; // Condition mask for instructions.
@@ -127,6 +139,8 @@ class ARMAsmParser : public MCTargetAsmParser {
bool parseDirectiveUnreq(SMLoc L);
bool parseDirectiveArch(SMLoc L);
bool parseDirectiveEabiAttr(SMLoc L);
+ bool parseDirectiveCPU(SMLoc L);
+ bool parseDirectiveFPU(SMLoc L);
bool parseDirectiveFnStart(SMLoc L);
bool parseDirectiveFnEnd(SMLoc L);
bool parseDirectiveCantUnwind(SMLoc L);
@@ -139,7 +153,8 @@ class ARMAsmParser : public MCTargetAsmParser {
StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
bool &CarrySetting, unsigned &ProcessorIMod,
StringRef &ITMask);
- void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+ void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+ bool &CanAcceptCarrySet,
bool &CanAcceptPredicationCode);
bool isThumb() const {
@@ -158,6 +173,9 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasV6Ops() const {
return STI.getFeatureBits() & ARM::HasV6Ops;
}
+ bool hasV6MOps() const {
+ return STI.getFeatureBits() & ARM::HasV6MOps;
+ }
bool hasV7Ops() const {
return STI.getFeatureBits() & ARM::HasV7Ops;
}
@@ -221,6 +239,9 @@ class ARMAsmParser : public MCTargetAsmParser {
// Asm Match Converter Methods
void cvtThumbMultiply(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtThumbBranches(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+
bool validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
bool processInstruction(MCInst &Inst,
@@ -229,8 +250,6 @@ class ARMAsmParser : public MCTargetAsmParser {
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
bool shouldOmitPredicateOperand(StringRef Mnemonic,
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
- bool isDeprecated(MCInst &Inst, StringRef &Info);
-
public:
enum ARMMatchResultTy {
Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
@@ -242,8 +261,9 @@ public:
};
- ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser), FPReg(-1) {
+ ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), FPReg(-1) {
MCAsmParserExtension::Initialize(_Parser);
// Cache the MCRegisterInfo.
@@ -255,12 +275,7 @@ public:
// Not in an ITBlock to start with.
ITState.CurPosition = ~0U;
- // Set ELF header flags.
- // FIXME: This should eventually end up somewhere else where more
- // intelligent flag decisions can be made. For now we are just maintaining
- // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
- if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&Parser.getStreamer()))
- MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+ NextSymbolIsThumb = false;
}
// Implementation of the MCTargetAsmParser interface:
@@ -277,6 +292,8 @@ public:
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out, unsigned &ErrorInfo,
bool MatchingInlineAsm);
+ void onLabelParsed(MCSymbol *Symbol);
+
};
} // end anonymous namespace
@@ -601,7 +618,7 @@ public:
template<unsigned width, unsigned scale>
bool isUnsignedOffset() const {
if (!isImm()) return false;
- if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) return true;
+ if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
int64_t Val = CE->getValue();
int64_t Align = 1LL << scale;
@@ -610,6 +627,22 @@ public:
}
return false;
}
+ // checks whether this operand is a signed offset which fits in a field
+ // of specified width and scaled by a specific number of bits
+ template<unsigned width, unsigned scale>
+ bool isSignedOffset() const {
+ if (!isImm()) return false;
+ if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Val = CE->getValue();
+ int64_t Align = 1LL << scale;
+ int64_t Max = Align * ((1LL << (width-1)) - 1);
+ int64_t Min = -Align * (1LL << (width-1));
+ return ((Val % Align) == 0) && (Val >= Min) && (Val <= Max);
+ }
+ return false;
+ }
+
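As a worked check of the range computed above, using the template arguments that the new Thumb branch handling relies on later in this patch; minOff, maxOff, and offAlign are illustrative helpers, not part of the parser.

constexpr int64_t offAlign(unsigned scale) { return 1LL << scale; }
constexpr int64_t maxOff(unsigned width, unsigned scale) {
  return offAlign(scale) * ((1LL << (width - 1)) - 1);
}
constexpr int64_t minOff(unsigned width, unsigned scale) {
  return -offAlign(scale) * (1LL << (width - 1));
}
// isSignedOffset<11, 1> (Thumb-1 b):   even offsets in [-2048, 2046]
static_assert(minOff(11, 1) == -2048 && maxOff(11, 1) == 2046, "tB range");
// isSignedOffset<8, 1>  (Thumb-1 bcc): even offsets in [-256, 254]
static_assert(minOff(8, 1) == -256 && maxOff(8, 1) == 254, "tBcc range");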
// checks whether this operand is a memory operand computed as an offset
// applied to PC. the offset may have 8 bits of magnitude and is represented
// with two bits of shift. textually it may be either [pc, #imm], #imm or
@@ -628,7 +661,7 @@ public:
Val = Memory.OffsetImm->getValue();
}
else return false;
- return ((Val % 4) == 0) && (Val >= -1020) && (Val <= 1020);
+ return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
}
bool isFPImm() const {
if (!isImm()) return false;
@@ -658,13 +691,6 @@ public:
int64_t Value = CE->getValue();
return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
}
- bool isImm0_4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 5;
- }
bool isImm0_1020s4() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -687,6 +713,13 @@ public:
// explicitly exclude zero. we want that to use the normal 0_508 version.
return ((Value & 3) == 0) && Value > 0 && Value <= 508;
}
+ bool isImm0_239() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value >= 0 && Value < 240;
+ }
bool isImm0_255() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -848,6 +881,15 @@ public:
int64_t Value = CE->getValue();
return Value >= 0 && Value < 65536;
}
+ bool isImm256_65535Expr() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ // If it's not a constant expression, it'll generate a fixup and be
+ // handled later.
+ if (!CE) return true;
+ int64_t Value = CE->getValue();
+ return Value >= 256 && Value < 65536;
+ }
bool isImm0_65535Expr() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -927,7 +969,8 @@ public:
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
- return ARM_AM::getT2SOImmVal(~Value) != -1;
+ return ARM_AM::getT2SOImmVal(Value) == -1 &&
+ ARM_AM::getT2SOImmVal(~Value) != -1;
}
bool isT2SOImmNeg() const {
if (!isImm()) return false;
@@ -1773,8 +1816,6 @@ public:
void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
int32_t Imm = Memory.OffsetImm->getValue();
- // FIXME: Handle #-0
- if (Imm == INT32_MIN) Imm = 0;
Inst.addOperand(MCOperand::CreateImm(Imm));
}
@@ -2494,7 +2535,7 @@ void ARMOperand::print(raw_ostream &OS) const {
getImm()->print(OS);
break;
case k_MemBarrierOpt:
- OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
+ OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">";
break;
case k_InstSyncBarrierOpt:
OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
@@ -2831,8 +2872,9 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
return -1;
switch (Name[2]) {
default: return -1;
- case '0': return 10;
- case '1': return 11;
+ // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON)
+ case '0': return CoprocOp == 'p'? -1: 10;
+ case '1': return CoprocOp == 'p'? -1: 11;
case '2': return 12;
case '3': return 13;
case '4': return 14;
@@ -3439,18 +3481,27 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
.Case("sy", ARM_MB::SY)
.Case("st", ARM_MB::ST)
+ .Case("ld", ARM_MB::LD)
.Case("sh", ARM_MB::ISH)
.Case("ish", ARM_MB::ISH)
.Case("shst", ARM_MB::ISHST)
.Case("ishst", ARM_MB::ISHST)
+ .Case("ishld", ARM_MB::ISHLD)
.Case("nsh", ARM_MB::NSH)
.Case("un", ARM_MB::NSH)
.Case("nshst", ARM_MB::NSHST)
+ .Case("nshld", ARM_MB::NSHLD)
.Case("unst", ARM_MB::NSHST)
.Case("osh", ARM_MB::OSH)
.Case("oshst", ARM_MB::OSHST)
+ .Case("oshld", ARM_MB::OSHLD)
.Default(~0U);
+ // ishld, oshld, nshld and ld are only available from ARMv8.
+ if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD ||
+ Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD))
+ Opt = ~0U;
+
if (Opt == ~0U)
return MatchOperand_NoMatch;
@@ -3498,7 +3549,7 @@ parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (Tok.is(AsmToken::Identifier)) {
StringRef OptStr = Tok.getString();
- if (OptStr.lower() == "sy")
+ if (OptStr.equals_lower("sy"))
Opt = ARM_ISB::SY;
else
return MatchOperand_NoMatch;
@@ -4102,6 +4153,65 @@ cvtThumbMultiply(MCInst &Inst,
((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
}
+void ARMAsmParser::
+cvtThumbBranches(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ int CondOp = -1, ImmOp = -1;
+ switch(Inst.getOpcode()) {
+ case ARM::tB:
+ case ARM::tBcc: CondOp = 1; ImmOp = 2; break;
+
+ case ARM::t2B:
+ case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
+
+ default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+ }
+ // first decide whether or not the branch should be conditional
+ // by looking at its location relative to an IT block
+ if(inITBlock()) {
+ // inside an IT block we cannot have any conditional branches. Any
+ // such instruction needs to be converted to unconditional form
+ switch(Inst.getOpcode()) {
+ case ARM::tBcc: Inst.setOpcode(ARM::tB); break;
+ case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
+ }
+ } else {
+ // outside IT blocks we can only have unconditional branches with AL
+ // condition code or conditional branches with non-AL condition code
+ unsigned Cond = static_cast<ARMOperand*>(Operands[CondOp])->getCondCode();
+ switch(Inst.getOpcode()) {
+ case ARM::tB:
+ case ARM::tBcc:
+ Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc);
+ break;
+ case ARM::t2B:
+ case ARM::t2Bcc:
+ Inst.setOpcode(Cond == ARMCC::AL ? ARM::t2B : ARM::t2Bcc);
+ break;
+ }
+ }
+
+ // now decide on encoding size based on branch target range
+ switch(Inst.getOpcode()) {
+ // classify tB as either t2B or t1B based on range of immediate operand
+ case ARM::tB: {
+ ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
+ if(!op->isSignedOffset<11, 1>() && isThumbTwo())
+ Inst.setOpcode(ARM::t2B);
+ break;
+ }
+ // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
+ case ARM::tBcc: {
+ ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
+ if(!op->isSignedOffset<8, 1>() && isThumbTwo())
+ Inst.setOpcode(ARM::t2Bcc);
+ break;
+ }
+ }
+ ((ARMOperand*)Operands[ImmOp])->addImmOperands(Inst, 1);
+ ((ARMOperand*)Operands[CondOp])->addCondCodeOperands(Inst, 2);
+}
+
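// NOTE (illustrative, not part of this change): a minimal standalone sketch of
// the range test that isSignedOffset<Bits, Scale>() is assumed to perform in
// cvtThumbBranches() above: the target offset must be a multiple of 1<<Scale
// and fit in a signed (Bits+Scale)-bit value.  The helper name and exact
// bounds are assumptions made only for illustration.
#include <cstdint>
static bool fitsSignedOffset(int64_t Offset, unsigned Bits, unsigned Scale) {
  int64_t Align = int64_t(1) << Scale;
  if (Offset % Align != 0)
    return false;                                          // must be aligned
  int64_t Min = -(int64_t(1) << (Bits + Scale - 1));       // e.g. -2048 for <11, 1>
  int64_t Max = (int64_t(1) << (Bits + Scale - 1)) - Align; // e.g. +2046 for <11, 1>
  return Offset >= Min && Offset <= Max;
}
// fitsSignedOffset(2046, 11, 1) -> true  : a 16-bit tB can reach the target.
// fitsSignedOffset(4096, 11, 1) -> false : cvtThumbBranches() widens to t2B.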
/// Parse an ARM memory expression, return false if successful else return true
/// or an error. The first token must be a '[' when called.
bool ARMAsmParser::
@@ -4601,7 +4711,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
Mnemonic == "vmls" || Mnemonic == "vnmls" || Mnemonic == "vacge" ||
Mnemonic == "vcge" || Mnemonic == "vclt" || Mnemonic == "vacgt" ||
- Mnemonic == "vaclt" || Mnemonic == "vacle" ||
+ Mnemonic == "vaclt" || Mnemonic == "vacle" || Mnemonic == "hlt" ||
Mnemonic == "vcgt" || Mnemonic == "vcle" || Mnemonic == "smlal" ||
Mnemonic == "umaal" || Mnemonic == "umlal" || Mnemonic == "vabal" ||
Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
@@ -4688,8 +4798,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
//
// FIXME: It would be nice to autogen this.
void ARMAsmParser::
-getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
- bool &CanAcceptPredicationCode) {
+getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+ bool &CanAcceptCarrySet, bool &CanAcceptPredicationCode) {
if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
Mnemonic == "add" || Mnemonic == "adc" ||
@@ -4707,12 +4817,14 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
Mnemonic == "cps" || Mnemonic == "it" || Mnemonic == "cbz" ||
- Mnemonic == "trap" || Mnemonic == "setend" ||
+ Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic.startswith("crc32") ||
Mnemonic.startswith("cps") || Mnemonic.startswith("vsel") ||
Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
- Mnemonic == "vrintm") {
+ Mnemonic == "vrintm" || Mnemonic.startswith("aes") ||
+ Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
+ (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
// These mnemonics are never predicable
CanAcceptPredicationCode = false;
} else if (!isThumb()) {
@@ -4726,7 +4838,10 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
Mnemonic != "stc2" && Mnemonic != "stc2l" &&
!Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
} else if (isThumbOne()) {
- CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
+ if (hasV6MOps())
+ CanAcceptPredicationCode = Mnemonic != "movs";
+ else
+ CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
} else
CanAcceptPredicationCode = true;
}
@@ -4877,14 +4992,6 @@ bool ARMAsmParser::shouldOmitPredicateOperand(
return false;
}
-bool ARMAsmParser::isDeprecated(MCInst &Inst, StringRef &Info) {
- if (hasV8Ops() && Inst.getOpcode() == ARM::SETEND) {
- Info = "armv8";
- return true;
- }
- return false;
-}
-
static bool isDataTypeToken(StringRef Tok) {
return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
@@ -4980,7 +5087,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// the matcher deal with finding the right instruction or generating an
// appropriate error.
bool CanAcceptCarrySet, CanAcceptPredicationCode;
- getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode);
+ getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode);
// If we had a carry-set on an instruction that can't do that, issue an
// error.
@@ -5115,8 +5222,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// expressed as a GPRPair, so we have to manually merge them.
// FIXME: We would really like to be able to tablegen'erate this.
if (!isThumb() && Operands.size() > 4 &&
- (Mnemonic == "ldrexd" || Mnemonic == "strexd")) {
- bool isLoad = (Mnemonic == "ldrexd");
+ (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+ Mnemonic == "stlexd")) {
+ bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
unsigned Idx = isLoad ? 2 : 3;
ARMOperand* Op1 = static_cast<ARMOperand*>(Operands[Idx]);
ARMOperand* Op2 = static_cast<ARMOperand*>(Operands[Idx+1]);
@@ -5200,45 +5308,44 @@ static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
return false;
}
-// FIXME: We would really prefer to have MCInstrInfo (the wrapper around
-// the ARMInsts array) instead. Getting that here requires awkward
-// API changes, though. Better way?
-namespace llvm {
-extern const MCInstrDesc ARMInsts[];
-}
-static const MCInstrDesc &getInstDesc(unsigned Opcode) {
- return ARMInsts[Opcode];
+// Return true if the instruction has the interesting property of being
+// allowed in IT blocks, but not being predicable.
+static bool instIsBreakpoint(const MCInst &Inst) {
+ return Inst.getOpcode() == ARM::tBKPT ||
+ Inst.getOpcode() == ARM::BKPT ||
+ Inst.getOpcode() == ARM::tHLT ||
+ Inst.getOpcode() == ARM::HLT;
}
// FIXME: We would really like to be able to tablegen'erate this.
bool ARMAsmParser::
validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
SMLoc Loc = Operands[0]->getStartLoc();
+
// Check the IT block state first.
- // NOTE: BKPT instruction has the interesting property of being
- // allowed in IT blocks, but not being predicable. It just always
- // executes.
- if (inITBlock() && Inst.getOpcode() != ARM::tBKPT &&
- Inst.getOpcode() != ARM::BKPT) {
- unsigned bit = 1;
+ // NOTE: BKPT and HLT instructions have the interesting property of being
+ // allowed in IT blocks, but not being predicable. They just always execute.
+ if (inITBlock() && !instIsBreakpoint(Inst)) {
+ unsigned Bit = 1;
if (ITState.FirstCond)
ITState.FirstCond = false;
else
- bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+ Bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
// The instruction must be predicable.
if (!MCID.isPredicable())
return Error(Loc, "instructions in IT block must be predicable");
unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
- unsigned ITCond = bit ? ITState.Cond :
+ unsigned ITCond = Bit ? ITState.Cond :
ARMCC::getOppositeCondition(ITState.Cond);
if (Cond != ITCond) {
// Find the condition code Operand to get its SMLoc information.
SMLoc CondLoc;
- for (unsigned i = 1; i < Operands.size(); ++i)
- if (static_cast<ARMOperand*>(Operands[i])->isCondCode())
- CondLoc = Operands[i]->getStartLoc();
+ for (unsigned I = 1; I < Operands.size(); ++I)
+ if (static_cast<ARMOperand*>(Operands[I])->isCondCode())
+ CondLoc = Operands[I]->getStartLoc();
return Error(CondLoc, "incorrect condition in IT block; got '" +
StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
"', but expected '" +
@@ -5247,20 +5354,55 @@ validateInstruction(MCInst &Inst,
// Check for non-'al' condition codes outside of the IT block.
} else if (isThumbTwo() && MCID.isPredicable() &&
Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
- ARMCC::AL && Inst.getOpcode() != ARM::tB &&
- Inst.getOpcode() != ARM::t2B)
+ ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
+ Inst.getOpcode() != ARM::t2Bcc)
return Error(Loc, "predicated instructions must be in IT block");
- switch (Inst.getOpcode()) {
+ const unsigned Opcode = Inst.getOpcode();
+ switch (Opcode) {
case ARM::LDRD:
case ARM::LDRD_PRE:
case ARM::LDRD_POST: {
+ const unsigned RtReg = Inst.getOperand(0).getReg();
+
+ // Rt can't be R14.
+ if (RtReg == ARM::LR)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt can't be R14");
+
+ const unsigned Rt = MRI->getEncodingValue(RtReg);
+ // Rt must be even-numbered.
+ if ((Rt & 1) == 1)
+ return Error(Operands[3]->getStartLoc(),
+ "Rt must be even-numbered");
+
// Rt2 must be Rt + 1.
- unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
- unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
if (Rt2 != Rt + 1)
return Error(Operands[3]->getStartLoc(),
"destination operands must be sequential");
+
+ if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
+ const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+ // For addressing modes with writeback, the base register needs to be
+ // different from the destination registers.
+ if (Rn == Rt || Rn == Rt2)
+ return Error(Operands[3]->getStartLoc(),
+ "base register needs to be different from destination "
+ "registers");
+ }
+
+ return false;
+ }
+ case ARM::t2LDRDi8:
+ case ARM::t2LDRD_PRE:
+ case ARM::t2LDRD_POST: {
+ // Rt2 must be different from Rt.
+ unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ if (Rt2 == Rt)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands can't be identical");
return false;
}
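  // NOTE (illustrative, not part of this change): the ARM-mode checks above
  // reject, for example:
  //   ldrd r1, r2, [r0]       @ "Rt must be even-numbered"
  //   ldrd lr, pc, [r0]       @ "Rt can't be R14"
  //   ldrd r2, r4, [r0]       @ "destination operands must be sequential"
  //   ldrd r2, r3, [r2], #8   @ writeback base overlaps a destination
  // while "ldrd r2, r3, [r0]" is accepted.  The Thumb2 forms handled in the
  // new t2LDRD* case only require the two destination registers to differ.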
case ARM::STRD: {
@@ -5284,50 +5426,77 @@ validateInstruction(MCInst &Inst,
}
case ARM::SBFX:
case ARM::UBFX: {
- // width must be in range [1, 32-lsb]
- unsigned lsb = Inst.getOperand(2).getImm();
- unsigned widthm1 = Inst.getOperand(3).getImm();
- if (widthm1 >= 32 - lsb)
+ // Width must be in range [1, 32-lsb].
+ unsigned LSB = Inst.getOperand(2).getImm();
+ unsigned Widthm1 = Inst.getOperand(3).getImm();
+ if (Widthm1 >= 32 - LSB)
return Error(Operands[5]->getStartLoc(),
"bitfield width must be in range [1,32-lsb]");
return false;
}
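  // NOTE (illustrative, not part of this change): with lsb = 8 the encoded
  // width-minus-one operand must stay below 32-lsb = 24, i.e. the bitfield
  // width may be at most 24 bits:
  //   ubfx r0, r1, #8, #24   @ accepted (widthm1 == 23)
  //   ubfx r0, r1, #8, #25   @ "bitfield width must be in range [1,32-lsb]"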
+ // Notionally handles ARM::tLDMIA_UPD too.
case ARM::tLDMIA: {
// If we're parsing Thumb2, the .w variant is available and handles
- // most cases that are normally illegal for a Thumb1 LDM
- // instruction. We'll make the transformation in processInstruction()
- // if necessary.
+ // most cases that are normally illegal for a Thumb1 LDM instruction.
+ // We'll make the transformation in processInstruction() if necessary.
//
// Thumb LDM instructions are writeback iff the base register is not
// in the register list.
unsigned Rn = Inst.getOperand(0).getReg();
- bool hasWritebackToken =
+ bool HasWritebackToken =
(static_cast<ARMOperand*>(Operands[3])->isToken() &&
static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
- bool listContainsBase;
- if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo())
- return Error(Operands[3 + hasWritebackToken]->getStartLoc(),
+ bool ListContainsBase;
+ if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
+ return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
"registers must be in range r0-r7");
// If we should have writeback, then there should be a '!' token.
- if (!listContainsBase && !hasWritebackToken && !isThumbTwo())
+ if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
return Error(Operands[2]->getStartLoc(),
"writeback operator '!' expected");
// If we should not have writeback, there must not be a '!'. This is
// true even for the 32-bit wide encodings.
- if (listContainsBase && hasWritebackToken)
+ if (ListContainsBase && HasWritebackToken)
return Error(Operands[3]->getStartLoc(),
"writeback operator '!' not allowed when base register "
"in register list");
break;
}
- case ARM::t2LDMIA_UPD: {
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::LDMDA_UPD:
+    // ARM variants that load and update the same register are only officially
+    // UNPREDICTABLE from v7 upwards; what earlier architectures did is
+    // unclear.
+ if (!hasV7Ops())
+ break;
+ // Fallthrough
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD: {
if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
- return Error(Operands[4]->getStartLoc(),
- "writeback operator '!' not allowed when base register "
- "in register list");
+ return Error(Operands.back()->getStartLoc(),
+ "writeback register not allowed in register list");
break;
}
+ case ARM::sysLDMIA_UPD:
+ case ARM::sysLDMDA_UPD:
+ case ARM::sysLDMDB_UPD:
+ case ARM::sysLDMIB_UPD:
+ if (!listContainsReg(Inst, 3, ARM::PC))
+ return Error(Operands[4]->getStartLoc(),
+ "writeback register only allowed on system LDM "
+ "if PC in register-list");
+ break;
+ case ARM::sysSTMIA_UPD:
+ case ARM::sysSTMDA_UPD:
+ case ARM::sysSTMDB_UPD:
+ case ARM::sysSTMIB_UPD:
+ return Error(Operands[2]->getStartLoc(),
+ "system STM cannot have writeback register");
+ break;
case ARM::tMUL: {
// The second source operand must be the same register as the destination
// operand.
@@ -5351,26 +5520,35 @@ validateInstruction(MCInst &Inst,
// so only issue a diagnostic for thumb1. The instructions will be
// switched to the t2 encodings in processInstruction() if necessary.
case ARM::tPOP: {
- bool listContainsBase;
- if (checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase) &&
+ bool ListContainsBase;
+ if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
!isThumbTwo())
return Error(Operands[2]->getStartLoc(),
"registers must be in range r0-r7 or pc");
break;
}
case ARM::tPUSH: {
- bool listContainsBase;
- if (checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase) &&
+ bool ListContainsBase;
+ if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
!isThumbTwo())
return Error(Operands[2]->getStartLoc(),
"registers must be in range r0-r7 or lr");
break;
}
case ARM::tSTMIA_UPD: {
- bool listContainsBase;
- if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo())
+ bool ListContainsBase, InvalidLowList;
+ InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+ 0, ListContainsBase);
+ if (InvalidLowList && !isThumbTwo())
return Error(Operands[4]->getStartLoc(),
"registers must be in range r0-r7");
+
+ // This would be converted to a 32-bit stm, but that's not valid if the
+ // writeback register is in the list.
+ if (InvalidLowList && ListContainsBase)
+ return Error(Operands[4]->getStartLoc(),
+ "writeback operator '!' not allowed when base register "
+ "in register list");
break;
}
case ARM::tADDrSP: {
@@ -5383,11 +5561,29 @@ validateInstruction(MCInst &Inst,
}
break;
}
+ // Final range checking for Thumb unconditional branch instructions.
+ case ARM::tB:
+ if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<11, 1>())
+ return Error(Operands[2]->getStartLoc(), "branch target out of range");
+ break;
+ case ARM::t2B: {
+ int op = (Operands[2]->isImm()) ? 2 : 3;
+ if (!(static_cast<ARMOperand*>(Operands[op]))->isSignedOffset<24, 1>())
+ return Error(Operands[op]->getStartLoc(), "branch target out of range");
+ break;
+ }
+ // Final range checking for Thumb conditional branch instructions.
+ case ARM::tBcc:
+ if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<8, 1>())
+ return Error(Operands[2]->getStartLoc(), "branch target out of range");
+ break;
+ case ARM::t2Bcc: {
+ int Op = (Operands[2]->isImm()) ? 2 : 3;
+ if (!(static_cast<ARMOperand*>(Operands[Op]))->isSignedOffset<20, 1>())
+ return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+ break;
+ }
}
-
- StringRef DepInfo;
- if (isDeprecated(Inst, DepInfo))
- Warning(Loc, "deprecated on " + DepInfo);
return false;
}
@@ -7425,7 +7621,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
// 16-bit thumb arithmetic instructions either require or preclude the 'S'
// suffix depending on whether they're in an IT block or not.
unsigned Opc = Inst.getOpcode();
- const MCInstrDesc &MCID = getInstDesc(Opc);
+ const MCInstrDesc &MCID = MII.get(Opc);
if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
assert(MCID.hasOptionalDef() &&
"optionally flag setting instruction missing optional def operand");
@@ -7486,12 +7682,22 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
}
- // Some instructions need post-processing to, for example, tweak which
- // encoding is selected. Loop on it while changes happen so the
- // individual transformations can chain off each other. E.g.,
- // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
- while (processInstruction(Inst, Operands))
- ;
+  { // processInstruction() updates the inITBlock state, so we save it first
+ bool wasInITBlock = inITBlock();
+
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected. Loop on it while changes happen so the
+ // individual transformations can chain off each other. E.g.,
+ // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+ while (processInstruction(Inst, Operands))
+ ;
+
+    // Only after the instruction is fully processed can we validate it
+ if (wasInITBlock && hasV8Ops() && isThumb() &&
+ !isV8EligibleForIT(&Inst, 2)) {
+ Warning(IDLoc, "deprecated instruction in IT block");
+ }
+ }
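  // NOTE (illustrative, not part of this change): on ARMv8, IT blocks are
  // only fully supported for a restricted set of 16-bit instructions, so once
  // processInstruction() has settled on the final encoding a sequence such as
  //   it eq
  //   addeq.w r0, r0, r1
  // is expected to draw "deprecated instruction in IT block" when assembling
  // for a v8 target; the exact eligibility rules live in isV8EligibleForIT().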
// Only move forward at the very end so that everything in validate
// and process gets a consistent answer about whether we're in an IT
@@ -7544,15 +7750,15 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "instruction variant requires ARMv6 or later");
case Match_RequiresThumb2:
return Error(IDLoc, "instruction variant requires Thumb2");
- case Match_ImmRange0_4: {
+ case Match_ImmRange0_15: {
SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
- return Error(ErrorLoc, "immediate operand must be in the range [0,4]");
+ return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
}
- case Match_ImmRange0_15: {
+ case Match_ImmRange0_239: {
SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
- return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+ return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
}
}
@@ -7580,6 +7786,10 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
return parseDirectiveArch(DirectiveID.getLoc());
else if (IDVal == ".eabi_attribute")
return parseDirectiveEabiAttr(DirectiveID.getLoc());
+ else if (IDVal == ".cpu")
+ return parseDirectiveCPU(DirectiveID.getLoc());
+ else if (IDVal == ".fpu")
+ return parseDirectiveFPU(DirectiveID.getLoc());
else if (IDVal == ".fnstart")
return parseDirectiveFnStart(DirectiveID.getLoc());
else if (IDVal == ".fnend")
@@ -7658,13 +7868,18 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
return false;
}
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+ if (NextSymbolIsThumb) {
+ getParser().getStreamer().EmitThumbFunc(Symbol);
+ NextSymbolIsThumb = false;
+ }
+}
+
/// parseDirectiveThumbFunc
/// ::= .thumbfunc symbol_name
bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
bool isMachO = MAI->hasSubsectionsViaSymbols();
- StringRef Name;
- bool needFuncName = true;
  // Darwin asm optionally allows a function name after the .thumb_func
  // directive; ELF asm does not.
@@ -7673,29 +7888,19 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
if (Tok.isNot(AsmToken::EndOfStatement)) {
if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
return Error(L, "unexpected token in .thumb_func directive");
- Name = Tok.getIdentifier();
+ MCSymbol *Func =
+ getParser().getContext().GetOrCreateSymbol(Tok.getIdentifier());
+ getParser().getStreamer().EmitThumbFunc(Func);
Parser.Lex(); // Consume the identifier token.
- needFuncName = false;
+ return false;
}
}
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(L, "unexpected token in directive");
- // Eat the end of statement and any blank lines that follow.
- while (getLexer().is(AsmToken::EndOfStatement))
- Parser.Lex();
+ NextSymbolIsThumb = true;
- // FIXME: assuming function name will be the line following .thumb_func
- // We really should be checking the next symbol definition even if there's
- // stuff in between.
- if (needFuncName) {
- Name = Parser.getTok().getIdentifier();
- }
-
- // Mark symbol as a thumb symbol.
- MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
- getParser().getStreamer().EmitThumbFunc(Func);
return false;
}
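// NOTE (illustrative, not part of this change): with this rework the ELF/GAS
// form of the directive defers the marking to onLabelParsed(), so
//       .thumb_func
//     foo:            @ foo is marked as a Thumb function here
// behaves like the Darwin form ".thumb_func _foo", which names the symbol on
// the directive itself.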
@@ -7807,7 +8012,48 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
/// parseDirectiveEabiAttr
/// ::= .eabi_attribute int, int
bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
- return true;
+ if (Parser.getTok().isNot(AsmToken::Integer))
+ return Error(L, "integer expected");
+ int64_t Tag = Parser.getTok().getIntVal();
+ Parser.Lex(); // eat tag integer
+
+ if (Parser.getTok().isNot(AsmToken::Comma))
+ return Error(L, "comma expected");
+ Parser.Lex(); // skip comma
+
+ L = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::Integer))
+ return Error(L, "integer expected");
+ int64_t Value = Parser.getTok().getIntVal();
+ Parser.Lex(); // eat value integer
+
+ getTargetStreamer().emitAttribute(Tag, Value);
+ return false;
+}
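// NOTE (illustrative, not part of this change): this handler only accepts the
// two-integer form, e.g.
//   .eabi_attribute 6, 10    @ Tag_CPU_arch = v7 on the usual AEABI numbering
// (tag/value numbers given purely for illustration).  String-valued
// attributes such as the CPU name are covered by the separate .cpu directive
// below.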
+
+/// parseDirectiveCPU
+/// ::= .cpu str
+bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+ StringRef CPU = getParser().parseStringToEndOfStatement().trim();
+ getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
+ return false;
+}
+
+/// parseDirectiveFPU
+/// ::= .fpu str
+bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+ StringRef FPU = getParser().parseStringToEndOfStatement().trim();
+
+ unsigned ID = StringSwitch<unsigned>(FPU)
+#define ARM_FPU_NAME(NAME, ID) .Case(NAME, ARM::ID)
+#include "ARMFPUName.def"
+ .Default(ARM::INVALID_FPU);
+
+ if (ID == ARM::INVALID_FPU)
+ return Error(L, "Unknown FPU name");
+
+ getTargetStreamer().emitFPU(ID);
+ return false;
}
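// NOTE (illustrative, not part of this change): the accepted names come from
// ARMFPUName.def, so something like
//   .fpu neon-vfpv4
// selects the corresponding ARM::* FPU id (assuming that entry exists in the
// .def file), while an unrecognised string is reported as "Unknown FPU name".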
/// parseDirectiveFnStart
@@ -7820,7 +8066,7 @@ bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
}
FnStartLoc = L;
- getParser().getStreamer().EmitFnStart();
+ getTargetStreamer().emitFnStart();
return false;
}
@@ -7833,8 +8079,7 @@ bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
// Reset the unwind directives parser state
resetUnwindDirectiveParserState();
-
- getParser().getStreamer().EmitFnEnd();
+ getTargetStreamer().emitFnEnd();
return false;
}
@@ -7856,7 +8101,7 @@ bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
return true;
}
- getParser().getStreamer().EmitCantUnwind();
+ getTargetStreamer().emitCantUnwind();
return false;
}
@@ -7887,7 +8132,7 @@ bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
Parser.Lex();
MCSymbol *PR = getParser().getContext().GetOrCreateSymbol(Name);
- getParser().getStreamer().EmitPersonality(PR);
+ getTargetStreamer().emitPersonality(PR);
return false;
}
@@ -7904,7 +8149,7 @@ bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
return true;
}
- getParser().getStreamer().EmitHandlerData();
+ getTargetStreamer().emitHandlerData();
return false;
}
@@ -7964,9 +8209,8 @@ bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
Offset = CE->getValue();
}
- getParser().getStreamer().EmitSetFP(static_cast<unsigned>(NewFPReg),
- static_cast<unsigned>(NewSPReg),
- Offset);
+ getTargetStreamer().emitSetFP(static_cast<unsigned>(NewFPReg),
+ static_cast<unsigned>(NewSPReg), Offset);
return false;
}
@@ -7995,7 +8239,7 @@ bool ARMAsmParser::parseDirectivePad(SMLoc L) {
if (!CE)
return Error(ExLoc, "pad offset must be an immediate");
- getParser().getStreamer().EmitPad(CE->getValue());
+ getTargetStreamer().emitPad(CE->getValue());
return false;
}
@@ -8027,7 +8271,7 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
if (IsVector && !Op->isDPRRegList())
return Error(L, ".vsave expects DPR registers");
- getParser().getStreamer().EmitRegSave(Op->getRegList(), IsVector);
+ getTargetStreamer().emitRegSave(Op->getRegList(), IsVector);
return false;
}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 8a06664..9c7988f 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -323,8 +323,6 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address,
- const void *Decoder);
static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
@@ -507,6 +505,14 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
MI.clear();
+ result = decodeInstruction(DecoderTablev8Crypto32, MI, insn, Address,
+ this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ return result;
+ }
+
+ MI.clear();
Size = 0;
return MCDisassembler::Fail;
}
@@ -823,16 +829,28 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Check(result, AddThumbPredicate(MI));
return result;
}
- }
- MI.clear();
- uint32_t NEONv8Insn = insn32;
- NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
- result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
- this, STI);
- if (result != MCDisassembler::Fail) {
- Size = 4;
- return result;
+ MI.clear();
+ uint32_t NEONCryptoInsn = insn32;
+ NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+ NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+ NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+ result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+ Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ return result;
+ }
+
+ MI.clear();
+ uint32_t NEONv8Insn = insn32;
+ NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+ result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+ this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ return result;
+ }
}
MI.clear();
@@ -1185,20 +1203,22 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
- bool writebackLoad = false;
- unsigned writebackReg = 0;
+ bool NeedDisjointWriteback = false;
+ unsigned WritebackReg = 0;
switch (Inst.getOpcode()) {
- default:
- break;
- case ARM::LDMIA_UPD:
- case ARM::LDMDB_UPD:
- case ARM::LDMIB_UPD:
- case ARM::LDMDA_UPD:
- case ARM::t2LDMIA_UPD:
- case ARM::t2LDMDB_UPD:
- writebackLoad = true;
- writebackReg = Inst.getOperand(0).getReg();
- break;
+ default:
+ break;
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ NeedDisjointWriteback = true;
+ WritebackReg = Inst.getOperand(0).getReg();
+ break;
}
// Empty register lists are not allowed.
@@ -1208,7 +1228,7 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
return MCDisassembler::Fail;
// Writeback not allowed if Rn is in the target list.
- if (writebackLoad && writebackReg == Inst.end()[-1].getReg())
+ if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
Check(S, MCDisassembler::SoftFail);
}
}
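// NOTE (illustrative, not part of this change): an encoding equivalent to
// "ldmia r1!, {r1, r2}" still decodes but is flagged SoftFail because the
// writeback base also appears in the register list; with the added
// t2STM*_UPD cases the same now holds for store forms such as
// "stmia.w r1!, {r1, r2}".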
@@ -1343,6 +1363,11 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
break;
}
+ uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+ .getFeatureBits();
+ if ((featureBits & ARM::HasV8Ops) && (coproc != 14))
+ return MCDisassembler::Fail;
+
Inst.addOperand(MCOperand::CreateImm(coproc));
Inst.addOperand(MCOperand::CreateImm(CRd));
if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
@@ -3794,6 +3819,11 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
if (Val == 0xA || Val == 0xB)
return MCDisassembler::Fail;
+ uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+ .getFeatureBits();
+ if ((featureBits & ARM::HasV8Ops) && !(Val == 14 || Val == 15))
+ return MCDisassembler::Fail;
+
Inst.addOperand(MCOperand::CreateImm(Val));
return MCDisassembler::Success;
}
@@ -4901,15 +4931,6 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
return S;
}
-static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address,
- const void *Decoder)
-{
- unsigned Imm = fieldFromInstruction(Insn, 0, 3);
- if (Imm > 4) return MCDisassembler::Fail;
- Inst.addOperand(MCOperand::CreateImm(Imm));
- return MCDisassembler::Success;
-}
-
static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 97da232..f897028 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -76,14 +76,23 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
StringRef Annot) {
unsigned Opcode = MI->getOpcode();
+ switch(Opcode) {
+
// Check for HINT instructions w/ canonical names.
- if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) {
+ case ARM::HINT:
+ case ARM::tHINT:
+ case ARM::t2HINT:
switch (MI->getOperand(0).getImm()) {
case 0: O << "\tnop"; break;
case 1: O << "\tyield"; break;
case 2: O << "\twfe"; break;
case 3: O << "\twfi"; break;
case 4: O << "\tsev"; break;
+ case 5:
+ if ((getAvailableFeatures() & ARM::HasV8Ops)) {
+ O << "\tsevl";
+ break;
+ } // Fallthrough for non-v8
default:
// Anything else should just print normally.
printInstruction(MI, O);
@@ -95,10 +104,9 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
O << ".w";
printAnnotation(O, Annot);
return;
- }
// Check for MOVs and print canonical forms, instead.
- if (Opcode == ARM::MOVsr) {
+ case ARM::MOVsr: {
// FIXME: Thumb variants?
const MCOperand &Dst = MI->getOperand(0);
const MCOperand &MO1 = MI->getOperand(1);
@@ -121,7 +129,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (Opcode == ARM::MOVsi) {
+ case ARM::MOVsi: {
// FIXME: Thumb variants?
const MCOperand &Dst = MI->getOperand(0);
const MCOperand &MO1 = MI->getOperand(1);
@@ -149,81 +157,91 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
-
// A8.6.123 PUSH
- if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
- MI->getOperand(0).getReg() == ARM::SP &&
- MI->getNumOperands() > 5) {
- // Should only print PUSH if there are at least two registers in the list.
- O << '\t' << "push";
- printPredicateOperand(MI, 2, O);
- if (Opcode == ARM::t2STMDB_UPD)
- O << ".w";
- O << '\t';
- printRegisterList(MI, 4, O);
- printAnnotation(O, Annot);
- return;
- }
- if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP &&
- MI->getOperand(3).getImm() == -4) {
- O << '\t' << "push";
- printPredicateOperand(MI, 4, O);
- O << "\t{";
- printRegName(O, MI->getOperand(1).getReg());
- O << "}";
- printAnnotation(O, Annot);
- return;
- }
+ case ARM::STMDB_UPD:
+ case ARM::t2STMDB_UPD:
+ if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+ // Should only print PUSH if there are at least two registers in the list.
+ O << '\t' << "push";
+ printPredicateOperand(MI, 2, O);
+ if (Opcode == ARM::t2STMDB_UPD)
+ O << ".w";
+ O << '\t';
+ printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
+ return;
+ } else
+ break;
+
+ case ARM::STR_PRE_IMM:
+ if (MI->getOperand(2).getReg() == ARM::SP &&
+ MI->getOperand(3).getImm() == -4) {
+ O << '\t' << "push";
+ printPredicateOperand(MI, 4, O);
+ O << "\t{";
+ printRegName(O, MI->getOperand(1).getReg());
+ O << "}";
+ printAnnotation(O, Annot);
+ return;
+ } else
+ break;
// A8.6.122 POP
- if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) &&
- MI->getOperand(0).getReg() == ARM::SP &&
- MI->getNumOperands() > 5) {
- // Should only print POP if there are at least two registers in the list.
- O << '\t' << "pop";
- printPredicateOperand(MI, 2, O);
- if (Opcode == ARM::t2LDMIA_UPD)
- O << ".w";
- O << '\t';
- printRegisterList(MI, 4, O);
- printAnnotation(O, Annot);
- return;
- }
- if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP &&
- MI->getOperand(4).getImm() == 4) {
- O << '\t' << "pop";
- printPredicateOperand(MI, 5, O);
- O << "\t{";
- printRegName(O, MI->getOperand(0).getReg());
- O << "}";
- printAnnotation(O, Annot);
- return;
- }
-
+ case ARM::LDMIA_UPD:
+ case ARM::t2LDMIA_UPD:
+ if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+ // Should only print POP if there are at least two registers in the list.
+ O << '\t' << "pop";
+ printPredicateOperand(MI, 2, O);
+ if (Opcode == ARM::t2LDMIA_UPD)
+ O << ".w";
+ O << '\t';
+ printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
+ return;
+ } else
+ break;
+
+ case ARM::LDR_POST_IMM:
+ if (MI->getOperand(2).getReg() == ARM::SP &&
+ MI->getOperand(4).getImm() == 4) {
+ O << '\t' << "pop";
+ printPredicateOperand(MI, 5, O);
+ O << "\t{";
+ printRegName(O, MI->getOperand(0).getReg());
+ O << "}";
+ printAnnotation(O, Annot);
+ return;
+ } else
+ break;
// A8.6.355 VPUSH
- if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
- MI->getOperand(0).getReg() == ARM::SP) {
- O << '\t' << "vpush";
- printPredicateOperand(MI, 2, O);
- O << '\t';
- printRegisterList(MI, 4, O);
- printAnnotation(O, Annot);
- return;
- }
+ case ARM::VSTMSDB_UPD:
+ case ARM::VSTMDDB_UPD:
+ if (MI->getOperand(0).getReg() == ARM::SP) {
+ O << '\t' << "vpush";
+ printPredicateOperand(MI, 2, O);
+ O << '\t';
+ printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
+ return;
+ } else
+ break;
// A8.6.354 VPOP
- if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) &&
- MI->getOperand(0).getReg() == ARM::SP) {
- O << '\t' << "vpop";
- printPredicateOperand(MI, 2, O);
- O << '\t';
- printRegisterList(MI, 4, O);
- printAnnotation(O, Annot);
- return;
- }
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMDIA_UPD:
+ if (MI->getOperand(0).getReg() == ARM::SP) {
+ O << '\t' << "vpop";
+ printPredicateOperand(MI, 2, O);
+ O << '\t';
+ printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
+ return;
+ } else
+ break;
- if (Opcode == ARM::tLDMIA) {
+ case ARM::tLDMIA: {
bool Writeback = true;
unsigned BaseReg = MI->getOperand(0).getReg();
for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
@@ -249,9 +267,10 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
// GPRs. However, when decoding them, the two GRPs cannot be automatically
// expressed as a GPRPair, so we have to manually merge them.
// FIXME: We would really like to be able to tablegen'erate this.
- if (Opcode == ARM::LDREXD || Opcode == ARM::STREXD) {
+ case ARM::LDREXD: case ARM::STREXD:
+ case ARM::LDAEXD: case ARM::STLEXD:
const MCRegisterClass& MRC = MRI.getRegClass(ARM::GPRRegClassID);
- bool isStore = Opcode == ARM::STREXD;
+ bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
if (MRC.contains(Reg)) {
MCInst NewMI;
@@ -676,7 +695,7 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned val = MI->getOperand(OpNum).getImm();
- O << ARM_MB::MemBOptToString(val);
+ O << ARM_MB::MemBOptToString(val, (getAvailableFeatures() & ARM::HasV8Ops));
}
void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index b1e25d8..5615b80 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -25,9 +25,9 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -640,16 +640,16 @@ public:
// FIXME: This should be in a separate file.
class DarwinARMAsmBackend : public ARMAsmBackend {
public:
- const object::mach::CPUSubtypeARM Subtype;
+ const MachO::CPUSubTypeARM Subtype;
DarwinARMAsmBackend(const Target &T, const StringRef TT,
- object::mach::CPUSubtypeARM st)
+ MachO::CPUSubTypeARM st)
: ARMAsmBackend(T, TT), Subtype(st) {
HasDataInCodeSupport = true;
}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
- object::mach::CTM_ARM,
+ MachO::CPU_TYPE_ARM,
Subtype);
}
@@ -660,22 +660,24 @@ public:
} // end anonymous namespace
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin()) {
- object::mach::CPUSubtypeARM CS =
- StringSwitch<object::mach::CPUSubtypeARM>(TheTriple.getArchName())
- .Cases("armv4t", "thumbv4t", object::mach::CSARM_V4T)
- .Cases("armv5e", "thumbv5e",object::mach::CSARM_V5TEJ)
- .Cases("armv6", "thumbv6", object::mach::CSARM_V6)
- .Cases("armv6m", "thumbv6m", object::mach::CSARM_V6M)
- .Cases("armv7em", "thumbv7em", object::mach::CSARM_V7EM)
- .Cases("armv7f", "thumbv7f", object::mach::CSARM_V7F)
- .Cases("armv7k", "thumbv7k", object::mach::CSARM_V7K)
- .Cases("armv7m", "thumbv7m", object::mach::CSARM_V7M)
- .Cases("armv7s", "thumbv7s", object::mach::CSARM_V7S)
- .Default(object::mach::CSARM_V7);
+ MachO::CPUSubTypeARM CS =
+ StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
+ .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
+ .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
+ .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
+ .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
+ .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
+ .Cases("armv7f", "thumbv7f", MachO::CPU_SUBTYPE_ARM_V7F)
+ .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
+ .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
+ .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
+ .Default(MachO::CPU_SUBTYPE_ARM_V7);
return new DarwinARMAsmBackend(T, TT, CS);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index ff9917d..af939fc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -121,41 +121,41 @@ namespace ARM_MB {
// the option field for memory barrier operations.
enum MemBOpt {
RESERVED_0 = 0,
- RESERVED_1 = 1,
+ OSHLD = 1,
OSHST = 2,
OSH = 3,
RESERVED_4 = 4,
- RESERVED_5 = 5,
+ NSHLD = 5,
NSHST = 6,
NSH = 7,
RESERVED_8 = 8,
- RESERVED_9 = 9,
+ ISHLD = 9,
ISHST = 10,
ISH = 11,
RESERVED_12 = 12,
- RESERVED_13 = 13,
+ LD = 13,
ST = 14,
SY = 15
};
- inline static const char *MemBOptToString(unsigned val) {
+ inline static const char *MemBOptToString(unsigned val, bool HasV8) {
switch (val) {
default: llvm_unreachable("Unknown memory operation");
case SY: return "sy";
case ST: return "st";
- case RESERVED_13: return "#0xd";
+ case LD: return HasV8 ? "ld" : "#0xd";
case RESERVED_12: return "#0xc";
case ISH: return "ish";
case ISHST: return "ishst";
- case RESERVED_9: return "#0x9";
+ case ISHLD: return HasV8 ? "ishld" : "#0x9";
case RESERVED_8: return "#0x8";
case NSH: return "nsh";
case NSHST: return "nshst";
- case RESERVED_5: return "#0x5";
+ case NSHLD: return HasV8 ? "nshld" : "#0x5";
case RESERVED_4: return "#0x4";
case OSH: return "osh";
case OSHST: return "oshst";
- case RESERVED_1: return "#0x1";
+ case OSHLD: return HasV8 ? "oshld" : "#0x1";
case RESERVED_0: return "#0x0";
}
}
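// NOTE (illustrative, not part of this change): callers now pass the v8 flag
// through, e.g.
//   MemBOptToString(ARM_MB::ISHLD, /*HasV8=*/true)  -> "ishld"
//   MemBOptToString(ARM_MB::ISHLD, /*HasV8=*/false) -> "#0x9"
// because encoding 9 is a reserved barrier option before ARMv8.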
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 6b98205..471897d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -13,6 +13,8 @@
//
//===----------------------------------------------------------------------===//
+#include "ARMBuildAttrs.h"
+#include "ARMFPUName.h"
#include "ARMRegisterInfo.h"
#include "ARMUnwindOp.h"
#include "ARMUnwindOpAsm.h"
@@ -27,6 +29,7 @@
#include "llvm/MC/MCELFSymbolFlags.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
@@ -36,7 +39,9 @@
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
using namespace llvm;
@@ -45,8 +50,218 @@ static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
}
+static const char *GetFPUName(unsigned ID) {
+ switch (ID) {
+ default:
+ llvm_unreachable("Unknown FPU kind");
+ break;
+#define ARM_FPU_NAME(NAME, ID) case ARM::ID: return NAME;
+#include "ARMFPUName.def"
+ }
+ return NULL;
+}
+
namespace {
+class ARMELFStreamer;
+
+class ARMTargetAsmStreamer : public ARMTargetStreamer {
+ formatted_raw_ostream &OS;
+ MCInstPrinter &InstPrinter;
+
+ virtual void emitFnStart();
+ virtual void emitFnEnd();
+ virtual void emitCantUnwind();
+ virtual void emitPersonality(const MCSymbol *Personality);
+ virtual void emitHandlerData();
+ virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+ virtual void emitPad(int64_t Offset);
+ virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector);
+
+ virtual void switchVendor(StringRef Vendor);
+ virtual void emitAttribute(unsigned Attribute, unsigned Value);
+ virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+ virtual void emitFPU(unsigned FPU);
+ virtual void finishAttributeSection();
+
+public:
+ ARMTargetAsmStreamer(formatted_raw_ostream &OS, MCInstPrinter &InstPrinter);
+};
+
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter)
+ : OS(OS), InstPrinter(InstPrinter) {}
+void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
+void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
+void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
+ OS << "\t.personality " << Personality->getName() << '\n';
+}
+void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+ int64_t Offset) {
+ OS << "\t.setfp\t";
+ InstPrinter.printRegName(OS, FpReg);
+ OS << ", ";
+ InstPrinter.printRegName(OS, SpReg);
+ if (Offset)
+ OS << ", #" << Offset;
+ OS << '\n';
+}
+void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
+ OS << "\t.pad\t#" << Offset << '\n';
+}
+void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) {
+ assert(RegList.size() && "RegList should not be empty");
+ if (isVector)
+ OS << "\t.vsave\t{";
+ else
+ OS << "\t.save\t{";
+
+ InstPrinter.printRegName(OS, RegList[0]);
+
+ for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+ OS << ", ";
+ InstPrinter.printRegName(OS, RegList[i]);
+ }
+
+ OS << "}\n";
+}
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
+}
+void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+ OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value) << "\n";
+}
+void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef String) {
+ switch (Attribute) {
+ default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
+ case ARMBuildAttrs::CPU_name:
+ OS << "\t.cpu\t" << String.lower() << "\n";
+ break;
+ }
+}
+void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
+ OS << "\t.fpu\t" << GetFPUName(FPU) << "\n";
+}
+void ARMTargetAsmStreamer::finishAttributeSection() {
+}
+
+class ARMTargetELFStreamer : public ARMTargetStreamer {
+private:
+ // This structure holds all attributes, accounting for
+  // their string/numeric value, so we can later emit them
+  // in declaration order, keeping them all in the same vector.
+ struct AttributeItem {
+ enum {
+ HiddenAttribute = 0,
+ NumericAttribute,
+ TextAttribute
+ } Type;
+ unsigned Tag;
+ unsigned IntValue;
+ StringRef StringValue;
+
+ static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
+ return (LHS.Tag < RHS.Tag);
+ }
+ };
+
+ StringRef CurrentVendor;
+ unsigned FPU;
+ SmallVector<AttributeItem, 64> Contents;
+
+ const MCSection *AttributeSection;
+
+ // FIXME: this should be in a more generic place, but
+ // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
+ static size_t getULEBSize(int Value) {
+ size_t Size = 0;
+ do {
+ Value >>= 7;
+ Size += sizeof(int8_t); // Is this really necessary?
+ } while (Value);
+ return Size;
+ }
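// NOTE (illustrative, not part of this change): getULEBSize() above counts
// the 7-bit groups a ULEB128 encoding would need.  A minimal standalone
// encoder, written only to make the worked example below concrete:
#include <cstdint>
#include <vector>
static std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Bytes;
  do {
    uint8_t Byte = Value & 0x7f;  // low 7 bits of the remaining value
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;               // continuation bit: more groups follow
    Bytes.push_back(Byte);
  } while (Value != 0);
  return Bytes;
}
// encodeULEB128(300) == {0xac, 0x02}, i.e. two bytes - exactly what
// getULEBSize(300) reports when sizing the attribute contents.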
+
+ AttributeItem *getAttributeItem(unsigned Attribute) {
+ for (size_t i = 0; i < Contents.size(); ++i)
+ if (Contents[i].Tag == Attribute)
+ return &Contents[i];
+ return 0;
+ }
+
+ void setAttributeItem(unsigned Attribute, unsigned Value,
+ bool OverwriteExisting) {
+ // Look for existing attribute item
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->IntValue = Value;
+ return;
+ }
+
+ // Create new attribute item
+ AttributeItem Item = {
+ AttributeItem::NumericAttribute,
+ Attribute,
+ Value,
+ StringRef("")
+ };
+ Contents.push_back(Item);
+ }
+
+ void setAttributeItem(unsigned Attribute, StringRef Value,
+ bool OverwriteExisting) {
+ // Look for existing attribute item
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->StringValue = Value;
+ return;
+ }
+
+ // Create new attribute item
+ AttributeItem Item = {
+ AttributeItem::TextAttribute,
+ Attribute,
+ 0,
+ Value
+ };
+ Contents.push_back(Item);
+ }
+
+ void emitFPUDefaultAttributes();
+
+ ARMELFStreamer &getStreamer();
+
+ virtual void emitFnStart();
+ virtual void emitFnEnd();
+ virtual void emitCantUnwind();
+ virtual void emitPersonality(const MCSymbol *Personality);
+ virtual void emitHandlerData();
+ virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+ virtual void emitPad(int64_t Offset);
+ virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector);
+
+ virtual void switchVendor(StringRef Vendor);
+ virtual void emitAttribute(unsigned Attribute, unsigned Value);
+ virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+ virtual void emitFPU(unsigned FPU);
+ virtual void finishAttributeSection();
+
+ size_t calculateContentSize() const;
+
+public:
+ ARMTargetELFStreamer()
+ : ARMTargetStreamer(), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
+ AttributeSection(0) {
+ }
+};
+
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
/// the appropriate points in the object files. These symbols are defined in the
/// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
@@ -61,27 +276,29 @@ namespace {
/// by MachO. Beware!
class ARMELFStreamer : public MCELFStreamer {
public:
- ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
- MCCodeEmitter *Emitter, bool IsThumb)
- : MCELFStreamer(SK_ARMELFStreamer, Context, TAB, OS, Emitter),
+ friend class ARMTargetELFStreamer;
+
+ ARMELFStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+ MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter,
+ bool IsThumb)
+ : MCELFStreamer(Context, TargetStreamer, TAB, OS, Emitter),
IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) {
Reset();
}
~ARMELFStreamer() {}
+ virtual void FinishImpl();
+
// ARM exception handling directives
- virtual void EmitFnStart();
- virtual void EmitFnEnd();
- virtual void EmitCantUnwind();
- virtual void EmitPersonality(const MCSymbol *Per);
- virtual void EmitHandlerData();
- virtual void EmitSetFP(unsigned NewFpReg,
- unsigned NewSpReg,
- int64_t Offset = 0);
- virtual void EmitPad(int64_t Offset);
- virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
- bool isVector);
+ void emitFnStart();
+ void emitFnEnd();
+ void emitCantUnwind();
+ void emitPersonality(const MCSymbol *Per);
+ void emitHandlerData();
+ void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+ void emitPad(int64_t Offset);
+ void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
virtual void ChangeSection(const MCSection *Section,
const MCExpr *Subsection) {
@@ -141,10 +358,6 @@ public:
}
}
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_ARMELFStreamer;
- }
-
private:
enum ElfMappingSymbol {
EMS_None,
@@ -183,7 +396,7 @@ private:
MCELF::SetType(SD, ELF::STT_NOTYPE);
MCELF::SetBinding(SD, ELF::STB_LOCAL);
SD.setExternal(false);
- Symbol->setSection(*getCurrentSection().first);
+ AssignSection(Symbol, getCurrentSection().first);
const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
Symbol->setVariableValue(Value);
@@ -232,6 +445,224 @@ private:
};
} // end anonymous namespace
+ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
+ ARMELFStreamer *S = static_cast<ARMELFStreamer *>(Streamer);
+ return *S;
+}
+
+void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
+void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
+void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
+ getStreamer().emitPersonality(Personality);
+}
+void ARMTargetELFStreamer::emitHandlerData() {
+ getStreamer().emitHandlerData();
+}
+void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+ int64_t Offset) {
+ getStreamer().emitSetFP(FpReg, SpReg, Offset);
+}
+void ARMTargetELFStreamer::emitPad(int64_t Offset) {
+ getStreamer().emitPad(Offset);
+}
+void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+ bool isVector) {
+ getStreamer().emitRegSave(RegList, isVector);
+}
+void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
+ assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+ if (CurrentVendor == Vendor)
+ return;
+
+ if (!CurrentVendor.empty())
+ finishAttributeSection();
+
+ assert(Contents.empty() &&
+ ".ARM.attributes should be flushed before changing vendor");
+ CurrentVendor = Vendor;
+
+}
+void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+ setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef Value) {
+ setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitFPU(unsigned Value) {
+ FPU = Value;
+}
+void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
+ switch (FPU) {
+ case ARM::VFP:
+ case ARM::VFPV2:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv2,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::VFPV3:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::VFPV3_D16:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv3B,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::VFPV4:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv4A,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::VFPV4_D16:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv4B,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::FP_ARMV8:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPARMv8A,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::NEON:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv3A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::NEON_VFPV4:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPv4A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeon2,
+ /* OverwriteExisting= */ false);
+ break;
+
+ case ARM::NEON_FP_ARMV8:
+ case ARM::CRYPTO_NEON_FP_ARMV8:
+ setAttributeItem(ARMBuildAttrs::VFP_arch,
+ ARMBuildAttrs::AllowFPARMv8A,
+ /* OverwriteExisting= */ false);
+ setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+ ARMBuildAttrs::AllowNeonARMv8,
+ /* OverwriteExisting= */ false);
+ break;
+
+ default:
+ report_fatal_error("Unknown FPU: " + Twine(FPU));
+ break;
+ }
+}
+size_t ARMTargetELFStreamer::calculateContentSize() const {
+ size_t Result = 0;
+ for (size_t i = 0; i < Contents.size(); ++i) {
+ AttributeItem item = Contents[i];
+ switch (item.Type) {
+ case AttributeItem::HiddenAttribute:
+ break;
+ case AttributeItem::NumericAttribute:
+ Result += getULEBSize(item.Tag);
+ Result += getULEBSize(item.IntValue);
+ break;
+ case AttributeItem::TextAttribute:
+ Result += getULEBSize(item.Tag);
+ Result += item.StringValue.size() + 1; // string + '\0'
+ break;
+ }
+ }
+ return Result;
+}
+void ARMTargetELFStreamer::finishAttributeSection() {
+ // <format-version>
+ // [ <section-length> "vendor-name"
+ // [ <file-tag> <size> <attribute>*
+ // | <section-tag> <size> <section-number>* 0 <attribute>*
+ // | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+ // ]+
+ // ]*
+
+ if (FPU != ARM::INVALID_FPU)
+ emitFPUDefaultAttributes();
+
+ if (Contents.empty())
+ return;
+
+ std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+
+ ARMELFStreamer &Streamer = getStreamer();
+
+ // Switch to .ARM.attributes section
+ if (AttributeSection) {
+ Streamer.SwitchSection(AttributeSection);
+ } else {
+ AttributeSection =
+ Streamer.getContext().getELFSection(".ARM.attributes",
+ ELF::SHT_ARM_ATTRIBUTES,
+ 0,
+ SectionKind::getMetadata());
+ Streamer.SwitchSection(AttributeSection);
+
+ // Format version
+ Streamer.EmitIntValue(0x41, 1);
+ }
+
+ // Vendor size + Vendor name + '\0'
+ const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+ // Tag + Tag Size
+ const size_t TagHeaderSize = 1 + 4;
+
+ const size_t ContentsSize = calculateContentSize();
+
+ Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+ Streamer.EmitBytes(CurrentVendor);
+ Streamer.EmitIntValue(0, 1); // '\0'
+
+ Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+ Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+ // Size should have been accounted for already, now
+ // emit each field as its type (ULEB or String)
+ for (size_t i = 0; i < Contents.size(); ++i) {
+ AttributeItem item = Contents[i];
+ Streamer.EmitULEB128IntValue(item.Tag);
+ switch (item.Type) {
+ default: llvm_unreachable("Invalid attribute type");
+ case AttributeItem::NumericAttribute:
+ Streamer.EmitULEB128IntValue(item.IntValue);
+ break;
+ case AttributeItem::TextAttribute:
+ Streamer.EmitBytes(item.StringValue.upper());
+ Streamer.EmitIntValue(0, 1); // '\0'
+ break;
+ }
+ }
+
+ Contents.clear();
+ FPU = ARM::INVALID_FPU;
+}
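// NOTE (illustrative, not part of this change): a worked example of the
// layout emitted above, assuming a little-endian target, vendor "aeabi" and a
// single numeric attribute - say tag 6 with value 10 (Tag_CPU_arch = v7 on
// the usual AEABI numbering, used here purely for illustration):
//   0x41                      format version 'A' (first use of the section)
//   0x11 0x00 0x00 0x00       section length 17 = 4 + 6 ("aeabi"+NUL) + 5 + 2
//   'a' 'e' 'a' 'b' 'i' 0x00  vendor name
//   0x01                      ARMBuildAttrs::File
//   0x07 0x00 0x00 0x00       tag size 7 = 1 + 4 + 2 bytes of contents
//   0x06 0x0a                 ULEB128-encoded tag and value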
+
+void ARMELFStreamer::FinishImpl() {
+ MCTargetStreamer &TS = getTargetStreamer();
+ ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+ ATS.finishAttributeSection();
+
+ MCELFStreamer::FinishImpl();
+}
+
inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
unsigned Type,
unsigned Flags,
@@ -295,29 +726,13 @@ void ARMELFStreamer::Reset() {
UnwindOpAsm.Reset();
}
-// Add the R_ARM_NONE fixup at the same position
-void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
- const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
-
- const MCSymbolRefExpr *PersonalityRef =
- MCSymbolRefExpr::Create(PersonalitySym,
- MCSymbolRefExpr::VK_ARM_NONE,
- getContext());
-
- AddValueSymbols(PersonalityRef);
- MCDataFragment *DF = getOrCreateDataFragment();
- DF->getFixups().push_back(
- MCFixup::Create(DF->getContents().size(), PersonalityRef,
- MCFixup::getKindForSize(4, false)));
-}
-
-void ARMELFStreamer::EmitFnStart() {
+void ARMELFStreamer::emitFnStart() {
assert(FnStart == 0);
FnStart = getContext().CreateTempSymbol();
EmitLabel(FnStart);
}
-void ARMELFStreamer::EmitFnEnd() {
+void ARMELFStreamer::emitFnEnd() {
  assert(FnStart && ".fnstart must precede .fnend");
// Emit unwind opcodes if there is no .handlerdata directive
@@ -365,8 +780,20 @@ void ARMELFStreamer::EmitFnEnd() {
Reset();
}
-void ARMELFStreamer::EmitCantUnwind() {
- CantUnwind = true;
+void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
+
+// Add the R_ARM_NONE fixup at the same position
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+ const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
+
+ const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
+ PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
+
+ AddValueSymbols(PersonalityRef);
+ MCDataFragment *DF = getOrCreateDataFragment();
+ DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(),
+ PersonalityRef,
+ MCFixup::getKindForSize(4, false)));
}
void ARMELFStreamer::FlushPendingOffset() {
@@ -429,17 +856,14 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
EmitIntValue(0, 4);
}
-void ARMELFStreamer::EmitHandlerData() {
- FlushUnwindOpcodes(false);
-}
+void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
-void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) {
+void ARMELFStreamer::emitPersonality(const MCSymbol *Per) {
Personality = Per;
UnwindOpAsm.setPersonality(Per);
}
-void ARMELFStreamer::EmitSetFP(unsigned NewFPReg,
- unsigned NewSPReg,
+void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
int64_t Offset) {
assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
"the operand of .setfp directive should be either $sp or $fp");
@@ -453,7 +877,7 @@ void ARMELFStreamer::EmitSetFP(unsigned NewFPReg,
FPOffset += Offset;
}
-void ARMELFStreamer::EmitPad(int64_t Offset) {
+void ARMELFStreamer::emitPad(int64_t Offset) {
// Track the change of the $sp offset
SPOffset -= Offset;
@@ -462,7 +886,7 @@ void ARMELFStreamer::EmitPad(int64_t Offset) {
PendingOffset -= Offset;
}
-void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
+void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool IsVector) {
// Collect the registers in the register list
unsigned Count = 0;
@@ -493,11 +917,31 @@ void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
}
namespace llvm {
+
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+ ARMTargetAsmStreamer *S = new ARMTargetAsmStreamer(OS, *InstPrint);
+
+ return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+ useDwarfDirectory, InstPrint, CE, TAB,
+ ShowInst);
+}
+
MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
bool RelaxAll, bool NoExecStack,
bool IsThumb) {
- ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+ ARMTargetELFStreamer *TS = new ARMTargetELFStreamer();
+ ARMELFStreamer *S =
+ new ARMELFStreamer(Context, TS, TAB, OS, Emitter, IsThumb);
+ // FIXME: This should eventually end up somewhere else where more
+ // intelligent flag decisions can be made. For now we are just maintaining
+ // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+ S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
if (NoExecStack)
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
deleted file mode 100644
index 77ae5d2..0000000
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- ARMELFStreamer.h - ELF Streamer for ARM ------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF streamer information for the ARM backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_ELF_STREAMER_H
-#define ARM_ELF_STREAMER_H
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-
- MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack,
- bool IsThumb);
-}
-
-#endif // ARM_ELF_STREAMER_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index c1aab9c..ad796e6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -49,8 +49,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
Code16Directive = ".code\t16";
Code32Directive = ".code\t32";
- WeakRefDirective = "\t.weak\t";
-
HasLEB128 = true;
SupportsDebugInformation = true;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index f0b289c..e1f716d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -15,6 +15,7 @@
#define LLVM_ARMTARGETASMINFO_H
#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
@@ -24,7 +25,7 @@ namespace llvm {
explicit ARMMCAsmInfoDarwin();
};
- class ARMELFMCAsmInfo : public MCAsmInfo {
+ class ARMELFMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit ARMELFMCAsmInfo();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index a18d465..4382d0d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -58,8 +58,7 @@ public:
}
bool isTargetDarwin() const {
Triple TT(STI.getTargetTriple());
- Triple::OSType OS = TT.getOS();
- return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS;
+ return TT.isOSDarwin();
}
unsigned getMachineSoImmOpValue(unsigned SoImm) const;
@@ -638,8 +637,14 @@ getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- unsigned Val =
- ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+ unsigned Val = 0;
+ const MCOperand MO = MI.getOperand(OpIdx);
+
+ if(MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+ else
+ Val = MO.getImm() >> 1;
+
bool I = (Val & 0x800000);
bool J1 = (Val & 0x400000);
bool J2 = (Val & 0x200000);
@@ -665,7 +670,7 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
if (MO.isExpr())
return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
Fixups);
- int32_t offset = MO.getImm();
+ int64_t offset = MO.getImm();
uint32_t Val = 0x2000;
int SoImmVal;
@@ -772,8 +777,10 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
} else {
Reg = ARM::PC;
int32_t Offset = MO.getImm();
- // FIXME: Handle #-0.
- if (Offset < 0) {
+ if (Offset == INT32_MIN) {
+ Offset = 0;
+ isAdd = false;
+ } else if (Offset < 0) {
Offset *= -1;
isAdd = false;
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index caa1949..a99de0e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -12,30 +12,73 @@
//===----------------------------------------------------------------------===//
#include "ARMBaseInfo.h"
-#include "ARMELFStreamer.h"
#include "ARMMCAsmInfo.h"
#include "ARMMCTargetDesc.h"
#include "InstPrinter/ARMInstPrinter.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
#define GET_REGINFO_MC_DESC
#include "ARMGenRegisterInfo.inc"
+static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+ std::string &Info) {
+ if (STI.getFeatureBits() & llvm::ARM::HasV7Ops &&
+ (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) &&
+ (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) &&
+ // Checks for the deprecated CP15ISB encoding:
+ // mcr p15, #0, rX, c7, c5, #4
+ (MI.getOperand(3).isImm() && MI.getOperand(3).getImm() == 7)) {
+ if ((MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 4)) {
+ if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 5) {
+ Info = "deprecated since v7, use 'isb'";
+ return true;
+ }
+
+ // Checks for the deprecated CP15DSB encoding:
+ // mcr p15, #0, rX, c7, c10, #4
+ if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10) {
+ Info = "deprecated since v7, use 'dsb'";
+ return true;
+ }
+ }
+ // Checks for the deprecated CP15DMB encoding:
+ // mcr p15, #0, rX, c7, c10, #5
+ if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10 &&
+ (MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 5)) {
+ Info = "deprecated since v7, use 'dmb'";
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+ std::string &Info) {
+ if (STI.getFeatureBits() & llvm::ARM::HasV8Ops &&
+ MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) {
+ Info = "applying IT instruction to more than one subsequent instruction is deprecated";
+ return true;
+ }
+
+ return false;
+}
+
#define GET_INSTRINFO_MC_DESC
#include "ARMGenInstrInfo.inc"
#define GET_SUBTARGETINFO_MC_DESC
#include "ARMGenSubtargetInfo.inc"
-using namespace llvm;
std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
Triple triple(TT);
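
Editor's note: the two MCR deprecation helpers added in this hunk match the coprocessor operand tuple against the pre-v7 CP15 barrier idioms. Below is a self-contained sketch of the same lookup, detached from MCInst; the function name and the tiny driver are assumptions for illustration only.

#include <cstdio>

// Deprecated CP15 barrier idioms: mcr p15, #0, rX, c7, <CRm>, <opc2>,
// matched on the coprocessor operand tuple exactly as in the hunk above.
static const char *deprecatedBarrier(unsigned Coproc, unsigned Opc1,
                                     unsigned CRn, unsigned CRm, unsigned Opc2) {
  if (Coproc != 15 || Opc1 != 0 || CRn != 7)
    return nullptr;
  if (CRm == 5 && Opc2 == 4)
    return "isb"; // CP15ISB
  if (CRm == 10 && Opc2 == 4)
    return "dsb"; // CP15DSB
  if (CRm == 10 && Opc2 == 5)
    return "dmb"; // CP15DMB
  return nullptr;
}

int main() {
  if (const char *Repl = deprecatedBarrier(15, 0, 7, 10, 5))
    std::printf("deprecated since v7, use '%s'\n", Repl);
  return 0;
}
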
@@ -60,8 +103,13 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
if (Idx) {
unsigned SubVer = TT[Idx];
if (SubVer == '8') {
- // FIXME: Parse v8 features
- ARMArchFeature = "+v8";
+ if (NoCPU)
+ // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, FeatureMP,
+ // FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, FeatureT2XtPk, FeatureCrypto, FeatureCRC
+ ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,+trustzone,+t2xtpk,+crypto,+crc";
+ else
+ // Use CPU to figure out the exact features
+ ARMArchFeature = "+v8";
} else if (SubVer == '7') {
if (Len >= Idx+2 && TT[Idx+1] == 'm') {
isThumb = true;
@@ -106,7 +154,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
isThumb = true;
if (NoCPU)
// v6m: FeatureNoARM, FeatureMClass
- ARMArchFeature = "+v6,+noarm,+mclass";
+ ARMArchFeature = "+v6m,+noarm,+mclass";
else
ARMArchFeature = "+v6";
} else
@@ -307,6 +355,10 @@ extern "C" void LLVMInitializeARMTargetMC() {
TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmStreamer(TheARMTarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheThumbTarget, createMCAsmStreamer);
+
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 4e94c53..959be8b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -18,13 +18,16 @@
#include <string>
namespace llvm {
+class formatted_raw_ostream;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
+class MCInstPrinter;
class MCObjectWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
+class MCStreamer;
class MCRelocationInfo;
class StringRef;
class Target;
@@ -42,12 +45,19 @@ namespace ARM_MC {
StringRef FS);
}
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst);
+
MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
 /// createARMELFObjectWriter - Construct an ARM ELF object writer.
MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index b9efe74..1f681ba 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -20,10 +20,9 @@
#include "llvm/MC/MCMachOSymbolFlags.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
using namespace llvm;
-using namespace llvm::object;
namespace {
class ARMMachObjectWriter : public MCMachObjectTargetWriter {
@@ -63,7 +62,7 @@ public:
static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
unsigned &Log2Size) {
- RelocType = unsigned(macho::RIT_Vanilla);
+ RelocType = unsigned(MachO::ARM_RELOC_VANILLA);
Log2Size = ~0U;
switch (Kind) {
@@ -92,21 +91,21 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
case ARM::fixup_arm_uncondbl:
case ARM::fixup_arm_condbl:
case ARM::fixup_arm_blx:
- RelocType = unsigned(macho::RIT_ARM_Branch24Bit);
+ RelocType = unsigned(MachO::ARM_RELOC_BR24);
// Report as 'long', even though that is not quite accurate.
Log2Size = llvm::Log2_32(4);
return true;
// Handle Thumb branches.
case ARM::fixup_arm_thumb_br:
- RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+ RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
Log2Size = llvm::Log2_32(2);
return true;
case ARM::fixup_t2_uncondbranch:
case ARM::fixup_arm_thumb_bl:
case ARM::fixup_arm_thumb_blx:
- RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+ RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
Log2Size = llvm::Log2_32(4);
return true;
@@ -121,23 +120,23 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
// 1 - thumb instructions
case ARM::fixup_arm_movt_hi16:
case ARM::fixup_arm_movt_hi16_pcrel:
- RelocType = unsigned(macho::RIT_ARM_Half);
+ RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 1;
return true;
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movt_hi16_pcrel:
- RelocType = unsigned(macho::RIT_ARM_Half);
+ RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 3;
return true;
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_arm_movw_lo16_pcrel:
- RelocType = unsigned(macho::RIT_ARM_Half);
+ RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 0;
return true;
case ARM::fixup_t2_movw_lo16:
case ARM::fixup_t2_movw_lo16_pcrel:
- RelocType = unsigned(macho::RIT_ARM_Half);
+ RelocType = unsigned(MachO::ARM_RELOC_HALF);
Log2Size = 2;
return true;
}
@@ -153,7 +152,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
uint64_t &FixedValue) {
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
- unsigned Type = macho::RIT_ARM_Half;
+ unsigned Type = MachO::ARM_RELOC_HALF;
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -179,7 +178,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
"' can not be undefined in a subtraction expression");
// Select the appropriate difference relocation type.
- Type = macho::RIT_ARM_HalfDifference;
+ Type = MachO::ARM_RELOC_HALF_SECTDIFF;
Value2 = Writer->getSymbolAddress(B_SD, Layout);
FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
}
@@ -223,29 +222,29 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
break;
}
- if (Type == macho::RIT_ARM_HalfDifference) {
+ if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
uint32_t OtherHalf = MovtBit
? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
- macho::RelocationEntry MRE;
- MRE.Word0 = ((OtherHalf << 0) |
- (macho::RIT_Pair << 24) |
- (MovtBit << 28) |
- (ThumbBit << 29) |
- (IsPCRel << 30) |
- macho::RF_Scattered);
- MRE.Word1 = Value2;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((OtherHalf << 0) |
+ (MachO::ARM_RELOC_PAIR << 24) |
+ (MovtBit << 28) |
+ (ThumbBit << 29) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
Writer->addRelocation(Fragment->getParent(), MRE);
}
- macho::RelocationEntry MRE;
- MRE.Word0 = ((FixupOffset << 0) |
- (Type << 24) |
- (MovtBit << 28) |
- (ThumbBit << 29) |
- (IsPCRel << 30) |
- macho::RF_Scattered);
- MRE.Word1 = Value;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (MovtBit << 28) |
+ (ThumbBit << 29) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
Writer->addRelocation(Fragment->getParent(), MRE);
}
@@ -259,7 +258,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
uint64_t &FixedValue) {
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
- unsigned Type = macho::RIT_Vanilla;
+ unsigned Type = MachO::ARM_RELOC_VANILLA;
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -284,31 +283,31 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
"' can not be undefined in a subtraction expression");
// Select the appropriate difference relocation type.
- Type = macho::RIT_Difference;
+ Type = MachO::ARM_RELOC_SECTDIFF;
Value2 = Writer->getSymbolAddress(B_SD, Layout);
FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
}
// Relocations are written out in reverse order, so the PAIR comes first.
- if (Type == macho::RIT_Difference ||
- Type == macho::RIT_Generic_LocalDifference) {
- macho::RelocationEntry MRE;
- MRE.Word0 = ((0 << 0) |
- (macho::RIT_Pair << 24) |
- (Log2Size << 28) |
- (IsPCRel << 30) |
- macho::RF_Scattered);
- MRE.Word1 = Value2;
+ if (Type == MachO::ARM_RELOC_SECTDIFF ||
+ Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) {
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((0 << 0) |
+ (MachO::ARM_RELOC_PAIR << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
Writer->addRelocation(Fragment->getParent(), MRE);
}
- macho::RelocationEntry MRE;
- MRE.Word0 = ((FixupOffset << 0) |
- (Type << 24) |
- (Log2Size << 28) |
- (IsPCRel << 30) |
- macho::RF_Scattered);
- MRE.Word1 = Value;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
Writer->addRelocation(Fragment->getParent(), MRE);
}
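
Editor's note: the scattered-relocation path in this hunk packs offset, type, size, and PC-relativity into the first relocation word, with the scattered flag in the top bit. Below is a standalone sketch mirroring those shifts; the example type value assumes the ARM_RELOC_SECTDIFF encoding from the Mach-O headers.

#include <cstdint>
#include <cstdio>

// First word of a scattered Mach-O relocation, packed as in the hunk above:
// bits [0,23] offset, [24,27] type, [28,29] log2(size), [30] pc-rel, [31] scattered.
static uint32_t scatteredWord0(uint32_t FixupOffset, unsigned Type,
                               unsigned Log2Size, bool IsPCRel) {
  const uint32_t R_SCATTERED = 0x80000000u; // top bit marks a scattered entry
  return (FixupOffset << 0) | (uint32_t(Type) << 24) |
         (uint32_t(Log2Size) << 28) | (uint32_t(IsPCRel) << 30) | R_SCATTERED;
}

int main() {
  // A 4-byte, pc-relative section-difference relocation at offset 0x40
  // (ARM_RELOC_SECTDIFF is type 2 in the Mach-O headers).
  std::printf("0x%08x\n", scatteredWord0(0x40, 2, 2, true));
  return 0;
}
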
@@ -326,13 +325,13 @@ bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
switch (RelocType) {
default:
return false;
- case macho::RIT_ARM_Branch24Bit:
+ case MachO::ARM_RELOC_BR24:
// PC pre-adjustment of 8 for these instructions.
Value -= 8;
// ARM BL/BLX has a 25-bit offset.
Range = 0x1ffffff;
break;
- case macho::RIT_ARM_ThumbBranch22Bit:
+ case MachO::ARM_THUMB_RELOC_BR22:
// PC pre-adjustment of 4 for these instructions.
Value -= 4;
// Thumb BL/BLX has a 24-bit offset.
@@ -361,7 +360,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
uint64_t &FixedValue) {
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
unsigned Log2Size;
- unsigned RelocType = macho::RIT_Vanilla;
+ unsigned RelocType = MachO::ARM_RELOC_VANILLA;
if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size))
// If we failed to get fixup kind info, it's because there's no legal
// relocation type for the fixup kind. This happens when it's a fixup that's
@@ -374,7 +373,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
// scattered relocation entry. Differences always require scattered
// relocations.
if (Target.getSymB()) {
- if (RelocType == macho::RIT_ARM_Half)
+ if (RelocType == MachO::ARM_RELOC_HALF)
return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
Fixup, Target, FixedValue);
return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
@@ -392,7 +391,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
//
// Is this right for ARM?
uint32_t Offset = Target.getConstant();
- if (IsPCRel && RelocType == macho::RIT_Vanilla)
+ if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
Offset += 1 << Log2Size;
if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
@@ -445,17 +444,17 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
}
// struct relocation_info (8 bytes)
- macho::RelocationEntry MRE;
- MRE.Word0 = FixupOffset;
- MRE.Word1 = ((Index << 0) |
- (IsPCRel << 24) |
- (Log2Size << 25) |
- (IsExtern << 27) |
- (Type << 28));
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
// Even when it's not a scattered relocation, movw/movt always uses
// a PAIR relocation.
- if (Type == macho::RIT_ARM_Half) {
+ if (Type == MachO::ARM_RELOC_HALF) {
// The other-half value only gets populated for the movt and movw
// relocation entries.
uint32_t Value = 0;
@@ -474,11 +473,11 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
Value = FixedValue & 0xffff;
break;
}
- macho::RelocationEntry MREPair;
- MREPair.Word0 = Value;
- MREPair.Word1 = ((0xffffff) |
- (Log2Size << 25) |
- (macho::RIT_Pair << 28));
+ MachO::any_relocation_info MREPair;
+ MREPair.r_word0 = Value;
+ MREPair.r_word1 = ((0xffffff << 0) |
+ (Log2Size << 25) |
+ (MachO::ARM_RELOC_PAIR << 28));
Writer->addRelocation(Fragment->getParent(), MREPair);
}
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index db49db8..cfb33f5 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -127,7 +127,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
case ARM::LR:
if (Reg == FramePtr)
FramePtrSpillFI = FI;
- AFI->addGPRCalleeSavedArea1Frame(FI);
GPRCS1Size += 4;
break;
case ARM::R8:
@@ -136,16 +135,12 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
case ARM::R11:
if (Reg == FramePtr)
FramePtrSpillFI = FI;
- if (STI.isTargetIOS()) {
- AFI->addGPRCalleeSavedArea2Frame(FI);
+ if (STI.isTargetIOS())
GPRCS2Size += 4;
- } else {
- AFI->addGPRCalleeSavedArea1Frame(FI);
+ else
GPRCS1Size += 4;
- }
break;
default:
- AFI->addDPRCalleeSavedAreaFrame(FI);
DPRCSSize += 8;
}
}
@@ -169,10 +164,17 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
NumBytes = DPRCSOffset;
+ int FramePtrOffsetInBlock = 0;
+ if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) {
+ FramePtrOffsetInBlock = NumBytes;
+ NumBytes = 0;
+ }
+
   // Adjust FP so it points to the stack slot that contains the previous FP.
if (HasFP) {
+ FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
- .addFrameIndex(FramePtrSpillFI).addImm(0)
+ .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
.setMIFlags(MachineInstr::FrameSetup));
if (NumBytes > 508)
// If offset is > 508 then sp cannot be adjusted in a single instruction,
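
Editor's note: the rewritten FP setup above switches from a frame-index operand to an explicit add from SP, and tADDrSPi encodes its immediate in words, hence the division by 4. A quick standalone illustration with made-up offsets:

#include <cassert>
#include <cstdio>

int main() {
  // Made-up byte offset from the adjusted SP down to the saved-FP stack slot:
  // the folded SP adjustment plus the object offset and GPR save area above it.
  int FramePtrOffsetInBlock = 8 + 4;
  assert(FramePtrOffsetInBlock % 4 == 0 && "tADDrSPi encodes its immediate in words");
  std::printf("add fp, sp, #%d  ; encoded immediate field = %d\n",
              FramePtrOffsetInBlock, FramePtrOffsetInBlock / 4);
  return 0;
}
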
@@ -213,13 +215,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
AFI->setShouldRestoreSPFromFP(true);
}
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
- for (unsigned i = 0; CSRegs[i]; ++i)
- if (Reg == CSRegs[i])
- return true;
- return false;
-}
-
static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
if (MI->getOpcode() == ARM::tLDRspi &&
MI->getOperand(1).isFI() &&
@@ -296,8 +291,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
&MBB.front() != MBBI &&
prior(MBBI)->getOpcode() == ARM::tPOP) {
MachineBasicBlock::iterator PMBBI = prior(MBBI);
- emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
- } else
+ if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes))
+ emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+ } else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
}
}
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 6722614..65a7221 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -426,7 +426,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
*this);
} else {
// Translate r0 = add sp, -imm to
- // r0 = -imm (this is then translated into a series of instructons)
+ // r0 = -imm (this is then translated into a series of instructions)
// r0 = add r0, sp
emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
@@ -573,11 +573,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
MF.getFrameInfo()->getStackSize() + SPAdj;
- if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea1Offset();
- else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
- Offset -= AFI->getGPRCalleeSavedArea2Offset();
- else if (MF.getFrameInfo()->hasVarSizedObjects()) {
+ if (MF.getFrameInfo()->hasVarSizedObjects()) {
assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
"Unexpected");
// There are alloca()'s in this function, must reference off the frame
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index d8596d7..0b7d3bb 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -28,6 +28,7 @@ namespace {
static char ID;
Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+ bool restrictIT;
const Thumb2InstrInfo *TII;
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
@@ -192,37 +193,42 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
// Form IT block.
ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
unsigned Mask = 0, Pos = 3;
- // Branches, including tricky ones like LDM_RET, need to end an IT
- // block so check the instruction we just put in the block.
- for (; MBBI != E && Pos &&
- (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
- if (MBBI->isDebugValue())
- continue;
-
- MachineInstr *NMI = &*MBBI;
- MI = NMI;
-
- unsigned NPredReg = 0;
- ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
- if (NCC == CC || NCC == OCC) {
- Mask |= (NCC & 1) << Pos;
- // Add implicit use of ITSTATE.
- NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
- true/*isImp*/, false/*isKill*/));
- LastITMI = NMI;
- } else {
- if (NCC == ARMCC::AL &&
- MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
- --MBBI;
- MBB.remove(NMI);
- MBB.insert(InsertPos, NMI);
- ++NumMovedInsts;
+
+ // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it
+ // is set: skip the loop
+ if (!restrictIT) {
+ // Branches, including tricky ones like LDM_RET, need to end an IT
+ // block so check the instruction we just put in the block.
+ for (; MBBI != E && Pos &&
+ (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+ if (MBBI->isDebugValue())
continue;
+
+ MachineInstr *NMI = &*MBBI;
+ MI = NMI;
+
+ unsigned NPredReg = 0;
+ ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+ if (NCC == CC || NCC == OCC) {
+ Mask |= (NCC & 1) << Pos;
+ // Add implicit use of ITSTATE.
+ NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+ true/*isImp*/, false/*isKill*/));
+ LastITMI = NMI;
+ } else {
+ if (NCC == ARMCC::AL &&
+ MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+ --MBBI;
+ MBB.remove(NMI);
+ MBB.insert(InsertPos, NMI);
+ ++NumMovedInsts;
+ continue;
+ }
+ break;
}
- break;
+ TrackDefUses(NMI, Defs, Uses, TRI);
+ --Pos;
}
- TrackDefUses(NMI, Defs, Uses, TRI);
- --Pos;
}
// Finalize IT mask.
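
Editor's note: the loop kept inside the new restrictIT guard accumulates the mask from bit 3 downward, one bit per following instruction, taken from the low bit of that instruction's condition code. Below is a standalone sketch of just that accumulation, leaving out the finalization the pass performs afterwards (the inputs are invented).

#include <cstdio>

int main() {
  // Instructions following the first one in the block: 't' shares the block's
  // condition, 'e' takes the opposite (condition codes differ in their low bit).
  const char Following[] = {'t', 'e', 't'};
  unsigned CCLowBit = 0; // low bit of the block condition's encoding (made up)
  unsigned Mask = 0, Pos = 3;

  for (unsigned i = 0; i != sizeof(Following); ++i) {
    unsigned NCCLowBit = (Following[i] == 't') ? CCLowBit : (CCLowBit ^ 1);
    Mask |= (NCCLowBit & 1) << Pos; // same shift as in the pass
    --Pos;
  }
  std::printf("partial IT mask = 0x%x (finalization not shown)\n", Mask);
  return 0;
}
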
@@ -250,6 +256,7 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
AFI = Fn.getInfo<ARMFunctionInfo>();
TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
TRI = TM.getRegisterInfo();
+ restrictIT = TM.getSubtarget<ARMSubtarget>().restrictIT();
if (!AFI->isThumbFunction())
return false;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 286eaa0..91788ac 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -36,7 +36,8 @@ Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
- NopInst.setOpcode(ARM::tNOP);
+ NopInst.setOpcode(ARM::tHINT);
+ NopInst.addOperand(MCOperand::CreateImm(0));
NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
NopInst.addOperand(MCOperand::CreateReg(0));
}
@@ -214,6 +215,13 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
unsigned DestReg, unsigned BaseReg, int NumBytes,
ARMCC::CondCodes Pred, unsigned PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+ if (NumBytes == 0 && DestReg != BaseReg) {
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+ return;
+ }
+
bool isSub = NumBytes < 0;
if (isSub) NumBytes = -NumBytes;
@@ -334,6 +342,7 @@ negativeOffsetOpcode(unsigned opcode)
case ARM::t2STRi12: return ARM::t2STRi8;
case ARM::t2STRBi12: return ARM::t2STRBi8;
case ARM::t2STRHi12: return ARM::t2STRHi8;
+ case ARM::t2PLDi12: return ARM::t2PLDi8;
case ARM::t2LDRi8:
case ARM::t2LDRHi8:
@@ -343,6 +352,7 @@ negativeOffsetOpcode(unsigned opcode)
case ARM::t2STRi8:
case ARM::t2STRBi8:
case ARM::t2STRHi8:
+ case ARM::t2PLDi8:
return opcode;
default:
@@ -364,6 +374,7 @@ positiveOffsetOpcode(unsigned opcode)
case ARM::t2STRi8: return ARM::t2STRi12;
case ARM::t2STRBi8: return ARM::t2STRBi12;
case ARM::t2STRHi8: return ARM::t2STRHi12;
+ case ARM::t2PLDi8: return ARM::t2PLDi12;
case ARM::t2LDRi12:
case ARM::t2LDRHi12:
@@ -373,6 +384,7 @@ positiveOffsetOpcode(unsigned opcode)
case ARM::t2STRi12:
case ARM::t2STRBi12:
case ARM::t2STRHi12:
+ case ARM::t2PLDi12:
return opcode;
default:
@@ -394,6 +406,7 @@ immediateOffsetOpcode(unsigned opcode)
case ARM::t2STRs: return ARM::t2STRi12;
case ARM::t2STRBs: return ARM::t2STRBi12;
case ARM::t2STRHs: return ARM::t2STRHi12;
+ case ARM::t2PLDs: return ARM::t2PLDi12;
case ARM::t2LDRi12:
case ARM::t2LDRHi12:
@@ -403,6 +416,7 @@ immediateOffsetOpcode(unsigned opcode)
case ARM::t2STRi12:
case ARM::t2STRBi12:
case ARM::t2STRHi12:
+ case ARM::t2PLDi12:
case ARM::t2LDRi8:
case ARM::t2LDRHi8:
case ARM::t2LDRBi8:
@@ -411,6 +425,7 @@ immediateOffsetOpcode(unsigned opcode)
case ARM::t2STRi8:
case ARM::t2STRBi8:
case ARM::t2STRHi8:
+ case ARM::t2PLDi8:
return opcode;
default:
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 0ddcad2..ddc7a66 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include <algorithm>
+#include <cctype>
#include <cstdio>
#include <map>
#include <set>
@@ -291,8 +292,6 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
Out << "GlobalValue::LinkOnceAnyLinkage "; break;
case GlobalValue::LinkOnceODRLinkage:
Out << "GlobalValue::LinkOnceODRLinkage "; break;
- case GlobalValue::LinkOnceODRAutoHideLinkage:
- Out << "GlobalValue::LinkOnceODRAutoHideLinkage"; break;
case GlobalValue::WeakAnyLinkage:
Out << "GlobalValue::WeakAnyLinkage"; break;
case GlobalValue::WeakODRLinkage:
@@ -497,6 +496,7 @@ void CppWriter::printAttributes(const AttributeSet &PAL,
HANDLE_ATTR(ReadOnly);
HANDLE_ATTR(NoInline);
HANDLE_ATTR(AlwaysInline);
+ HANDLE_ATTR(OptimizeNone);
HANDLE_ATTR(OptimizeForSize);
HANDLE_ATTR(StackProtect);
HANDLE_ATTR(StackProtectReq);
@@ -1139,7 +1139,7 @@ void CppWriter::printInstruction(const Instruction *I,
nl(Out);
for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
- const IntegersSubset CaseVal = i.getCaseValueEx();
+ const ConstantInt* CaseVal = i.getCaseValue();
const BasicBlock *BB = i.getCaseSuccessor();
Out << iName << "->addCase("
<< getOpName(CaseVal) << ", "
@@ -1160,8 +1160,7 @@ void CppWriter::printInstruction(const Instruction *I,
break;
}
case Instruction::Resume: {
- Out << "ResumeInst::Create(mod->getContext(), " << opNames[0]
- << ", " << bbname << ");";
+ Out << "ResumeInst::Create(" << opNames[0] << ", " << bbname << ");";
break;
}
case Instruction::Invoke: {
@@ -1175,7 +1174,7 @@ void CppWriter::printInstruction(const Instruction *I,
}
// FIXME: This shouldn't use magic numbers -3, -2, and -1.
Out << "InvokeInst *" << iName << " = InvokeInst::Create("
- << getOpName(inv->getCalledFunction()) << ", "
+ << getOpName(inv->getCalledValue()) << ", "
<< getOpName(inv->getNormalDest()) << ", "
<< getOpName(inv->getUnwindDest()) << ", "
<< iName << "_params, \"";
@@ -1589,6 +1588,20 @@ void CppWriter::printInstruction(const Instruction *I,
Out << "\");";
break;
}
+ case Instruction::LandingPad: {
+ const LandingPadInst *lpi = cast<LandingPadInst>(I);
+ Out << "LandingPadInst* " << iName << " = LandingPadInst::Create(";
+ printCppName(lpi->getType());
+ Out << ", " << opNames[0] << ", " << lpi->getNumClauses() << ", \"";
+ printEscapedString(lpi->getName());
+ Out << "\", " << bbname << ");";
+ nl(Out) << iName << "->setCleanup("
+ << (lpi->isCleanup() ? "true" : "false")
+ << ");";
+ for (unsigned i = 0, e = lpi->getNumClauses(); i != e; ++i)
+ nl(Out) << iName << "->addClause(" << opNames[i+1] << ");";
+ break;
+ }
}
DefinedValues.insert(I);
nl(Out);
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 2b79791..ae3c9eb 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_target(HexagonCodeGen
HexagonFrameLowering.cpp
HexagonHardwareLoops.cpp
HexagonFixupHwLoops.cpp
+ HexagonMachineFunctionInfo.cpp
HexagonMachineScheduler.cpp
HexagonMCInstLower.cpp
HexagonInstrInfo.cpp
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 88cd3fb..a2e04ba 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -99,7 +99,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
return;
case MachineOperand::MO_GlobalAddress:
// Computing the address of a global symbol, not calling it.
- O << *Mang->getSymbol(MO.getGlobal());
+ O << *getSymbol(MO.getGlobal());
printOffset(MO.getOffset(), O);
return;
}
@@ -267,7 +267,7 @@ void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo,
assert( (MO.getType() == MachineOperand::MO_GlobalAddress) &&
"Expecting global address");
- O << *Mang->getSymbol(MO.getGlobal());
+ O << *getSymbol(MO.getGlobal());
if (MO.getOffset() != 0) {
O << " + ";
O << MO.getOffset();
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 3c4ca0f..52d5ab2 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -871,7 +871,7 @@ bool HexagonHardwareLoops::isInvalidLoopOperation(
/// \brief - Return true if the loop contains an instruction that inhibits
/// the use of the hardware loop function.
bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const {
- const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+ const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *MBB = Blocks[i];
for (MachineBasicBlock::iterator
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9e78e51..5ae9328 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1344,8 +1344,10 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
- if (N->isMachineOpcode())
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
return NULL; // Already selected.
+ }
switch (N->getOpcode()) {
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 567faca..1374179 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -39,13 +39,24 @@
using namespace llvm;
-const unsigned Hexagon_MAX_RET_SIZE = 64;
-
static cl::opt<bool>
EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
cl::desc("Control jump table emission on Hexagon target"));
-int NumNamedVarArgParams = -1;
+namespace {
+class HexagonCCState : public CCState {
+ int NumNamedVarArgParams;
+
+public:
+ HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+ const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
+ LLVMContext &C, int NumNamedVarArgParams)
+ : CCState(CC, isVarArg, MF, TM, locs, C),
+ NumNamedVarArgParams(NumNamedVarArgParams) {}
+
+ int getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
+};
+}
// Implement calling convention for Hexagon.
static bool
@@ -82,12 +93,13 @@ static bool
CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
MVT LocVT, CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ HexagonCCState &HState = static_cast<HexagonCCState &>(State);
// NumNamedVarArgParams can not be zero for a VarArg function.
- assert ( (NumNamedVarArgParams > 0) &&
- "NumNamedVarArgParams is not bigger than zero.");
+ assert((HState.getNumNamedVarArgParams() > 0) &&
+ "NumNamedVarArgParams is not bigger than zero.");
- if ( (int)ValNo < NumNamedVarArgParams ) {
+ if ((int)ValNo < HState.getNumNamedVarArgParams()) {
// Deal with named arguments.
return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
}
@@ -394,13 +406,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
- // Analyze operands of the call, assigning locations to each operand.
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
-
// Check for varargs.
- NumNamedVarArgParams = -1;
+ int NumNamedVarArgParams = -1;
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
{
const Function* CalleeFn = NULL;
@@ -417,6 +424,12 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ HexagonCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+ getTargetMachine(), ArgLocs, *DAG.getContext(),
+ NumNamedVarArgParams);
+
if (NumNamedVarArgParams > 0)
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
else
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 4fe0107..73da226 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -141,8 +141,11 @@ namespace llvm {
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
- virtual EVT getSetCCResultType(LLVMContext &, EVT) const {
- return MVT::i1;
+ virtual EVT getSetCCResultType(LLVMContext &C, EVT VT) const {
+ if (!VT.isVector())
+ return MVT::i1;
+ else
+ return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
}
virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
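
Editor's note: the change above makes vector compares produce an i1 vector with the same lane count instead of a bare i1. A toy stand-in for that rule, with types modelled as a small struct rather than LLVM's EVT:

#include <cstdio>
#include <string>

// Scalar compares yield i1; vector compares yield a vector of i1 with the
// same number of lanes, mirroring the hunk above.
struct ToyVT {
  unsigned NumElts; // 0 means scalar
  bool isVector() const { return NumElts != 0; }
};

static std::string setCCResultType(ToyVT VT) {
  if (!VT.isVector())
    return "i1";
  return "v" + std::to_string(VT.NumElts) + "i1";
}

int main() {
  ToyVT Scalar = {0}, Vec4 = {4};
  std::printf("%s %s\n", setCCResultType(Scalar).c_str(),
              setCCResultType(Vec4).c_str()); // prints: i1 v4i1
  return 0;
}
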
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index e71386a..d25bfa8 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -63,7 +63,7 @@ class MemAccessSize<bits<3> value> {
def NoMemAccess : MemAccessSize<0>;// Not a memory access instruction.
def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb).
def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh).
-def WordAccess : MemAccessSize<3>;// Word access instrution (memw).
+def WordAccess : MemAccessSize<3>;// Word access instruction (memw).
def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 5af645c..6b97609 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -26,7 +26,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#define GET_INSTRMAP_INFO
#include "HexagonGenInstrInfo.inc"
#include "HexagonGenDFAPacketizer.inc"
@@ -55,6 +55,8 @@ const int Hexagon_MEMH_AUTOINC_MIN = -16;
const int Hexagon_MEMB_AUTOINC_MAX = 7;
const int Hexagon_MEMB_AUTOINC_MIN = -8;
+// Pin the vtable to this file.
+void HexagonInstrInfo::anchor() {}
HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
: HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 3c28df4..3f45b8b 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -26,6 +26,7 @@
namespace llvm {
class HexagonInstrInfo : public HexagonGenInstrInfo {
+ virtual void anchor();
const HexagonRegisterInfo RI;
const HexagonSubtarget &Subtarget;
typedef unsigned Opcode_t;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index fee83fb..475c23d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -3390,4 +3390,3 @@ def : Pat<(i32 (load FoldGlobalAddrGP:$addr)),
def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr),
(i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
Requires<[HasV4T]>;
-
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index f011d51..bbb2fa4 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -73,7 +73,7 @@ void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI,
AP.OutContext));
break;
case MachineOperand::MO_GlobalAddress:
- MCO = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP);
+ MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP);
break;
case MachineOperand::MO_ExternalSymbol:
MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
new file mode 100644
index 0000000..9579c8b
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void HexagonMachineFunctionInfo::anchor() {}
+
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index bd7b26a..a59c8c9 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- HexagonMachineFuctionInfo.h - Hexagon machine function info --*- C++ -*-=//
+//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -10,6 +10,7 @@
#ifndef HexagonMACHINEFUNCTIONINFO_H
#define HexagonMACHINEFUNCTIONINFO_H
+#include <map>
#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -30,9 +31,8 @@ class HexagonMachineFunctionInfo : public MachineFunctionInfo {
int VarArgsFrameIndex;
bool HasClobberLR;
bool HasEHReturn;
-
std::map<const MachineInstr*, unsigned> PacketInfo;
-
+ virtual void anchor();
public:
HexagonMachineFunctionInfo() : SRetReturnReg(0), HasClobberLR(0),
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 10bb3e9..c94f081 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -407,11 +407,11 @@ SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
#ifndef NDEBUG
void ConvergingVLIWScheduler::traceCandidate(const char *Label,
const ReadyQueue &Q,
- SUnit *SU, PressureElement P) {
+ SUnit *SU, PressureChange P) {
dbgs() << Label << " " << Q.getName() << " ";
if (P.isValid())
- dbgs() << DAG->TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
- << " ";
+ dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+ << P.getUnitInc() << " ";
else
dbgs() << " ";
SU->dump(DAG);
@@ -457,9 +457,7 @@ static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
// Constants used to denote relative importance of
// heuristic components for cost computation.
static const unsigned PriorityOne = 200;
-static const unsigned PriorityTwo = 100;
-static const unsigned PriorityThree = 50;
-static const unsigned PriorityFour = 20;
+static const unsigned PriorityTwo = 50;
static const unsigned ScaleTwo = 10;
static const unsigned FactorOne = 2;
@@ -517,8 +515,8 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
ResCount += (NumNodesBlocking * ScaleTwo);
// Factor in reg pressure as a heuristic.
- ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
- ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+ ResCount -= (Delta.Excess.getUnitInc()*PriorityTwo);
+ ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityTwo);
DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 171193e..8ac333f 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -233,7 +233,7 @@ protected:
SchedCandidate &Candidate);
#ifndef NDEBUG
void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
- PressureElement P = PressureElement());
+ PressureChange P = PressureChange());
#endif
};
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 89e3406..5490ecd 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -29,7 +29,7 @@
//
// Note: The peephole pass makes the instrucstions like
// %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
-// redundant and relies on some form of dead removal instrucions, like
+// redundant and relies on some form of dead removal instructions, like
// DCE or DIE to actually eliminate them.
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d5ca4d7..1786e9d 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -295,13 +295,5 @@ unsigned HexagonRegisterInfo::getStackRegister() const {
return Hexagon::R29;
}
-unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
- llvm_unreachable("What is the exception register");
-}
-
-unsigned HexagonRegisterInfo::getEHHandlerRegister() const {
- llvm_unreachable("What is the exception handler register");
-}
-
#define GET_REGINFO_TARGET_DESC
#include "HexagonGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index fec86df..89af7c3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -78,10 +78,6 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getFrameRegister() const;
unsigned getStackRegister() const;
-
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 3bf2f20..5166f8e 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -16,33 +16,31 @@
// scheduled after register allocation.
//
//===----------------------------------------------------------------------===//
+
#define DEBUG_TYPE "xfer"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+
+#include "HexagonTargetMachine.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
-#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "HexagonTargetMachine.h"
-#include "HexagonSubtarget.h"
-#include "HexagonMachineFunctionInfo.h"
#include <map>
-#include <iostream>
-
-#include "llvm/Support/CommandLine.h"
-#define DEBUG_TYPE "xfer"
-
using namespace llvm;
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 07d5ce1..fca6707 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -86,3 +86,5 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
ModeIEEERndNear = false;
}
+// Pin the vtable to this file.
+void HexagonSubtarget::anchor() {}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 76a8fba..690bef0 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -27,7 +27,7 @@
namespace llvm {
class HexagonSubtarget : public HexagonGenSubtargetInfo {
-
+ virtual void anchor();
bool UseMemOps;
bool ModeIEEERndNear;
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index cd96b58..bb950a0 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -102,17 +102,25 @@ class HexagonPassConfig : public TargetPassConfig {
public:
HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
- // Enable MI scheduler.
- if (!DisableHexagonMISched) {
+ // FIXME: Rather than calling enablePass(&MachineSchedulerID) below, define
+ // HexagonSubtarget::enableMachineScheduler() { return true; }.
+ // That will bypass the SelectionDAG VLIW scheduler, which is probably just
+ // hurting compile time and will be removed eventually anyway.
+ if (DisableHexagonMISched)
+ disablePass(&MachineSchedulerID);
+ else
enablePass(&MachineSchedulerID);
- MachineSchedRegistry::setDefault(createVLIWMachineSched);
- }
}
HexagonTargetMachine &getHexagonTargetMachine() const {
return getTM<HexagonTargetMachine>();
}
+ virtual ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const {
+ return createVLIWMachineSched(C);
+ }
+
virtual bool addInstSelector();
virtual bool addPreRegAlloc();
virtual bool addPostRegAlloc();
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 2ea0d2e..7c41507 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -21,7 +21,6 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
using namespace llvm;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index e0f5a27..8519cf3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -73,7 +73,7 @@ namespace HexagonII {
     NoMemAccess = 0,            // Not a memory access instruction.
ByteAccess = 1, // Byte access instruction (memb).
HalfWordAccess = 2, // Half word access instruction (memh).
- WordAccess = 3, // Word access instrution (memw).
+ WordAccess = 3, // Word access instruction (memw).
DoubleWordAccess = 4 // Double word access instruction (memd)
};
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 495dbb9..3f9415b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -15,6 +15,9 @@
using namespace llvm;
+// Pin the vtable to this file.
+void HexagonMCAsmInfo::anchor() {}
+
HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
@@ -29,7 +32,6 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
InlineAsmEnd = "# InlineAsm End";
ZeroDirective = "\t.space\t";
AscizDirective = "\t.string\t";
- WeakRefDirective = "\t.weak\t";
SupportsDebugInformation = true;
UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index 0b94d21..bd8cb76 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -15,10 +15,11 @@
#define HexagonMCASMINFO_H
#include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
- class HexagonMCAsmInfo : public MCAsmInfo {
+ class HexagonMCAsmInfo : public MCAsmInfoELF {
+ virtual void anchor();
public:
explicit HexagonMCAsmInfo(StringRef TT);
};
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index d213a45..acf2ab8 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -21,11 +21,8 @@ MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
PointerSize = CalleeSaveStackSlotSize = 2;
PrivateGlobalPrefix = ".L";
- WeakRefDirective ="\t.weak\t";
- PCSymbol=".";
CommentString = ";";
AlignmentIsInBytes = false;
- AllowNameToStartWithDigit = true;
UsesELFSectionDirectiveForBSS = true;
}
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index feb040d..a7e0e58 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -14,12 +14,12 @@
#ifndef MSP430TARGETASMINFO_H
#define MSP430TARGETASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class StringRef;
- class MSP430MCAsmInfo : public MCAsmInfo {
+ class MSP430MCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit MSP430MCAsmInfo(StringRef TT);
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 0a04e5d..18311c3 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -92,7 +92,7 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
if (Offset)
O << '(' << Offset << '+';
- O << *Mang->getSymbol(MO.getGlobal());
+ O << *getSymbol(MO.getGlobal());
if (Offset)
O << ')';
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index b448cc4..8a69d1e 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -23,18 +23,15 @@ def RetCC_MSP430 : CallingConv<[
//===----------------------------------------------------------------------===//
// MSP430 Argument Calling Conventions
//===----------------------------------------------------------------------===//
-def CC_MSP430 : CallingConv<[
+def CC_MSP430_AssignStack : CallingConv<[
// Pass by value if the byval attribute is given
CCIfByVal<CCPassByVal<2, 2>>,
// Promote i8 arguments to i16.
CCIfType<[i8], CCPromoteToType<i16>>,
- // The first 4 integer arguments of non-varargs functions are passed in
- // integer registers.
- CCIfNotVarArg<CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>>,
-
// Integer values get stored in stack slots that are 2 bytes in
// size and 2-byte aligned.
CCIfType<[i16], CCAssignToStack<2, 2>>
]>;
+
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index c673f59..8370714 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -27,8 +27,8 @@ protected:
public:
explicit MSP430FrameLowering(const MSP430Subtarget &sti)
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), STI(sti) {
- }
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2),
+ STI(sti) {}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 543f54c..4152829 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -395,6 +395,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
DEBUG(errs() << "== ";
Node->dump(CurDAG);
errs() << "\n");
+ Node->setNodeId(-1);
return NULL;
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 803e899..745cdf5 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -45,7 +45,7 @@ typedef enum {
} HWMultUseMode;
static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode",
+HWMultMode("msp430-hwmult-mode", cl::Hidden,
cl::desc("Hardware multiplier use mode"),
cl::init(HWMultNoIntr),
cl::values(
@@ -250,6 +250,123 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
#include "MSP430GenCallingConv.inc"
+/// For each argument in a function, store the number of pieces it is
+/// composed of.
+template<typename ArgT>
+static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
+ SmallVectorImpl<unsigned> &Out) {
+ unsigned CurrentArgIndex = ~0U;
+ for (unsigned i = 0, e = Args.size(); i != e; i++) {
+ if (CurrentArgIndex == Args[i].OrigArgIndex) {
+ Out.back()++;
+ } else {
+ Out.push_back(1);
+ CurrentArgIndex++;
+ }
+ }
+}
+
+static void AnalyzeVarArgs(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ State.AnalyzeCallOperands(Outs, CC_MSP430_AssignStack);
+}
+
+static void AnalyzeVarArgs(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) {
+ State.AnalyzeFormalArguments(Ins, CC_MSP430_AssignStack);
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of split arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+template<typename ArgT>
+static void AnalyzeArguments(CCState &State,
+ SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<ArgT> &Args) {
+ static const uint16_t RegList[] = {
+ MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W
+ };
+ static const unsigned NbRegs = array_lengthof(RegList);
+
+ if (State.isVarArg()) {
+ AnalyzeVarArgs(State, Args);
+ return;
+ }
+
+ SmallVector<unsigned, 4> ArgsParts;
+ ParseFunctionArgs(Args, ArgsParts);
+
+ unsigned RegsLeft = NbRegs;
+ bool UseStack = false;
+ unsigned ValNo = 0;
+
+ for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
+ MVT ArgVT = Args[ValNo].VT;
+ ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+ MVT LocVT = ArgVT;
+ CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+ // Promote i8 to i16
+ if (LocVT == MVT::i8) {
+ LocVT = MVT::i16;
+ if (ArgFlags.isSExt())
+ LocInfo = CCValAssign::SExt;
+ else if (ArgFlags.isZExt())
+ LocInfo = CCValAssign::ZExt;
+ else
+ LocInfo = CCValAssign::AExt;
+ }
+
+ // Handle byval arguments
+ if (ArgFlags.isByVal()) {
+ State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+ continue;
+ }
+
+ unsigned Parts = ArgsParts[i];
+
+ if (!UseStack && Parts <= RegsLeft) {
+ unsigned FirstVal = ValNo;
+ for (unsigned j = 0; j < Parts; j++) {
+ unsigned Reg = State.AllocateReg(RegList, NbRegs);
+ State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+ RegsLeft--;
+ }
+
+ // Reverse the order of the pieces to agree with the "big endian" format
+ // required in the calling convention ABI.
+ SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
+ std::reverse(B, B + Parts);
+ } else {
+ UseStack = true;
+ for (unsigned j = 0; j < Parts; j++)
+ CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+ }
+ }
+}
+
+static void AnalyzeRetResult(CCState &State,
+ const SmallVectorImpl<ISD::InputArg> &Ins) {
+ State.AnalyzeCallResult(Ins, RetCC_MSP430);
+}
+
+static void AnalyzeRetResult(CCState &State,
+ const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ State.AnalyzeReturn(Outs, RetCC_MSP430);
+}
+
+template<typename ArgT>
+static void AnalyzeReturnValues(CCState &State,
+ SmallVectorImpl<CCValAssign> &RVLocs,
+ const SmallVectorImpl<ArgT> &Args) {
+ AnalyzeRetResult(State, Args);
+
+ // Reverse split return values to get the "big endian" format required
+ // to agree with the calling convention ABI.
+ std::reverse(RVLocs.begin(), RVLocs.end());
+}
+
SDValue
MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
@@ -325,7 +442,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
+ AnalyzeArguments(CCInfo, ArgLocs, Ins);
// Create frame index for the start of the first vararg value
if (isVarArg) {
@@ -423,7 +540,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
getTargetMachine(), RVLocs, *DAG.getContext());
// Analyze return values.
- CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
+ AnalyzeReturnValues(CCInfo, RVLocs, Outs);
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -471,8 +588,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
-
- CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
+ AnalyzeArguments(CCInfo, ArgLocs, Outs);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -610,7 +726,7 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), RVLocs, *DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
+ AnalyzeReturnValues(CCInfo, RVLocs, Ins);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
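A minimal sketch of the assignment strategy used by the AnalyzeArguments helper added above (hypothetical names; not code from this change): all pieces of one argument go to registers only when they all fit, the register-assigned pieces are then reversed, and otherwise every piece is pushed onto 2-byte stack slots.

#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// One location per argument piece: a register name or a 2-byte stack slot.
struct PieceLoc {
  bool OnStack;
  std::string Reg;      // valid when !OnStack
  unsigned StackOffset; // valid when OnStack
};

// Hypothetical sketch of the strategy in AnalyzeArguments: the pieces of one
// argument are assigned to registers only if they all fit, and are then
// reversed; otherwise every piece goes to the stack. (The real code also
// stops using registers for all later arguments once one argument has
// spilled to the stack.)
static void assignPieces(unsigned Parts, const std::vector<std::string> &Regs,
                         std::size_t &NextReg, unsigned &NextStackOffset,
                         std::vector<PieceLoc> &Locs) {
  if (Parts <= Regs.size() - NextReg) {
    std::size_t First = Locs.size();
    for (unsigned i = 0; i < Parts; ++i)
      Locs.push_back({false, Regs[NextReg++], 0});
    // Reverse the pieces of this argument, as the MSP430 lowering does.
    std::reverse(Locs.begin() + First, Locs.end());
  } else {
    for (unsigned i = 0; i < Parts; ++i) {
      Locs.push_back({true, "", NextStackOffset});
      NextStackOffset += 2; // 2-byte, 2-byte-aligned slots.
    }
  }
}

With Regs = {"R15W", "R14W", "R13W", "R12W"}, a two-piece argument is recorded as R14W then R15W in this sketch, i.e. the pieces end up in reversed order.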
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index c850594..7a0b00a 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -22,11 +22,14 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "MSP430GenInstrInfo.inc"
using namespace llvm;
+// Pin the vtable to this file.
+void MSP430InstrInfo::anchor() {}
+
MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
: MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
RI(tm) {}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index d79f992..ad2b8cc 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -42,6 +42,7 @@ namespace MSP430II {
class MSP430InstrInfo : public MSP430GenInstrInfo {
const MSP430RegisterInfo RI;
+ virtual void anchor();
public:
explicit MSP430InstrInfo(MSP430TargetMachine &TM);
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 043e5be..52f9ee5 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -33,7 +33,7 @@ GetGlobalAddressSymbol(const MachineOperand &MO) const {
case 0: break;
}
- return Printer.Mang->getSymbol(MO.getGlobal());
+ return Printer.getSymbol(MO.getGlobal());
}
MCSymbol *MSP430MCInstLower::
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index f86428c..38be25c 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -23,84 +23,6 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static bool isAcceptableChar(char C, bool AllowPeriod, bool AllowUTF8) {
- if ((C < 'a' || C > 'z') &&
- (C < 'A' || C > 'Z') &&
- (C < '0' || C > '9') &&
- C != '_' && C != '$' && C != '@' &&
- !(AllowPeriod && C == '.') &&
- !(AllowUTF8 && (C & 0x80)))
- return false;
- return true;
-}
-
-static char HexDigit(int V) {
- return V < 10 ? V+'0' : V+'A'-10;
-}
-
-static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) {
- OutName.push_back('_');
- OutName.push_back(HexDigit(C >> 4));
- OutName.push_back(HexDigit(C & 15));
- OutName.push_back('_');
-}
-
-/// NameNeedsEscaping - Return true if the identifier \p Str needs quotes
-/// for this assembler.
-static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo *MAI) {
- assert(!Str.empty() && "Cannot create an empty MCSymbol");
-
- // If the first character is a number and the target does not allow this, we
- // need quotes.
- if (!MAI->doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9')
- return true;
-
- // If any of the characters in the string is an unacceptable character, force
- // quotes.
- bool AllowPeriod = MAI->doesAllowPeriodsInName();
- bool AllowUTF8 = MAI->doesAllowUTF8();
- for (unsigned i = 0, e = Str.size(); i != e; ++i)
- if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
- return true;
- return false;
-}
-
-/// appendMangledName - Add the specified string in mangled form if it uses
-/// any unusual characters.
-static void appendMangledName(SmallVectorImpl<char> &OutName, StringRef Str,
- const MCAsmInfo *MAI) {
- // The first character is not allowed to be a number unless the target
- // explicitly allows it.
- if (!MAI->doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9') {
- MangleLetter(OutName, Str[0]);
- Str = Str.substr(1);
- }
-
- bool AllowPeriod = MAI->doesAllowPeriodsInName();
- bool AllowUTF8 = MAI->doesAllowUTF8();
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
- MangleLetter(OutName, Str[i]);
- else
- OutName.push_back(Str[i]);
- }
-}
-
-
-/// appendMangledQuotedName - On systems that support quoted symbols, we still
-/// have to escape some (obscure) characters like " and \n which would break the
-/// assembler's lexing.
-static void appendMangledQuotedName(SmallVectorImpl<char> &OutName,
- StringRef Str) {
- for (unsigned i = 0, e = Str.size(); i != e; ++i) {
- if (Str[i] == '"' || Str[i] == '\n')
- MangleLetter(OutName, Str[i]);
- else
- OutName.push_back(Str[i]);
- }
-}
-
-
/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
/// and the specified name as the global variable name. GVName must not be
/// empty.
@@ -111,7 +33,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
StringRef Name = GVName.toStringRef(TmpData);
assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
- const MCAsmInfo *MAI = Context.getAsmInfo();
+ const MCAsmInfo *MAI = TM->getMCAsmInfo();
// If the global name is not led with \1, add the appropriate prefixes.
if (Name[0] == '\1') {
@@ -136,26 +58,9 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
OutName.append(Prefix, Prefix+strlen(Prefix));
}
}
-
+
// If this is a simple string that doesn't need escaping, just append it.
- if (!NameNeedsEscaping(Name, MAI) ||
- // If quotes are supported, they can be used unless the string contains
- // a quote or newline.
- (MAI->doesAllowQuotesInName() &&
- Name.find_first_of("\n\"") == StringRef::npos)) {
- OutName.append(Name.begin(), Name.end());
- return;
- }
-
- // On systems that do not allow quoted names, we need to mangle most
- // strange characters.
- if (!MAI->doesAllowQuotesInName())
- return appendMangledName(OutName, Name, MAI);
-
- // Okay, the system allows quoted strings. We can quote most anything, the
- // only characters that need escaping are " and \n.
- assert(Name.find_first_of("\n\"") != StringRef::npos);
- return appendMangledQuotedName(OutName, Name);
+ OutName.append(Name.begin(), Name.end());
}
/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
@@ -212,7 +117,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
// If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
// add it.
- if (Context.getAsmInfo()->hasMicrosoftFastStdCallMangling()) {
+ if (TM->getMCAsmInfo()->hasMicrosoftFastStdCallMangling()) {
if (const Function *F = dyn_cast<Function>(GV)) {
CallingConv::ID CC = F->getCallingConv();
@@ -236,13 +141,3 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
}
}
}
-
-/// getSymbol - Return the MCSymbol for the specified global value. This
-/// symbol is the main label that is the address of the global.
-MCSymbol *Mangler::getSymbol(const GlobalValue *GV) {
- SmallString<60> NameStr;
- getNameWithPrefix(NameStr, GV, false);
- return Context.GetOrCreateSymbol(NameStr.str());
-}
-
-
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 3dd6562..cdae6c2 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -9,6 +9,7 @@
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "MipsRegisterInfo.h"
+#include "MipsTargetStreamer.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -24,23 +25,25 @@
using namespace llvm;
+namespace llvm {
+class MCInstrInfo;
+}
+
namespace {
class MipsAssemblerOptions {
public:
- MipsAssemblerOptions():
- aTReg(1), reorder(true), macro(true) {
- }
+ MipsAssemblerOptions() : aTReg(1), reorder(true), macro(true) {}
- unsigned getATRegNum() {return aTReg;}
+ unsigned getATRegNum() { return aTReg; }
bool setATReg(unsigned Reg);
- bool isReorder() {return reorder;}
- void setReorder() {reorder = true;}
- void setNoreorder() {reorder = false;}
+ bool isReorder() { return reorder; }
+ void setReorder() { reorder = true; }
+ void setNoreorder() { reorder = false; }
- bool isMacro() {return macro;}
- void setMacro() {macro = true;}
- void setNomacro() {macro = false;}
+ bool isMacro() { return macro; }
+ void setMacro() { macro = true; }
+ void setNomacro() { macro = false; }
private:
unsigned aTReg;
@@ -52,23 +55,21 @@ private:
namespace {
class MipsAsmParser : public MCTargetAsmParser {
- enum FpFormatTy {
- FP_FORMAT_NONE = -1,
- FP_FORMAT_S,
- FP_FORMAT_D,
- FP_FORMAT_L,
- FP_FORMAT_W
- } FpFormat;
+ MipsTargetStreamer &getTargetStreamer() {
+ MCTargetStreamer &TS = Parser.getStreamer().getTargetStreamer();
+ return static_cast<MipsTargetStreamer &>(TS);
+ }
MCSubtargetInfo &STI;
MCAsmParser &Parser;
MipsAssemblerOptions Options;
+ bool hasConsumedDollar;
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands,
MCStreamer &Out, unsigned &ErrorInfo,
bool MatchingInlineAsm);
@@ -76,56 +77,98 @@ class MipsAsmParser : public MCTargetAsmParser {
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands);
bool ParseDirective(AsmToken DirectiveID);
MipsAsmParser::OperandMatchResultTy
- parseRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- int RegKind);
+ parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
MipsAsmParser::OperandMatchResultTy
- parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
MipsAsmParser::OperandMatchResultTy
- parseGPR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ int RegKind);
MipsAsmParser::OperandMatchResultTy
- parseGPR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ bool parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ int RegKind);
MipsAsmParser::OperandMatchResultTy
- parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
MipsAsmParser::OperandMatchResultTy
- parseACRegsDSP(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseFGRH32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseMSA128BRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseMSA128HRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseMSA128WRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseMSA128DRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseMSA128CtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+ MipsAsmParser::OperandMatchResultTy
+ parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
unsigned RegKind);
- bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+ bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &,
StringRef Mnemonic);
int tryParseRegister(bool is64BitReg);
- bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
bool is64BitReg);
bool needsExpansion(MCInst &Inst);
@@ -139,17 +182,19 @@ class MipsAsmParser : public MCTargetAsmParser {
void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions);
void expandMemInst(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions,
- bool isLoad,bool isImmOpnd);
+ SmallVectorImpl<MCInst> &Instructions, bool isLoad,
+ bool isImmOpnd);
bool reportParseError(StringRef ErrorMsg);
bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
bool parseRelocOperand(const MCExpr *&Res);
- const MCExpr* evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
+ const MCExpr *evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
bool isEvaluated(const MCExpr *Expr);
bool parseDirectiveSet();
+ bool parseDirectiveMipsHackStocg();
+ bool parseDirectiveMipsHackELFFlags();
bool parseSetAtDirective();
bool parseSetNoAtDirective();
@@ -161,6 +206,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetAssignment();
bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveGpWord();
MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
@@ -172,40 +218,49 @@ class MipsAsmParser : public MCTargetAsmParser {
return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
}
+ bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
+
int matchRegisterName(StringRef Symbol, bool is64BitReg);
int matchCPURegisterName(StringRef Symbol);
int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
- int matchFPURegisterName(StringRef Name, FpFormatTy Format);
+ int matchFPURegisterName(StringRef Name);
- void setFpFormat(FpFormatTy Format) {
- FpFormat = Format;
- }
+ int matchFCCRegisterName(StringRef Name);
- void setDefaultFpFormat();
+ int matchACRegisterName(StringRef Name);
- void setFpFormat(StringRef Format);
+ int matchMSA128RegisterName(StringRef Name);
- FpFormatTy getFpFormat() {return FpFormat;}
+ int matchMSA128CtrlRegisterName(StringRef Name);
+
+ int regKindToRegClass(int RegKind);
unsigned getReg(int RC, int RegNo);
int getATReg();
bool processInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions);
+ SmallVectorImpl<MCInst> &Instructions);
+
+ // Helper function that checks if the value of a vector index is within the
+ // boundaries of accepted values for each RegisterKind.
+ // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
+ bool validateMSAIndex(int Val, int RegKind);
+
public:
- MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(sti), Parser(parser),
+ hasConsumedDollar(false) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
};
}
@@ -221,13 +276,21 @@ public:
Kind_GPR32,
Kind_GPR64,
Kind_HWRegs,
- Kind_HW64Regs,
Kind_FGR32Regs,
+ Kind_FGRH32Regs,
Kind_FGR64Regs,
Kind_AFGR64Regs,
Kind_CCRRegs,
Kind_FCCRegs,
- Kind_ACRegsDSP
+ Kind_ACC64DSP,
+ Kind_LO32DSP,
+ Kind_HI32DSP,
+ Kind_COP2,
+ Kind_MSA128BRegs,
+ Kind_MSA128HRegs,
+ Kind_MSA128WRegs,
+ Kind_MSA128DRegs,
+ Kind_MSA128CtrlRegs
};
private:
@@ -238,7 +301,9 @@ private:
k_Memory,
k_PostIndexRegister,
k_Register,
- k_Token
+ k_PtrReg,
+ k_Token,
+ k_LSAImm
} Kind;
MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -277,7 +342,12 @@ public:
Inst.addOperand(MCOperand::CreateReg(getReg()));
}
- void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+ void addPtrRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getPtrReg()));
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediate when possible. Null MCExpr = 0.
if (Expr == 0)
Inst.addOperand(MCOperand::CreateImm(0));
@@ -306,6 +376,9 @@ public:
bool isImm() const { return Kind == k_Immediate; }
bool isToken() const { return Kind == k_Token; }
bool isMem() const { return Kind == k_Memory; }
+ bool isPtrReg() const { return Kind == k_PtrReg; }
+ bool isInvNum() const { return Kind == k_Immediate; }
+ bool isLSAImm() const { return Kind == k_LSAImm; }
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
@@ -317,13 +390,18 @@ public:
return Reg.RegNum;
}
+ unsigned getPtrReg() const {
+ assert((Kind == k_PtrReg) && "Invalid access!");
+ return Reg.RegNum;
+ }
+
void setRegKind(RegisterKind RegKind) {
- assert((Kind == k_Register) && "Invalid access!");
+ assert((Kind == k_Register || Kind == k_PtrReg) && "Invalid access!");
Reg.Kind = RegKind;
}
const MCExpr *getImm() const {
- assert((Kind == k_Immediate) && "Invalid access!");
+ assert((Kind == k_Immediate || Kind == k_LSAImm) && "Invalid access!");
return Imm.Val;
}
@@ -354,6 +432,14 @@ public:
return Op;
}
+ static MipsOperand *CreatePtrReg(unsigned RegNum, SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_PtrReg);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
MipsOperand *Op = new MipsOperand(k_Immediate);
Op->Imm.Val = Val;
@@ -362,8 +448,16 @@ public:
return Op;
}
+ static MipsOperand *CreateLSAImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_LSAImm);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
- SMLoc S, SMLoc E) {
+ SMLoc S, SMLoc E) {
MipsOperand *Op = new MipsOperand(k_Memory);
Op->Mem.Base = Base;
Op->Mem.Off = Off;
@@ -388,17 +482,12 @@ public:
return Reg.Kind == Kind_HWRegs;
}
- bool isHW64RegsAsm() const {
- assert((Kind == k_Register) && "Invalid access!");
- return Reg.Kind == Kind_HW64Regs;
- }
-
bool isCCRAsm() const {
assert((Kind == k_Register) && "Invalid access!");
return Reg.Kind == Kind_CCRRegs;
}
- bool isAFGR64Asm() const {
+ bool isAFGR64Asm() const {
return Kind == k_Register && Reg.Kind == Kind_AFGR64Regs;
}
@@ -410,28 +499,58 @@ public:
return (Kind == k_Register) && Reg.Kind == Kind_FGR32Regs;
}
+ bool isFGRH32Asm() const {
+ return (Kind == k_Register) && Reg.Kind == Kind_FGRH32Regs;
+ }
+
bool isFCCRegsAsm() const {
return (Kind == k_Register) && Reg.Kind == Kind_FCCRegs;
}
- bool isACRegsDSPAsm() const {
- return Kind == k_Register && Reg.Kind == Kind_ACRegsDSP;
+ bool isACC64DSPAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_ACC64DSP;
}
- /// getStartLoc - Get the location of the first token of this operand.
- SMLoc getStartLoc() const {
- return StartLoc;
+ bool isLO32DSPAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_LO32DSP;
}
- /// getEndLoc - Get the location of the last token of this operand.
- SMLoc getEndLoc() const {
- return EndLoc;
+
+ bool isHI32DSPAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_HI32DSP;
+ }
+
+ bool isCOP2Asm() const { return Kind == k_Register && Reg.Kind == Kind_COP2; }
+
+ bool isMSA128BAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_MSA128BRegs;
+ }
+
+ bool isMSA128HAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_MSA128HRegs;
+ }
+
+ bool isMSA128WAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_MSA128WRegs;
+ }
+
+ bool isMSA128DAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_MSA128DRegs;
}
+ bool isMSA128CRAsm() const {
+ return Kind == k_Register && Reg.Kind == Kind_MSA128CtrlRegs;
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+
virtual void print(raw_ostream &OS) const {
llvm_unreachable("unimplemented!");
}
}; // class MipsOperand
-} // namespace
+} // namespace
namespace llvm {
extern const MCInstrDesc MipsInsts[];
@@ -462,8 +581,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
// reference or immediate we may have to expand instructions.
for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
const MCOperandInfo &OpInfo = MCID.OpInfo[i];
- if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY)
- || (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+ if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+ (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
MCOperand &Op = Inst.getOperand(i);
if (Op.isImm()) {
int MemOffset = Op.getImm();
@@ -476,7 +595,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
const MCExpr *Expr = Op.getExpr();
if (Expr->getKind() == MCExpr::SymbolRef) {
const MCSymbolRefExpr *SR =
- static_cast<const MCSymbolRefExpr*>(Expr);
+ static_cast<const MCSymbolRefExpr *>(Expr);
if (SR->getKind() == MCSymbolRefExpr::VK_None) {
// Expand symbol.
expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
@@ -489,7 +608,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
}
} // for
- } // if load/store
+ } // if load/store
if (needsExpansion(Inst))
expandInstruction(Inst, IDLoc, Instructions);
@@ -512,7 +631,7 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
}
void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+ SmallVectorImpl<MCInst> &Instructions) {
switch (Inst.getOpcode()) {
case Mips::LoadImm32Reg:
return expandLoadImm(Inst, IDLoc, Instructions);
@@ -567,8 +686,9 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
}
}
-void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+void
+MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
MCInst tmpInst;
const MCOperand &ImmOp = Inst.getOperand(2);
assert(ImmOp.isImm() && "expected immediate operand kind");
@@ -609,8 +729,9 @@ void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
}
}
-void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions) {
+void
+MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions) {
MCInst tmpInst;
const MCOperand &ImmOp = Inst.getOperand(1);
assert(ImmOp.isImm() && "expected immediate operand kind");
@@ -643,14 +764,15 @@ void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
}
void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
- SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) {
+ SmallVectorImpl<MCInst> &Instructions,
+ bool isLoad, bool isImmOpnd) {
const MCSymbolRefExpr *SR;
MCInst TempInst;
unsigned ImmOffset, HiOffset, LoOffset;
const MCExpr *ExprOffset;
unsigned TmpRegNum;
- unsigned AtRegNum = getReg((isMips64()) ? Mips::GPR64RegClassID
- : Mips::GPR32RegClassID, getATReg());
+ unsigned AtRegNum = getReg(
+ (isMips64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg());
// 1st operand is either the source or destination register.
assert(Inst.getOperand(0).isReg() && "expected register operand kind");
unsigned RegOpNum = Inst.getOperand(0).getReg();
@@ -680,7 +802,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
TempInst.addOperand(MCOperand::CreateImm(HiOffset));
else {
if (ExprOffset->getKind() == MCExpr::SymbolRef) {
- SR = static_cast<const MCSymbolRefExpr*>(ExprOffset);
+ SR = static_cast<const MCSymbolRefExpr *>(ExprOffset);
const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_HI,
getContext());
@@ -723,15 +845,14 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
TempInst.clear();
}
-bool MipsAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out, unsigned &ErrorInfo,
- bool MatchingInlineAsm) {
+bool MipsAsmParser::MatchAndEmitInstruction(
+ SMLoc IDLoc, unsigned &Opcode,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands, MCStreamer &Out,
+ unsigned &ErrorInfo, bool MatchingInlineAsm) {
MCInst Inst;
SmallVector<MCInst, 8> Instructions;
- unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
- MatchingInlineAsm);
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
default:
@@ -752,7 +873,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
- ErrorLoc = ((MipsOperand*) Operands[ErrorInfo])->getStartLoc();
+ ErrorLoc = ((MipsOperand *)Operands[ErrorInfo])->getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
}
@@ -766,44 +887,44 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
int MipsAsmParser::matchCPURegisterName(StringRef Name) {
- int CC;
+ int CC;
if (Name == "at")
return getATReg();
- CC = StringSwitch<unsigned>(Name)
- .Case("zero", 0)
- .Case("a0", 4)
- .Case("a1", 5)
- .Case("a2", 6)
- .Case("a3", 7)
- .Case("v0", 2)
- .Case("v1", 3)
- .Case("s0", 16)
- .Case("s1", 17)
- .Case("s2", 18)
- .Case("s3", 19)
- .Case("s4", 20)
- .Case("s5", 21)
- .Case("s6", 22)
- .Case("s7", 23)
- .Case("k0", 26)
- .Case("k1", 27)
- .Case("sp", 29)
- .Case("fp", 30)
- .Case("gp", 28)
- .Case("ra", 31)
- .Case("t0", 8)
- .Case("t1", 9)
- .Case("t2", 10)
- .Case("t3", 11)
- .Case("t4", 12)
- .Case("t5", 13)
- .Case("t6", 14)
- .Case("t7", 15)
- .Case("t8", 24)
- .Case("t9", 25)
- .Default(-1);
+ CC = StringSwitch<unsigned>(Name)
+ .Case("zero", 0)
+ .Case("a0", 4)
+ .Case("a1", 5)
+ .Case("a2", 6)
+ .Case("a3", 7)
+ .Case("v0", 2)
+ .Case("v1", 3)
+ .Case("s0", 16)
+ .Case("s1", 17)
+ .Case("s2", 18)
+ .Case("s3", 19)
+ .Case("s4", 20)
+ .Case("s5", 21)
+ .Case("s6", 22)
+ .Case("s7", 23)
+ .Case("k0", 26)
+ .Case("k1", 27)
+ .Case("sp", 29)
+ .Case("fp", 30)
+ .Case("gp", 28)
+ .Case("ra", 31)
+ .Case("t0", 8)
+ .Case("t1", 9)
+ .Case("t2", 10)
+ .Case("t3", 11)
+ .Case("t4", 12)
+ .Case("t5", 13)
+ .Case("t6", 14)
+ .Case("t7", 15)
+ .Case("t8", 24)
+ .Case("t9", 25)
+ .Default(-1);
// Although SGI documentation just cuts out t0-t3 for n32/n64,
// GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
@@ -813,72 +934,140 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
if (CC == -1 && isMips64())
CC = StringSwitch<unsigned>(Name)
- .Case("a4", 8)
- .Case("a5", 9)
- .Case("a6", 10)
- .Case("a7", 11)
- .Case("kt0", 26)
- .Case("kt1", 27)
- .Case("s8", 30)
- .Default(-1);
+ .Case("a4", 8)
+ .Case("a5", 9)
+ .Case("a6", 10)
+ .Case("a7", 11)
+ .Case("kt0", 26)
+ .Case("kt1", 27)
+ .Case("s8", 30)
+ .Default(-1);
return CC;
}
-int MipsAsmParser::matchFPURegisterName(StringRef Name, FpFormatTy Format) {
+int MipsAsmParser::matchFPURegisterName(StringRef Name) {
if (Name[0] == 'f') {
StringRef NumString = Name.substr(1);
unsigned IntVal;
if (NumString.getAsInteger(10, IntVal))
- return -1; // This is not an integer.
- if (IntVal > 31)
+ return -1; // This is not an integer.
+ if (IntVal > 31) // Maximum index for fpu register.
return -1;
+ return IntVal;
+ }
+ return -1;
+}
- if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
- return getReg(Mips::FGR32RegClassID, IntVal);
- if (Format == FP_FORMAT_D) {
- if (isFP64()) {
- return getReg(Mips::FGR64RegClassID, IntVal);
- }
- // Only even numbers available as register pairs.
- if ((IntVal > 31) || (IntVal % 2 != 0))
- return -1;
- return getReg(Mips::AFGR64RegClassID, IntVal / 2);
- }
+int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
+
+ if (Name.startswith("fcc")) {
+ StringRef NumString = Name.substr(3);
+ unsigned IntVal;
+ if (NumString.getAsInteger(10, IntVal))
+ return -1; // This is not an integer.
+ if (IntVal > 7) // There are only 8 fcc registers.
+ return -1;
+ return IntVal;
}
return -1;
}
-int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
+int MipsAsmParser::matchACRegisterName(StringRef Name) {
+
+ if (Name.startswith("ac")) {
+ StringRef NumString = Name.substr(2);
+ unsigned IntVal;
+ if (NumString.getAsInteger(10, IntVal))
+ return -1; // This is not an integer.
+ if (IntVal > 3) // There are only 4 accumulator registers (ac0-ac3).
+ return -1;
+ return IntVal;
+ }
+ return -1;
+}
- if (Name.equals("fcc0"))
- return Mips::FCC0;
+int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
+ unsigned IntVal;
+
+ if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal))
+ return -1;
+
+ if (IntVal > 31)
+ return -1;
+
+ return IntVal;
+}
+
+int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
+ int CC;
+
+ CC = StringSwitch<unsigned>(Name)
+ .Case("msair", 0)
+ .Case("msacsr", 1)
+ .Case("msaaccess", 2)
+ .Case("msasave", 3)
+ .Case("msamodify", 4)
+ .Case("msarequest", 5)
+ .Case("msamap", 6)
+ .Case("msaunmap", 7)
+ .Default(-1);
+
+ return CC;
+}
+
+int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
int CC;
CC = matchCPURegisterName(Name);
if (CC != -1)
return matchRegisterByNumber(CC, is64BitReg ? Mips::GPR64RegClassID
: Mips::GPR32RegClassID);
- return matchFPURegisterName(Name, getFpFormat());
-}
-
-void MipsAsmParser::setDefaultFpFormat() {
-
- if (isMips64() || isFP64())
- FpFormat = FP_FORMAT_D;
- else
- FpFormat = FP_FORMAT_S;
+ CC = matchFPURegisterName(Name);
+ // TODO: decide about fpu register class
+ if (CC != -1)
+ return matchRegisterByNumber(CC, isFP64() ? Mips::FGR64RegClassID
+ : Mips::FGR32RegClassID);
+ return matchMSA128RegisterName(Name);
}
-void MipsAsmParser::setFpFormat(StringRef Format) {
-
- FpFormat = StringSwitch<FpFormatTy>(Format.lower())
- .Case(".s", FP_FORMAT_S)
- .Case(".d", FP_FORMAT_D)
- .Case(".l", FP_FORMAT_L)
- .Case(".w", FP_FORMAT_W)
- .Default(FP_FORMAT_NONE);
+int MipsAsmParser::regKindToRegClass(int RegKind) {
+
+ switch (RegKind) {
+ case MipsOperand::Kind_GPR32:
+ return Mips::GPR32RegClassID;
+ case MipsOperand::Kind_GPR64:
+ return Mips::GPR64RegClassID;
+ case MipsOperand::Kind_HWRegs:
+ return Mips::HWRegsRegClassID;
+ case MipsOperand::Kind_FGR32Regs:
+ return Mips::FGR32RegClassID;
+ case MipsOperand::Kind_FGRH32Regs:
+ return Mips::FGRH32RegClassID;
+ case MipsOperand::Kind_FGR64Regs:
+ return Mips::FGR64RegClassID;
+ case MipsOperand::Kind_AFGR64Regs:
+ return Mips::AFGR64RegClassID;
+ case MipsOperand::Kind_CCRRegs:
+ return Mips::CCRRegClassID;
+ case MipsOperand::Kind_ACC64DSP:
+ return Mips::ACC64DSPRegClassID;
+ case MipsOperand::Kind_FCCRegs:
+ return Mips::FCCRegClassID;
+ case MipsOperand::Kind_MSA128BRegs:
+ return Mips::MSA128BRegClassID;
+ case MipsOperand::Kind_MSA128HRegs:
+ return Mips::MSA128HRegClassID;
+ case MipsOperand::Kind_MSA128WRegs:
+ return Mips::MSA128WRegClassID;
+ case MipsOperand::Kind_MSA128DRegs:
+ return Mips::MSA128DRegClassID;
+ case MipsOperand::Kind_MSA128CtrlRegs:
+ return Mips::MSACtrlRegClassID;
+ default:
+ return -1;
+ }
}
bool MipsAssemblerOptions::setATReg(unsigned Reg) {
@@ -889,17 +1078,15 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) {
return true;
}
-int MipsAsmParser::getATReg() {
- return Options.getATRegNum();
-}
+int MipsAsmParser::getATReg() { return Options.getATRegNum(); }
unsigned MipsAsmParser::getReg(int RC, int RegNo) {
return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
}
int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
-
- if (RegNum > 31)
+ if (RegNum >
+ getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs())
return -1;
return getReg(RegClass, RegNum);
@@ -914,12 +1101,13 @@ int MipsAsmParser::tryParseRegister(bool is64BitReg) {
RegNum = matchRegisterName(lowerCase, is64BitReg);
} else if (Tok.is(AsmToken::Integer))
RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
- is64BitReg ? Mips::GPR64RegClassID : Mips::GPR32RegClassID);
+ is64BitReg ? Mips::GPR64RegClassID
+ : Mips::GPR32RegClassID);
return RegNum;
}
bool MipsAsmParser::tryParseRegisterOperand(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands, bool is64BitReg) {
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands, bool is64BitReg) {
SMLoc S = Parser.getTok().getLoc();
int RegNo = -1;
@@ -928,14 +1116,15 @@ bool MipsAsmParser::tryParseRegisterOperand(
if (RegNo == -1)
return true;
- Operands.push_back(MipsOperand::CreateReg(RegNo, S,
- Parser.getTok().getLoc()));
+ Operands.push_back(
+ MipsOperand::CreateReg(RegNo, S, Parser.getTok().getLoc()));
Parser.Lex(); // Eat register token.
return false;
}
-bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
- StringRef Mnemonic) {
+bool
+MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ StringRef Mnemonic) {
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -983,22 +1172,39 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
return true;
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-
MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
-
// Otherwise create a symbol reference.
- const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
- getContext());
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
Operands.push_back(MipsOperand::CreateImm(Res, S, E));
return false;
}
case AsmToken::Identifier:
+ // For instruction aliases like "bc1f $Label", the dedicated parser will
+ // eat the '$' sign before failing. So in order to look for the appropriate
+ // label, we must first check whether we have already consumed the '$'.
+ if (hasConsumedDollar) {
+ hasConsumedDollar = false;
+ SMLoc S = Parser.getTok().getLoc();
+ StringRef Identifier;
+ if (Parser.parseIdentifier(Identifier))
+ return true;
+ SMLoc E =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+ // Create a symbol reference.
+ const MCExpr *Res =
+ MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+ Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+ return false;
+ }
// Look for the existing symbol; we should check if
// we need to assign the proper RegisterKind.
if (searchSymbolAlias(Operands, MipsOperand::Kind_None))
return false;
- // Else drop to expression parsing.
+ // Else drop to expression parsing.
case AsmToken::LParen:
case AsmToken::Minus:
case AsmToken::Plus:
@@ -1029,7 +1235,7 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
return true;
}
-const MCExpr* MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
+const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
StringRef RelocStr) {
const MCExpr *Res;
// Check the type of the expression.
@@ -1099,7 +1305,7 @@ bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
}
bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
- Parser.Lex(); // Eat the % token.
+ Parser.Lex(); // Eat the % token.
const AsmToken &Tok = Parser.getTok(); // Get next token, operation.
if (Tok.isNot(AsmToken::Identifier))
return true;
@@ -1145,7 +1351,7 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
StartLoc = Parser.getTok().getLoc();
RegNo = tryParseRegister(isMips64());
EndLoc = Parser.getTok().getLoc();
- return (RegNo == (unsigned) -1);
+ return (RegNo == (unsigned)-1);
}
bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
@@ -1177,11 +1383,12 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
}
MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
const MCExpr *IdVal = 0;
SMLoc S;
bool isParenExpr = false;
+ MipsAsmParser::OperandMatchResultTy Res = MatchOperand_NoMatch;
// First operand is the offset.
S = Parser.getTok().getLoc();
@@ -1196,21 +1403,20 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
const AsmToken &Tok = Parser.getTok(); // Get the next token.
if (Tok.isNot(AsmToken::LParen)) {
- MipsOperand *Mnemonic = static_cast<MipsOperand*>(Operands[0]);
+ MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
if (Mnemonic->getToken() == "la") {
- SMLoc E = SMLoc::getFromPointer(
- Parser.getTok().getLoc().getPointer() - 1);
+ SMLoc E =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
return MatchOperand_Success;
}
if (Tok.is(AsmToken::EndOfStatement)) {
- SMLoc E = SMLoc::getFromPointer(
- Parser.getTok().getLoc().getPointer() - 1);
+ SMLoc E =
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
// Zero register assumed, add a memory operand with ZERO as its base.
- Operands.push_back(MipsOperand::CreateMem(isMips64() ? Mips::ZERO_64
- : Mips::ZERO,
- IdVal, S, E));
+ Operands.push_back(MipsOperand::CreateMem(
+ isMips64() ? Mips::ZERO_64 : Mips::ZERO, IdVal, S, E));
return MatchOperand_Success;
}
Error(Parser.getTok().getLoc(), "'(' expected");
@@ -1220,21 +1426,12 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
Parser.Lex(); // Eat the '(' token.
}
- const AsmToken &Tok1 = Parser.getTok(); // Get next token
- if (Tok1.is(AsmToken::Dollar)) {
- Parser.Lex(); // Eat the '$' token.
- if (tryParseRegisterOperand(Operands, isMips64())) {
- Error(Parser.getTok().getLoc(), "unexpected token in operand");
- return MatchOperand_ParseFail;
- }
-
- } else {
- Error(Parser.getTok().getLoc(), "unexpected token in operand");
- return MatchOperand_ParseFail;
- }
+ Res = parseRegs(Operands, isMips64() ? (int)MipsOperand::Kind_GPR64
+ : (int)MipsOperand::Kind_GPR32);
+ if (Res != MatchOperand_Success)
+ return Res;
- const AsmToken &Tok2 = Parser.getTok(); // Get next token.
- if (Tok2.isNot(AsmToken::RParen)) {
+ if (Parser.getTok().isNot(AsmToken::RParen)) {
Error(Parser.getTok().getLoc(), "')' expected");
return MatchOperand_ParseFail;
}
@@ -1247,7 +1444,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
IdVal = MCConstantExpr::Create(0, getContext());
// Replace the register operand with the memory operand.
- MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+ MipsOperand *op = static_cast<MipsOperand *>(Operands.back());
int RegNo = op->getReg();
// Remove the register from the operands.
Operands.pop_back();
@@ -1266,31 +1463,150 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
return MatchOperand_Success;
}
+bool MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ int RegKind) {
+ // If the first token is not '$' we have an error.
+ if (Parser.getTok().isNot(AsmToken::Dollar))
+ return false;
+
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex();
+ AsmToken::TokenKind TkKind = getLexer().getKind();
+ int Reg;
+
+ if (TkKind == AsmToken::Integer) {
+ Reg = matchRegisterByNumber(Parser.getTok().getIntVal(),
+ regKindToRegClass(RegKind));
+ if (Reg == -1)
+ return false;
+ } else if (TkKind == AsmToken::Identifier) {
+ if ((Reg = matchCPURegisterName(Parser.getTok().getString().lower())) == -1)
+ return false;
+ Reg = getReg(regKindToRegClass(RegKind), Reg);
+ } else {
+ return false;
+ }
+
+ MipsOperand *Op = MipsOperand::CreatePtrReg(Reg, S, Parser.getTok().getLoc());
+ Op->setRegKind((MipsOperand::RegisterKind)RegKind);
+ Operands.push_back(Op);
+ Parser.Lex();
+ return true;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ MipsOperand::RegisterKind RegKind =
+ isN64() ? MipsOperand::Kind_GPR64 : MipsOperand::Kind_GPR32;
+
+ // Parse index register.
+ if (!parsePtrReg(Operands, RegKind))
+ return MatchOperand_NoMatch;
+
+ // Parse '('.
+ if (Parser.getTok().isNot(AsmToken::LParen))
+ return MatchOperand_NoMatch;
+
+ Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
+ Parser.Lex();
+
+ // Parse base register.
+ if (!parsePtrReg(Operands, RegKind))
+ return MatchOperand_NoMatch;
+
+ // Parse ')'.
+ if (Parser.getTok().isNot(AsmToken::RParen))
+ return MatchOperand_NoMatch;
+
+ Operands.push_back(MipsOperand::CreateToken(")", getLexer().getLoc()));
+ Parser.Lex();
+
+ return MatchOperand_Success;
+}
+
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
int RegKind) {
MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
- if (getLexer().getKind() == AsmToken::Identifier) {
+ if (getLexer().getKind() == AsmToken::Identifier && !hasConsumedDollar) {
if (searchSymbolAlias(Operands, Kind))
return MatchOperand_Success;
return MatchOperand_NoMatch;
}
+ SMLoc S = Parser.getTok().getLoc();
// If the first token is not '$', we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
+ if (Parser.getTok().isNot(AsmToken::Dollar) && !hasConsumedDollar)
return MatchOperand_NoMatch;
-
- Parser.Lex(); // Eat $
- if (!tryParseRegisterOperand(Operands,
- RegKind == MipsOperand::Kind_GPR64)) {
- // Set the proper register kind.
- MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
- op->setRegKind(Kind);
- if ((Kind == MipsOperand::Kind_GPR32)
- && (getLexer().is(AsmToken::LParen))) {
+ if (!hasConsumedDollar) {
+ Parser.Lex(); // Eat the '$'
+ hasConsumedDollar = true;
+ }
+ if (getLexer().getKind() == AsmToken::Identifier) {
+ int RegNum = -1;
+ std::string RegName = Parser.getTok().getString().lower();
+ // Match register by name
+ switch (RegKind) {
+ case MipsOperand::Kind_GPR32:
+ case MipsOperand::Kind_GPR64:
+ RegNum = matchCPURegisterName(RegName);
+ break;
+ case MipsOperand::Kind_AFGR64Regs:
+ case MipsOperand::Kind_FGR64Regs:
+ case MipsOperand::Kind_FGR32Regs:
+ case MipsOperand::Kind_FGRH32Regs:
+ RegNum = matchFPURegisterName(RegName);
+ if (RegKind == MipsOperand::Kind_AFGR64Regs)
+ RegNum /= 2;
+ else if (RegKind == MipsOperand::Kind_FGRH32Regs && !isFP64())
+ if (RegNum != -1 && RegNum % 2 != 0)
+ Warning(S, "Float register should be even.");
+ break;
+ case MipsOperand::Kind_FCCRegs:
+ RegNum = matchFCCRegisterName(RegName);
+ break;
+ case MipsOperand::Kind_ACC64DSP:
+ RegNum = matchACRegisterName(RegName);
+ break;
+ default:
+ break; // No match, value is set to -1.
+ }
+ // No match found; return _NoMatch so another round of matching can try.
+ if (RegNum < 0)
+ return MatchOperand_NoMatch;
+
+ int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+ if (RegVal == -1)
+ return MatchOperand_NoMatch;
+
+ MipsOperand *Op =
+ MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+ Op->setRegKind(Kind);
+ Operands.push_back(Op);
+ hasConsumedDollar = false;
+ Parser.Lex(); // Eat the register name.
+ return MatchOperand_Success;
+ } else if (getLexer().getKind() == AsmToken::Integer) {
+ unsigned RegNum = Parser.getTok().getIntVal();
+ if (Kind == MipsOperand::Kind_HWRegs) {
+ if (RegNum != 29)
+ return MatchOperand_NoMatch;
+ // Only hwreg 29 is supported, found at index 0.
+ RegNum = 0;
+ }
+ int Reg = matchRegisterByNumber(RegNum, regKindToRegClass(Kind));
+ if (Reg == -1)
+ return MatchOperand_NoMatch;
+ MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+ Op->setRegKind(Kind);
+ Operands.push_back(Op);
+ hasConsumedDollar = false;
+ Parser.Lex(); // Eat the register number.
+ if ((RegKind == MipsOperand::Kind_GPR32) &&
+ (getLexer().is(AsmToken::LParen))) {
// Check if it is indexed addressing operand.
Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
Parser.Lex(); // Eat the parenthesis.
- if (parseRegs(Operands,RegKind) != MatchOperand_Success)
+ if (parseRegs(Operands, RegKind) != MatchOperand_Success)
return MatchOperand_NoMatch;
if (getLexer().isNot(AsmToken::RParen))
return MatchOperand_NoMatch;
@@ -1302,49 +1618,254 @@ MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return MatchOperand_NoMatch;
}
+bool MipsAsmParser::validateMSAIndex(int Val, int RegKind) {
+ MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+
+ if (Val < 0)
+ return false;
+
+ switch (Kind) {
+ default:
+ return false;
+ case MipsOperand::Kind_MSA128BRegs:
+ return Val < 16;
+ case MipsOperand::Kind_MSA128HRegs:
+ return Val < 8;
+ case MipsOperand::Kind_MSA128WRegs:
+ return Val < 4;
+ case MipsOperand::Kind_MSA128DRegs:
+ return Val < 2;
+ }
+}
+
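// A hedged illustration (hypothetical helper, not part of this change):
// validateMSAIndex above encodes the fact that a 128-bit MSA register holds
// 128/EltBits elements, so an index is valid iff 0 <= Val < 128/EltBits
// (16 lanes for .b, 8 for .h, 4 for .w, 2 for .d).
constexpr bool isValidMSALane(int Val, unsigned EltBits) {
  // EltBits is assumed to be 8, 16, 32 or 64.
  return Val >= 0 && static_cast<unsigned>(Val) < 128 / EltBits;
}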
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseGPR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ int RegKind) {
+ MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+ SMLoc S = Parser.getTok().getLoc();
+ std::string RegName;
- if (!isMips64())
+ if (Parser.getTok().isNot(AsmToken::Dollar))
+ return MatchOperand_NoMatch;
+
+ switch (RegKind) {
+ default:
+ return MatchOperand_ParseFail;
+ case MipsOperand::Kind_MSA128BRegs:
+ case MipsOperand::Kind_MSA128HRegs:
+ case MipsOperand::Kind_MSA128WRegs:
+ case MipsOperand::Kind_MSA128DRegs:
+ break;
+ }
+
+ Parser.Lex(); // Eat the '$'.
+ if (getLexer().getKind() == AsmToken::Identifier)
+ RegName = Parser.getTok().getString().lower();
+ else
+ return MatchOperand_ParseFail;
+
+ int RegNum = matchMSA128RegisterName(RegName);
+
+ if (RegNum < 0 || RegNum > 31)
+ return MatchOperand_ParseFail;
+
+ int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+ if (RegVal == -1)
+ return MatchOperand_ParseFail;
+
+ MipsOperand *Op = MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+ Op->setRegKind(Kind);
+ Operands.push_back(Op);
+
+ Parser.Lex(); // Eat the register identifier.
+
+ // MSA registers may be suffixed with an index in the form of:
+ // 1) Immediate expression.
+ // 2) General Purpose Register.
+ // Examples:
+ // 1) copy_s.b $29,$w0[0]
+ // 2) sld.b $w0,$w1[$1]
+
+ if (Parser.getTok().isNot(AsmToken::LBrac))
+ return MatchOperand_Success;
+
+ MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
+
+ Operands.push_back(MipsOperand::CreateToken("[", Parser.getTok().getLoc()));
+ Parser.Lex(); // Parse the '[' token.
+
+ if (Parser.getTok().is(AsmToken::Dollar)) {
+ // This must be a GPR.
+ MipsOperand *RegOp;
+ SMLoc VIdx = Parser.getTok().getLoc();
+ Parser.Lex(); // Parse the '$' token.
+
+ // GPRs have aliases and we must account for that. Example: $30 == $fp
+ if (getLexer().getKind() == AsmToken::Integer) {
+ unsigned RegNum = Parser.getTok().getIntVal();
+ int Reg = matchRegisterByNumber(
+ RegNum, regKindToRegClass(MipsOperand::Kind_GPR32));
+ if (Reg == -1) {
+ Error(VIdx, "invalid general purpose register");
+ return MatchOperand_ParseFail;
+ }
+
+ RegOp = MipsOperand::CreateReg(Reg, VIdx, Parser.getTok().getLoc());
+ } else if (getLexer().getKind() == AsmToken::Identifier) {
+ int RegNum = -1;
+ std::string RegName = Parser.getTok().getString().lower();
+
+ RegNum = matchCPURegisterName(RegName);
+ if (RegNum == -1) {
+ Error(VIdx, "general purpose register expected");
+ return MatchOperand_ParseFail;
+ }
+ RegNum = getReg(regKindToRegClass(MipsOperand::Kind_GPR32), RegNum);
+ RegOp = MipsOperand::CreateReg(RegNum, VIdx, Parser.getTok().getLoc());
+ } else
+ return MatchOperand_ParseFail;
+
+ RegOp->setRegKind(MipsOperand::Kind_GPR32);
+ Operands.push_back(RegOp);
+ Parser.Lex(); // Eat the register identifier.
+
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+
+ Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
+ Parser.Lex(); // Parse the ']' token.
+
+ return MatchOperand_Success;
+ }
+
+ // The index must be a constant expression then.
+ SMLoc VIdx = Parser.getTok().getLoc();
+ const MCExpr *ImmVal;
+
+ if (getParser().parseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+
+ const MCConstantExpr *expr = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!expr || !validateMSAIndex((int)expr->getValue(), Kind)) {
+ Error(VIdx, "invalid immediate value");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = Parser.getTok().getEndLoc();
+
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return MatchOperand_ParseFail;
+
+ bool insve =
+ Mnemonic->getToken() == "insve.b" || Mnemonic->getToken() == "insve.h" ||
+ Mnemonic->getToken() == "insve.w" || Mnemonic->getToken() == "insve.d";
+
+ // The second vector index of insve instructions is always 0.
+ if (insve && Operands.size() > 6) {
+ if (expr->getValue() != 0) {
+ Error(VIdx, "immediate value must be 0");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(MipsOperand::CreateToken("0", VIdx));
+ } else
+ Operands.push_back(MipsOperand::CreateImm(expr, VIdx, E));
+
+ Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
+
+ Parser.Lex(); // Parse the ']' token.
+
+ return MatchOperand_Success;
+}
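The insve special case above compares the mnemonic token four times; an equivalent standalone check, shown only to illustrate the intent (helper name is assumed):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

// True for insve.{b,h,w,d}, whose second vector index must always be 0.
static bool isInsveMnemonic(llvm::StringRef Mnemonic) {
  return llvm::StringSwitch<bool>(Mnemonic)
      .Cases("insve.b", "insve.h", "insve.w", "insve.d", true)
      .Default(false);
}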
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+ int RegKind) {
+ MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+
+ if (Kind != MipsOperand::Kind_MSA128CtrlRegs)
return MatchOperand_NoMatch;
- return parseRegs(Operands, (int) MipsOperand::Kind_GPR64);
+
+ if (Parser.getTok().isNot(AsmToken::Dollar))
+ return MatchOperand_ParseFail;
+
+ SMLoc S = Parser.getTok().getLoc();
+
+ Parser.Lex(); // Eat the '$' symbol.
+
+ int RegNum = -1;
+ if (getLexer().getKind() == AsmToken::Identifier)
+ RegNum = matchMSA128CtrlRegisterName(Parser.getTok().getString().lower());
+ else if (getLexer().getKind() == AsmToken::Integer)
+ RegNum = Parser.getTok().getIntVal();
+ else
+ return MatchOperand_ParseFail;
+
+ if (RegNum < 0 || RegNum > 7)
+ return MatchOperand_ParseFail;
+
+ int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+ if (RegVal == -1)
+ return MatchOperand_ParseFail;
+
+ MipsOperand *RegOp =
+ MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+ RegOp->setRegKind(MipsOperand::Kind_MSA128CtrlRegs);
+ Operands.push_back(RegOp);
+ Parser.Lex(); // Eat the register identifier.
+
+ return MatchOperand_Success;
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseGPR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- return parseRegs(Operands, (int) MipsOperand::Kind_GPR32);
+MipsAsmParser::parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+
+ if (!isMips64())
+ return MatchOperand_NoMatch;
+ return parseRegs(Operands, (int)MipsOperand::Kind_GPR64);
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_GPR32);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseAFGR64Regs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
if (isFP64())
return MatchOperand_NoMatch;
- // Double operand is expected, set appropriate format
- setFpFormat(FP_FORMAT_D);
-
- return parseRegs(Operands, (int) MipsOperand::Kind_AFGR64Regs);
+ return parseRegs(Operands, (int)MipsOperand::Kind_AFGR64Regs);
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
if (!isFP64())
return MatchOperand_NoMatch;
- // Double operand is expected, set appropriate format
- setFpFormat(FP_FORMAT_D);
+ return parseRegs(Operands, (int)MipsOperand::Kind_FGR64Regs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_FGR32Regs);
+}
- return parseRegs(Operands, (int) MipsOperand::Kind_FGR64Regs);
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseFGRH32Regs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_FGRH32Regs);
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // Single operand is expected, set appropriate format
- setFpFormat(FP_FORMAT_S);
- return parseRegs(Operands, (int) MipsOperand::Kind_FGR32Regs);
+MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_FCCRegs);
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_ACC64DSP);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
// If the first token is not '$' we have an error.
if (Parser.getTok().isNot(AsmToken::Dollar))
return MatchOperand_NoMatch;
@@ -1357,19 +1878,19 @@ MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- if (!Tok.getIdentifier().startswith("fcc"))
+ if (!Tok.getIdentifier().startswith("ac"))
return MatchOperand_NoMatch;
- StringRef NumString = Tok.getIdentifier().substr(3);
+ StringRef NumString = Tok.getIdentifier().substr(2);
unsigned IntVal;
if (NumString.getAsInteger(10, IntVal))
return MatchOperand_NoMatch;
- unsigned Reg = matchRegisterByNumber(IntVal, Mips::FCCRegClassID);
+ unsigned Reg = matchRegisterByNumber(IntVal, Mips::LO32DSPRegClassID);
MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(MipsOperand::Kind_FCCRegs);
+ Op->setRegKind(MipsOperand::Kind_LO32DSP);
Operands.push_back(Op);
Parser.Lex(); // Eat the register number.
@@ -1377,7 +1898,7 @@ MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseACRegsDSP(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
// If the first token is not '$' we have an error.
if (Parser.getTok().isNot(AsmToken::Dollar))
return MatchOperand_NoMatch;
@@ -1390,27 +1911,78 @@ MipsAsmParser::parseACRegsDSP(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (Tok.isNot(AsmToken::Identifier))
return MatchOperand_NoMatch;
- if (!Tok.getIdentifier().startswith("acc"))
+ if (!Tok.getIdentifier().startswith("ac"))
return MatchOperand_NoMatch;
- StringRef NumString = Tok.getIdentifier().substr(3);
+ StringRef NumString = Tok.getIdentifier().substr(2);
unsigned IntVal;
if (NumString.getAsInteger(10, IntVal))
return MatchOperand_NoMatch;
- unsigned Reg = matchRegisterByNumber(IntVal, Mips::ACRegsDSPRegClassID);
+ unsigned Reg = matchRegisterByNumber(IntVal, Mips::HI32DSPRegClassID);
MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(MipsOperand::Kind_ACRegsDSP);
+ Op->setRegKind(MipsOperand::Kind_HI32DSP);
Operands.push_back(Op);
Parser.Lex(); // Eat the register number.
return MatchOperand_Success;
}
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ // If the first token is not '$' we have an error.
+ if (Parser.getTok().isNot(AsmToken::Dollar))
+ return MatchOperand_NoMatch;
+
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the '$'
+
+ const AsmToken &Tok = Parser.getTok(); // Get next token.
+
+ if (Tok.isNot(AsmToken::Integer))
+ return MatchOperand_NoMatch;
+
+ unsigned IntVal = Tok.getIntVal();
+
+ unsigned Reg = matchRegisterByNumber(IntVal, Mips::COP2RegClassID);
+
+ MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+ Op->setRegKind(MipsOperand::Kind_COP2);
+ Operands.push_back(Op);
+
+ Parser.Lex(); // Eat the register number.
+ return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128BRegs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128BRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128HRegs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128HRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128WRegs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128WRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128DRegs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128DRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128CtrlRegs(
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseMSACtrlRegs(Operands, (int)MipsOperand::Kind_MSA128CtrlRegs);
+}
+
bool MipsAsmParser::searchSymbolAlias(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands, unsigned RegKind) {
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands, unsigned RegKind) {
MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
if (Sym) {
@@ -1421,38 +1993,39 @@ bool MipsAsmParser::searchSymbolAlias(
else
return false;
if (Expr->getKind() == MCExpr::SymbolRef) {
- MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind) RegKind;
- const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
const StringRef DefSymbol = Ref->getSymbol().getName();
if (DefSymbol.startswith("$")) {
int RegNum = -1;
APInt IntVal(32, -1);
if (!DefSymbol.substr(1).getAsInteger(10, IntVal))
RegNum = matchRegisterByNumber(IntVal.getZExtValue(),
- isMips64()
- ? Mips::GPR64RegClassID
- : Mips::GPR32RegClassID);
+ isMips64() ? Mips::GPR64RegClassID
+ : Mips::GPR32RegClassID);
else {
// Lookup for the register with the corresponding name.
switch (Kind) {
case MipsOperand::Kind_AFGR64Regs:
case MipsOperand::Kind_FGR64Regs:
- RegNum = matchFPURegisterName(DefSymbol.substr(1), FP_FORMAT_D);
+ RegNum = matchFPURegisterName(DefSymbol.substr(1));
break;
case MipsOperand::Kind_FGR32Regs:
- RegNum = matchFPURegisterName(DefSymbol.substr(1), FP_FORMAT_S);
+ RegNum = matchFPURegisterName(DefSymbol.substr(1));
break;
case MipsOperand::Kind_GPR64:
case MipsOperand::Kind_GPR32:
default:
- RegNum = matchRegisterName(DefSymbol.substr(1), isMips64());
+ RegNum = matchCPURegisterName(DefSymbol.substr(1));
break;
}
+ if (RegNum > -1)
+ RegNum = getReg(regKindToRegClass(Kind), RegNum);
}
if (RegNum > -1) {
Parser.Lex();
- MipsOperand *op = MipsOperand::CreateReg(RegNum, S,
- Parser.getTok().getLoc());
+ MipsOperand *op =
+ MipsOperand::CreateReg(RegNum, S, Parser.getTok().getLoc());
op->setRegKind(Kind);
Operands.push_back(op);
return true;
@@ -1460,9 +2033,9 @@ bool MipsAsmParser::searchSymbolAlias(
}
} else if (Expr->getKind() == MCExpr::Constant) {
Parser.Lex();
- const MCConstantExpr *Const = static_cast<const MCConstantExpr*>(Expr);
- MipsOperand *op = MipsOperand::CreateImm(Const, S,
- Parser.getTok().getLoc());
+ const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
+ MipsOperand *op =
+ MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc());
Operands.push_back(op);
return true;
}
@@ -1471,115 +2044,101 @@ bool MipsAsmParser::searchSymbolAlias(
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-
- // If the first token is not '$' we have error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
- return MatchOperand_NoMatch;
- SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat the '$'.
-
- const AsmToken &Tok = Parser.getTok(); // Get the next token.
- if (Tok.isNot(AsmToken::Integer))
- return MatchOperand_NoMatch;
-
- unsigned RegNum = Tok.getIntVal();
- // At the moment only hwreg29 is supported.
- if (RegNum != 29)
- return MatchOperand_ParseFail;
-
- MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29, S,
- Parser.getTok().getLoc());
- op->setRegKind(MipsOperand::Kind_HWRegs);
- Operands.push_back(op);
-
- Parser.Lex(); // Eat the register number.
- return MatchOperand_Success;
+MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_HWRegs);
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHW64Regs(
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ return parseRegs(Operands, (int)MipsOperand::Kind_CCRRegs);
+}
- if (!isMips64())
- return MatchOperand_NoMatch;
- // If the first token is not '$' we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ const MCExpr *IdVal;
+  // If the first token is '$' we may have a register operand.
+ if (Parser.getTok().is(AsmToken::Dollar))
return MatchOperand_NoMatch;
SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat $
-
- const AsmToken &Tok = Parser.getTok(); // Get the next token.
- if (Tok.isNot(AsmToken::Integer))
- return MatchOperand_NoMatch;
-
- unsigned RegNum = Tok.getIntVal();
- // At the moment only hwreg29 is supported.
- if (RegNum != 29)
+ if (getParser().parseExpression(IdVal))
return MatchOperand_ParseFail;
-
- MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29_64, S,
- Parser.getTok().getLoc());
- op->setRegKind(MipsOperand::Kind_HW64Regs);
- Operands.push_back(op);
-
- Parser.Lex(); // Eat the register number.
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
+ assert(MCE && "Unexpected MCExpr type.");
+ int64_t Val = MCE->getValue();
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(MipsOperand::CreateImm(
+ MCConstantExpr::Create(0 - Val, getContext()), S, E));
return MatchOperand_Success;
}
MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // If the first token is not '$' we have an error.
- if (Parser.getTok().isNot(AsmToken::Dollar))
+MipsAsmParser::parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+ switch (getLexer().getKind()) {
+ default:
return MatchOperand_NoMatch;
+ case AsmToken::LParen:
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ break;
+ }
+ const MCExpr *Expr;
SMLoc S = Parser.getTok().getLoc();
- Parser.Lex(); // Eat the '$'
-
- const AsmToken &Tok = Parser.getTok(); // Get next token.
- if (Tok.isNot(AsmToken::Integer))
- return MatchOperand_NoMatch;
+ if (getParser().parseExpression(Expr))
+ return MatchOperand_ParseFail;
- unsigned Reg = matchRegisterByNumber(Tok.getIntVal(), Mips::CCRRegClassID);
+ int64_t Val;
+ if (!Expr->EvaluateAsAbsolute(Val)) {
+ Error(S, "expected immediate value");
+ return MatchOperand_ParseFail;
+ }
- MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
- Op->setRegKind(MipsOperand::Kind_CCRRegs);
- Operands.push_back(Op);
+  // The LSA instruction encodes a 2-bit unsigned immediate, and the CPU
+  // always adds one to that field, so the range allowed in assembly is
+  // 1..4. We only check the range here and deal with the
+  // addition/subtraction when actually decoding/encoding the instruction.
+ if (Val < 1 || Val > 4) {
+ Error(S, "immediate not in range (1..4)");
+ return MatchOperand_ParseFail;
+ }
- Parser.Lex(); // Eat the register number.
+ Operands.push_back(MipsOperand::CreateLSAImm(Expr, S,
+ Parser.getTok().getLoc()));
return MatchOperand_Success;
}
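The off-by-one convention described above is the inverse of what DecodeLSAImm does on the disassembler side; a hypothetical round-trip sketch, not part of the patch:

#include <cassert>
#include <cstdint>

// LSA accepts shift amounts 1..4 in assembly but stores imm - 1 in its
// 2-bit field; decoding adds the one back again.
static uint32_t encodeLsaImm(int64_t Val) {
  assert(Val >= 1 && Val <= 4 && "immediate not in range (1..4)");
  return static_cast<uint32_t>(Val - 1); // field holds 0..3
}

static int64_t decodeLsaImm(uint32_t Field) {
  return static_cast<int64_t>(Field & 0x3) + 1;
}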
MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
- MCSymbolRefExpr::VariantKind VK
- = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
- .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
- .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
- .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
- .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
- .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
- .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
- .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
- .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
- .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
- .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
- .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
- .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
- .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
- .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
- .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
- .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
- .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
- .Default(MCSymbolRefExpr::VK_None);
+ MCSymbolRefExpr::VariantKind VK =
+ StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+ .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
+ .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
+ .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
+ .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
+ .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
+ .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
+ .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
+ .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+ .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+ .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
+ .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
+ .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
+ .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
+ .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+ .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
+ .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+ .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+ .Default(MCSymbolRefExpr::VK_None);
return VK;
}
-bool MipsAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+bool MipsAsmParser::ParseInstruction(
+ ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
  // Check if we have a valid mnemonic
if (!mnemonicIsValid(Name, 0)) {
Parser.eatToEndOfStatement();
@@ -1757,12 +2316,13 @@ bool MipsAsmParser::parseSetAssignment() {
// We have a '$' followed by something, make sure they are adjacent.
if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
return true;
- StringRef Res = StringRef(DollarLoc.getPointer(),
- getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
+ StringRef Res =
+ StringRef(DollarLoc.getPointer(),
+ getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
Symbol = getContext().GetOrCreateSymbol(Res);
Parser.Lex();
- Value = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
- getContext());
+ Value =
+ MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, getContext());
} else if (Parser.parseExpression(Value))
return reportParseError("expected valid expression after comma");
@@ -1810,6 +2370,34 @@ bool MipsAsmParser::parseDirectiveSet() {
return true;
}
+bool MipsAsmParser::parseDirectiveMipsHackStocg() {
+ MCAsmParser &Parser = getParser();
+ StringRef Name;
+ if (Parser.parseIdentifier(Name))
+ reportParseError("expected identifier");
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token");
+ Lex();
+
+ int64_t Flags = 0;
+ if (Parser.parseAbsoluteExpression(Flags))
+ return TokError("unexpected token");
+
+ getTargetStreamer().emitMipsHackSTOCG(Sym, Flags);
+ return false;
+}
+
+bool MipsAsmParser::parseDirectiveMipsHackELFFlags() {
+ int64_t Flags = 0;
+ if (Parser.parseAbsoluteExpression(Flags))
+ return TokError("unexpected token");
+
+ getTargetStreamer().emitMipsHackELFFlags(Flags);
+ return false;
+}
+
/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
@@ -1835,6 +2423,22 @@ bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
return false;
}
+/// parseDirectiveGpWord
+/// ::= .gpword local_sym
+bool MipsAsmParser::parseDirectiveGpWord() {
+ const MCExpr *Value;
+  // EmitGPRel32Value requires an expression, so we use the base class
+  // method to evaluate the expression.
+ if (getParser().parseExpression(Value))
+ return true;
+ getParser().getStreamer().EmitGPRel32Value(Value);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return Error(getLexer().getLoc(), "unexpected token in directive");
+ Parser.Lex(); // Eat EndOfStatement token.
+ return false;
+}
+
bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
@@ -1875,7 +2479,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".gpword") {
// Ignore this directive for now.
- Parser.eatToEndOfStatement();
+ parseDirectiveGpWord();
return false;
}
@@ -1884,6 +2488,12 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
}
+ if (IDVal == ".mips_hack_stocg")
+ return parseDirectiveMipsHackStocg();
+
+ if (IDVal == ".mips_hack_elf_flags")
+ return parseDirectiveMipsHackELFFlags();
+
return true;
}
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index aedb78b..6acc9a8 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -35,7 +35,6 @@ add_llvm_target(MipsCodeGen
MipsMachineFunction.cpp
MipsModuleISelDAGToDAG.cpp
MipsOs16.cpp
- MipsOptimizeMathLibCalls.cpp
MipsRegisterInfo.cpp
MipsSEFrameLowering.cpp
MipsSEInstrInfo.cpp
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index d99df4d..60508a8 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -35,26 +35,33 @@ public:
///
MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
bool bigEndian) :
- MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {}
+ MCDisassembler(STI), RegInfo(Info),
+ IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {}
virtual ~MipsDisassemblerBase() {}
const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
+ bool isN64() const { return IsN64; }
+
private:
OwningPtr<const MCRegisterInfo> RegInfo;
+ bool IsN64;
protected:
bool isBigEndian;
};
/// MipsDisassembler - a disassembler class for Mips32.
class MipsDisassembler : public MipsDisassemblerBase {
+ bool IsMicroMips;
public:
/// Constructor - Initializes the disassembler.
///
MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
bool bigEndian) :
- MipsDisassemblerBase(STI, Info, bigEndian) {}
+ MipsDisassemblerBase(STI, Info, bigEndian) {
+ IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+ }
/// getInstruction - See MCDisassembler.
virtual DecodeStatus getInstruction(MCInst &instr,
@@ -103,10 +110,15 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
unsigned RegNo,
@@ -118,6 +130,11 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -138,20 +155,45 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
uint64_t Address,
const void *Decoder);
-static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
-static DecodeStatus DecodeHIRegsDSPRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
-static DecodeStatus DecodeLORegsDSPRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder);
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeBranchTarget(MCInst &Inst,
unsigned Offset,
@@ -163,11 +205,38 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder);
+
+// DecodeJumpTargetMM - Decode microMIPS jump target, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeMem(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
uint64_t Address,
const void *Decoder);
@@ -177,6 +246,13 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+// Decode the immediate field of an LSA instruction which
+// is off by one.
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeInsSize(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -237,7 +313,8 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
uint64_t address,
uint64_t &size,
uint32_t &insn,
- bool isBigEndian) {
+ bool isBigEndian,
+ bool IsMicroMips) {
uint8_t Bytes[4];
// We want to read exactly 4 Bytes of data.
@@ -255,10 +332,20 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
}
else {
    // Encoded as a little-endian 32-bit word in the stream.
- insn = (Bytes[0] << 0) |
- (Bytes[1] << 8) |
- (Bytes[2] << 16) |
- (Bytes[3] << 24);
+ // Little-endian byte ordering:
+ // mips32r2: 4 | 3 | 2 | 1
+ // microMIPS: 2 | 1 | 4 | 3
+ if (IsMicroMips) {
+ insn = (Bytes[2] << 0) |
+ (Bytes[3] << 8) |
+ (Bytes[0] << 16) |
+ (Bytes[1] << 24);
+ } else {
+ insn = (Bytes[0] << 0) |
+ (Bytes[1] << 8) |
+ (Bytes[2] << 16) |
+ (Bytes[3] << 24);
+ }
}
return MCDisassembler::Success;
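To make the ordering comment above concrete: a little-endian 32-bit word 0x11223344 is stored as the bytes 44 33 22 11 on mips32r2 but as 22 11 44 33 under microMIPS, where each 16-bit half is little-endian and the high half comes first. A standalone restatement of the reassembly, for illustration only:

#include <cstdint>

// Rebuild a 32-bit instruction from a little-endian byte stream, honouring
// the microMIPS halfword-swapped layout described above.
static uint32_t assembleLittleEndian32(const uint8_t Bytes[4], bool IsMicroMips) {
  if (IsMicroMips)
    return (Bytes[2] << 0) | (Bytes[3] << 8) | (Bytes[0] << 16) |
           (Bytes[1] << 24);
  return (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
         (Bytes[3] << 24);
}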
@@ -274,10 +361,21 @@ MipsDisassembler::getInstruction(MCInst &instr,
uint32_t Insn;
DecodeStatus Result = readInstruction32(Region, Address, Size,
- Insn, isBigEndian);
+ Insn, isBigEndian, IsMicroMips);
if (Result == MCDisassembler::Fail)
return MCDisassembler::Fail;
+ if (IsMicroMips) {
+ // Calling the auto-generated decoder function.
+ Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address,
+ this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ return Result;
+ }
+ return MCDisassembler::Fail;
+ }
+
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
this, STI);
@@ -299,7 +397,7 @@ Mips64Disassembler::getInstruction(MCInst &instr,
uint32_t Insn;
DecodeStatus Result = readInstruction32(Region, Address, Size,
- Insn, isBigEndian);
+ Insn, isBigEndian, false);
if (Result == MCDisassembler::Fail)
return MCDisassembler::Fail;
@@ -359,10 +457,20 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (static_cast<const MipsDisassembler *>(Decoder)->isN64())
+ return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder);
+
+ return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
}
@@ -390,6 +498,18 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::FGRH32RegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -434,6 +554,58 @@ static DecodeStatus DecodeMem(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
+ unsigned Reg = fieldFromInstruction(Insn, 6, 5);
+ unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+ Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ Inst.addOperand(MCOperand::CreateReg(Base));
+ Inst.addOperand(MCOperand::CreateImm(Offset));
+
+ return MCDisassembler::Success;
+}
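The three fieldFromInstruction calls above can be written out with plain shifts and masks; a hypothetical equivalent, useful only to see where each field lives in the word:

#include <cstdint>

// Fields read by DecodeMSA128Mem: a 10-bit signed offset at bit 16,
// the data register at bit 6 and the base register at bit 11.
struct Msa128MemFields {
  int32_t Offset;
  unsigned Reg;
  unsigned Base;
};

static Msa128MemFields extractMsa128MemFields(uint32_t Insn) {
  int32_t Offset = static_cast<int32_t>((Insn >> 16) & 0x3ff);
  if (Offset & 0x200) // sign bit of the 10-bit field
    Offset -= 0x400;
  return { Offset, (Insn >> 6) & 0x1f, (Insn >> 11) & 0x1f };
}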
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<12>(Insn & 0x0fff);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ Inst.addOperand(MCOperand::CreateReg(Base));
+ Inst.addOperand(MCOperand::CreateImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ int Offset = SignExtend32<16>(Insn & 0xffff);
+ unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+ unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+ Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+ Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ Inst.addOperand(MCOperand::CreateReg(Base));
+ Inst.addOperand(MCOperand::CreateImm(Offset));
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeFMem(MCInst &Inst,
unsigned Insn,
uint64_t Address,
@@ -477,38 +649,98 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
-static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo >= 4)
return MCDisassembler::Fail;
- unsigned Reg = getReg(Decoder, Mips::ACRegsDSPRegClassID, RegNo);
+ unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
-static DecodeStatus DecodeHIRegsDSPRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo >= 4)
return MCDisassembler::Fail;
- unsigned Reg = getReg(Decoder, Mips::HIRegsDSPRegClassID, RegNo);
+ unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
-static DecodeStatus DecodeLORegsDSPRegisterClass(MCInst &Inst,
- unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
if (RegNo >= 4)
return MCDisassembler::Fail;
- unsigned Reg = getReg(Decoder, Mips::LORegsDSPRegClassID, RegNo);
+ unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 7)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
Inst.addOperand(MCOperand::CreateReg(Reg));
return MCDisassembler::Success;
}
@@ -533,6 +765,24 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+ unsigned Offset,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned BranchOffset = Offset & 0xffff;
+ BranchOffset = SignExtend32<18>(BranchOffset << 1);
+ Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+ return MCDisassembler::Success;
+}
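A field value of 0x0100, for example, decodes to a byte offset of 0x200. The same arithmetic restated as a free function (hypothetical helper, reusing the existing SignExtend32 from MathExtras.h):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// The 16-bit branch field counts halfwords, so double it and sign-extend
// the 18-bit result, exactly as DecodeBranchTargetMM does above.
static int32_t microMipsBranchByteOffset(uint32_t Field) {
  return llvm::SignExtend32<18>((Field & 0xffff) << 1);
}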
+
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 1;
+ Inst.addOperand(MCOperand::CreateImm(JumpOffset));
+ return MCDisassembler::Success;
+}
static DecodeStatus DecodeSimm16(MCInst &Inst,
unsigned Insn,
@@ -542,6 +792,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ // We add one to the immediate field as it was encoded as 'imm - 1'.
+ Inst.addOperand(MCOperand::CreateImm(Insn + 1));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeInsSize(MCInst &Inst,
unsigned Insn,
uint64_t Address,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 369fece..7884589 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -184,6 +184,15 @@ void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
printOperand(MI, opNum, O);
}
+void MipsInstPrinter::printUnsignedImm8(const MCInst *MI, int opNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(opNum);
+ if (MO.isImm())
+ O << (unsigned short int)(unsigned char)MO.getImm();
+ else
+ printOperand(MI, opNum, O);
+}
+
void MipsInstPrinter::
printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
// Load/Store memory operands -- imm($reg)
@@ -211,6 +220,11 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
O << MipsFCCToString((Mips::CondCode)MO.getImm());
}
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+ llvm_unreachable("TODO");
+}
+
bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
unsigned OpNo, raw_ostream &OS) {
OS << "\t" << Str << "\t";
@@ -230,8 +244,11 @@ bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
switch (MI.getOpcode()) {
case Mips::BEQ:
+ // beq $zero, $zero, $L2 => b $L2
// beq $r0, $zero, $L2 => beqz $r0, $L2
- return isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
+ return (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1) &&
+ printAlias("b", MI, 2, OS)) ||
+ (isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS));
case Mips::BEQ64:
// beq $r0, $zero, $L2 => beqz $r0, $L2
return isReg<Mips::ZERO_64>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
@@ -257,6 +274,7 @@ bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
// jalr $ra, $r1 => jalr $r1
return isReg<Mips::RA_64>(MI, 0) && printAlias("jalr", MI, 1, OS);
case Mips::NOR:
+ case Mips::NOR_MM:
// nor $r0, $r1, $zero => not $r0, $r1
return isReg<Mips::ZERO>(MI, 2) && printAlias("not", MI, 0, 1, OS);
case Mips::NOR64:
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 1253ab0..f75ae24 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -93,9 +93,11 @@ public:
private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printUnsignedImm(const MCInst *MI, int opNum, raw_ostream &O);
+ void printUnsignedImm8(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+ void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
raw_ostream &OS);
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 1f08789..9116748 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -5,7 +5,7 @@ add_llvm_library(LLVMMipsDesc
MipsMCTargetDesc.cpp
MipsELFObjectWriter.cpp
MipsReginfo.cpp
- MipsELFStreamer.cpp
+ MipsTargetStreamer.cpp
)
add_dependencies(LLVMMipsDesc MipsCommonTableGen)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 0b13607..3e70b23 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -45,6 +45,10 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case Mips::fixup_Mips_GOT_DISP:
case Mips::fixup_Mips_GOT_LO16:
case Mips::fixup_Mips_CALL_LO16:
+ case Mips::fixup_MICROMIPS_LO16:
+ case Mips::fixup_MICROMIPS_GOT_PAGE:
+ case Mips::fixup_MICROMIPS_GOT_OFST:
+ case Mips::fixup_MICROMIPS_GOT_DISP:
break;
case Mips::fixup_Mips_PC16:
// So far we are only using this type for branches.
@@ -65,6 +69,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case Mips::fixup_Mips_GOT_Local:
case Mips::fixup_Mips_GOT_HI16:
case Mips::fixup_Mips_CALL_HI16:
+ case Mips::fixup_MICROMIPS_HI16:
// Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
Value = ((Value + 0x8000) >> 16) & 0xffff;
break;
@@ -76,6 +81,13 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
// Get the 4th 16-bits.
Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
break;
+ case Mips::fixup_MICROMIPS_26_S1:
+ Value >>= 1;
+ break;
+ case Mips::fixup_MICROMIPS_PC16_S1:
+ Value -= 4;
+ Value >>= 1;
+ break;
}
return Value;
@@ -188,7 +200,20 @@ public:
{ "fixup_Mips_GOT_HI16", 0, 16, 0 },
{ "fixup_Mips_GOT_LO16", 0, 16, 0 },
{ "fixup_Mips_CALL_HI16", 0, 16, 0 },
- { "fixup_Mips_CALL_LO16", 0, 16, 0 }
+ { "fixup_Mips_CALL_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_26_S1", 0, 26, 0 },
+ { "fixup_MICROMIPS_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT16", 0, 16, 0 },
+ { "fixup_MICROMIPS_PC16_S1", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_MICROMIPS_CALL16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_DISP", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_PAGE", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOT_OFST", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_HI16", 0, 16, 0 },
+ { "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 }
};
if (Kind < FirstTargetFixupKind)
@@ -253,25 +278,33 @@ public:
} // namespace
// MCAsmBackend
-MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/true, /*Is64Bit*/false);
}
-MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/false);
}
-MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/true, /*Is64Bit*/true);
}
-MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/true);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 6471b51..83c7d4b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -183,6 +183,45 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
case Mips::fixup_Mips_CALL_LO16:
Type = ELF::R_MIPS_CALL_LO16;
break;
+ case Mips::fixup_MICROMIPS_26_S1:
+ Type = ELF::R_MICROMIPS_26_S1;
+ break;
+ case Mips::fixup_MICROMIPS_HI16:
+ Type = ELF::R_MICROMIPS_HI16;
+ break;
+ case Mips::fixup_MICROMIPS_LO16:
+ Type = ELF::R_MICROMIPS_LO16;
+ break;
+ case Mips::fixup_MICROMIPS_GOT16:
+ Type = ELF::R_MICROMIPS_GOT16;
+ break;
+ case Mips::fixup_MICROMIPS_PC16_S1:
+ Type = ELF::R_MICROMIPS_PC16_S1;
+ break;
+ case Mips::fixup_MICROMIPS_CALL16:
+ Type = ELF::R_MICROMIPS_CALL16;
+ break;
+ case Mips::fixup_MICROMIPS_GOT_DISP:
+ Type = ELF::R_MICROMIPS_GOT_DISP;
+ break;
+ case Mips::fixup_MICROMIPS_GOT_PAGE:
+ Type = ELF::R_MICROMIPS_GOT_PAGE;
+ break;
+ case Mips::fixup_MICROMIPS_GOT_OFST:
+ Type = ELF::R_MICROMIPS_GOT_OFST;
+ break;
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+ Type = ELF::R_MICROMIPS_TLS_DTPREL_HI16;
+ break;
+ case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+ Type = ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+ break;
+ case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+ Type = ELF::R_MICROMIPS_TLS_TPREL_HI16;
+ break;
+ case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+ Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
+ break;
}
return Type;
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
deleted file mode 100644
index cfcb877..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-//===-- MipsELFStreamer.cpp - MipsELFStreamer ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsELFStreamer.h"
-#include "MipsSubtarget.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELF.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-
-namespace llvm {
-
- MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack) {
- MipsELFStreamer *S = new MipsELFStreamer(Context, TAB, OS, Emitter,
- RelaxAll, NoExecStack);
- return S;
- }
-
- // For llc. Set a group of ELF header flags
- void
- MipsELFStreamer::emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget) {
-
- if (hasRawTextSupport())
- return;
-
- // Update e_header flags
- MCAssembler& MCA = getAssembler();
- unsigned EFlags = MCA.getELFHeaderEFlags();
-
- // TODO: Need to add -mabicalls and -mno-abicalls flags.
- // Currently we assume that -mabicalls is the default.
- EFlags |= ELF::EF_MIPS_CPIC;
-
- if (Subtarget.inMips16Mode())
- EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
- else
- EFlags |= ELF::EF_MIPS_NOREORDER;
-
- // Architecture
- if (Subtarget.hasMips64r2())
- EFlags |= ELF::EF_MIPS_ARCH_64R2;
- else if (Subtarget.hasMips64())
- EFlags |= ELF::EF_MIPS_ARCH_64;
- else if (Subtarget.hasMips32r2())
- EFlags |= ELF::EF_MIPS_ARCH_32R2;
- else
- EFlags |= ELF::EF_MIPS_ARCH_32;
-
- if (Subtarget.inMicroMipsMode())
- EFlags |= ELF::EF_MIPS_MICROMIPS;
-
- // ABI
- if (Subtarget.isABI_O32())
- EFlags |= ELF::EF_MIPS_ABI_O32;
-
- // Relocation Model
- Reloc::Model RM = Subtarget.getRelocationModel();
- if (RM == Reloc::PIC_ || RM == Reloc::Default)
- EFlags |= ELF::EF_MIPS_PIC;
- else if (RM == Reloc::Static)
- ; // Do nothing for Reloc::Static
- else
- llvm_unreachable("Unsupported relocation model for e_flags");
-
- MCA.setELFHeaderEFlags(EFlags);
- }
-
- // For llc. Set a symbol's STO flags
- void
- MipsELFStreamer::emitMipsSTOCG(const MipsSubtarget &Subtarget,
- MCSymbol *Sym,
- unsigned Val) {
-
- if (hasRawTextSupport())
- return;
-
- MCSymbolData &Data = getOrCreateSymbolData(Sym);
- // The "other" values are stored in the last 6 bits of the second byte
- // The traditional defines for STO values assume the full byte and thus
- // the shift to pack it.
- MCELF::setOther(Data, Val >> 2);
- }
-
-} // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
deleted file mode 100644
index b10ccc7..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//=== MipsELFStreamer.h - MipsELFStreamer ------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENCE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-#ifndef MIPSELFSTREAMER_H_
-#define MIPSELFSTREAMER_H_
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-class MipsAsmPrinter;
-class MipsSubtarget;
-class MCSymbol;
-
-class MipsELFStreamer : public MCELFStreamer {
-public:
- MipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack)
- : MCELFStreamer(SK_MipsELFStreamer, Context, TAB, OS, Emitter) {
- }
-
- ~MipsELFStreamer() {}
- void emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget);
- void emitMipsSTOCG(const MipsSubtarget &Subtarget,
- MCSymbol *Sym,
- unsigned Val);
-
- static bool classof(const MCStreamer *S) {
- return S->getKind() == SK_MipsELFStreamer;
- }
-};
-
- MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
- raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack);
-}
-
-#endif /* MIPSELFSTREAMER_H_ */
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index f963900..6ed44b7 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -128,6 +128,45 @@ namespace Mips {
// resulting in - R_MIPS_CALL_LO16
fixup_Mips_CALL_LO16,
+ // resulting in - R_MICROMIPS_26_S1
+ fixup_MICROMIPS_26_S1,
+
+ // resulting in - R_MICROMIPS_HI16
+ fixup_MICROMIPS_HI16,
+
+ // resulting in - R_MICROMIPS_LO16
+ fixup_MICROMIPS_LO16,
+
+ // resulting in - R_MICROMIPS_GOT16
+ fixup_MICROMIPS_GOT16,
+
+ // resulting in - R_MICROMIPS_PC16_S1
+ fixup_MICROMIPS_PC16_S1,
+
+ // resulting in - R_MICROMIPS_CALL16
+ fixup_MICROMIPS_CALL16,
+
+ // resulting in - R_MICROMIPS_GOT_DISP
+ fixup_MICROMIPS_GOT_DISP,
+
+ // resulting in - R_MICROMIPS_GOT_PAGE
+ fixup_MICROMIPS_GOT_PAGE,
+
+ // resulting in - R_MICROMIPS_GOT_OFST
+ fixup_MICROMIPS_GOT_OFST,
+
+ // resulting in - R_MICROMIPS_TLS_DTPREL_HI16
+ fixup_MICROMIPS_TLS_DTPREL_HI16,
+
+ // resulting in - R_MICROMIPS_TLS_DTPREL_LO16
+ fixup_MICROMIPS_TLS_DTPREL_LO16,
+
+ // resulting in - R_MICROMIPS_TLS_TPREL_HI16
+ fixup_MICROMIPS_TLS_TPREL_HI16,
+
+ // resulting in - R_MICROMIPS_TLS_TPREL_LO16
+ fixup_MICROMIPS_TLS_TPREL_LO16,
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 33f6f96..6aa3c76 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -38,7 +38,6 @@ MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
ZeroDirective = "\t.space\t";
GPRel32Directive = "\t.gpword\t";
GPRel64Directive = "\t.gpdword\t";
- WeakRefDirective = "\t.weak\t";
DebugLabelSuffix = "=.";
SupportsDebugInformation = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 772234e..1000113 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -14,12 +14,12 @@
#ifndef MIPSTARGETASMINFO_H
#define MIPSTARGETASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class StringRef;
- class MipsMCAsmInfo : public MCAsmInfo {
+ class MipsMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit MipsMCAsmInfo(StringRef TT);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 4dc6917..66428bd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -39,11 +39,14 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
MCContext &Ctx;
const MCSubtargetInfo &STI;
bool IsLittleEndian;
+ bool IsMicroMips;
public:
MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_,
const MCSubtargetInfo &sti, bool IsLittle) :
- MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {}
+ MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {
+ IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+ }
~MipsMCCodeEmitter() {}
@@ -53,9 +56,17 @@ public:
void EmitInstruction(uint64_t Val, unsigned Size, raw_ostream &OS) const {
// Output the instruction encoding in little endian byte order.
- for (unsigned i = 0; i < Size; ++i) {
- unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
- EmitByte((Val >> Shift) & 0xff, OS);
+ // Little-endian byte ordering:
+ // mips32r2: 4 | 3 | 2 | 1
+ // microMIPS: 2 | 1 | 4 | 3
+ if (IsLittleEndian && Size == 4 && IsMicroMips) {
+ EmitInstruction(Val>>16, 2, OS);
+ EmitInstruction(Val, 2, OS);
+ } else {
+ for (unsigned i = 0; i < Size; ++i) {
+ unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+ EmitByte((Val >> Shift) & 0xff, OS);
+ }
}
}
@@ -73,12 +84,24 @@ public:
unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
+  // getJumpTargetOpValueMM - Return binary encoding of the microMIPS jump
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
// getBranchTargetOpValue - Return binary encoding of the branch
// target operand. If the machine operand requires relocation,
// record the relocation and return zero.
unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
+  // getBranchTargetOpValueMM - Return binary encoding of the microMIPS branch
+ // target operand. If the machine operand requires relocation,
+ // record the relocation and return zero.
+ unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
  // getMachineOpValue - Return binary encoding of operand. If the machine
// operand requires relocation, record the relocation and return zero.
unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
@@ -86,11 +109,17 @@ public:
unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
+ // getLSAImmEncoding - Return binary encoding of LSA immediate.
+ unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
unsigned
getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const;
@@ -142,6 +171,9 @@ static void LowerLargeShift(MCInst& Inst) {
case Mips::DSRA:
Inst.setOpcode(Mips::DSRA32);
return;
+ case Mips::DROTR:
+ Inst.setOpcode(Mips::DROTR32);
+ return;
}
}
@@ -177,7 +209,7 @@ static void LowerDextDins(MCInst& InstIn) {
}
/// EncodeInstruction - Emit the instruction.
-/// Size the instruction (currently only 4 bytes
+/// Size the instruction with Desc.getSize().
void MipsMCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const
@@ -193,6 +225,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case Mips::DSLL:
case Mips::DSRL:
case Mips::DSRA:
+ case Mips::DROTR:
LowerLargeShift(TmpInst);
break;
// Double extract instruction is chosen by pos and size operands
@@ -201,6 +234,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
LowerDextDins(TmpInst);
}
+ unsigned long N = Fixups.size();
uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
// Check for unimplemented opcodes.
@@ -213,6 +247,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if (STI.getFeatureBits() & Mips::FeatureMicroMips) {
int NewOpcode = Mips::Std2MicroMips (Opcode, Mips::Arch_micromips);
if (NewOpcode != -1) {
+ if (Fixups.size() > N)
+ Fixups.pop_back();
Opcode = NewOpcode;
TmpInst.setOpcode (NewOpcode);
Binary = getBinaryCodeForInstr(TmpInst, Fixups);
@@ -250,6 +286,28 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
+/// getBranchTargetOpValueMM - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValueMM expects only expressions or immediates");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::Create(0, Expr,
+ MCFixupKind(Mips::
+ fixup_MICROMIPS_PC16_S1)));
+ return 0;
+}
+
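As a rough aside, the halving above reflects that microMIPS instructions are 2-byte aligned, so a resolved branch displacement is stored in halfword units while a symbolic target falls back to the fixup_MICROMIPS_PC16_S1 fixup. A minimal standalone sketch of that scaling (the helper name and the 16-bit mask are illustrative assumptions, not LLVM APIs):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Encode a resolved microMIPS branch displacement (in bytes) as a
// 16-bit PC-relative field expressed in halfwords, mirroring the
// MO.getImm() >> 1 step above.
static uint32_t encodeMicroMipsBranchImm(int32_t ByteOffset) {
  assert((ByteOffset & 1) == 0 && "microMIPS branch targets are 2-byte aligned");
  return static_cast<uint32_t>(ByteOffset >> 1) & 0xFFFF;
}

int main() {
  std::printf("0x%04x\n", encodeMicroMipsBranchImm(64)); // 0x0020
  std::printf("0x%04x\n", encodeMicroMipsBranchImm(-4)); // 0xfffe
  return 0;
}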
/// getJumpTargetOpValue - Return binary encoding of the jump
/// target operand. If the machine operand requires relocation,
/// record the relocation and return zero.
@@ -271,6 +329,23 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
}
unsigned MipsMCCodeEmitter::
+getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+ // If the destination is an immediate, divide by 2.
+ if (MO.isImm()) return MO.getImm() >> 1;
+
+ assert(MO.isExpr() &&
+ "getJumpTargetOpValueMM expects only expressions or an immediate");
+
+ const MCExpr *Expr = MO.getExpr();
+ Fixups.push_back(MCFixup::Create(0, Expr,
+ MCFixupKind(Mips::fixup_MICROMIPS_26_S1)));
+ return 0;
+}
+
+unsigned MipsMCCodeEmitter::
getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
int64_t Res;
@@ -300,31 +375,39 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
FixupKind = Mips::fixup_Mips_GPOFF_LO;
break;
case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
- FixupKind = Mips::fixup_Mips_GOT_PAGE;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_PAGE
+ : Mips::fixup_Mips_GOT_PAGE;
break;
case MCSymbolRefExpr::VK_Mips_GOT_OFST :
- FixupKind = Mips::fixup_Mips_GOT_OFST;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_OFST
+ : Mips::fixup_Mips_GOT_OFST;
break;
case MCSymbolRefExpr::VK_Mips_GOT_DISP :
- FixupKind = Mips::fixup_Mips_GOT_DISP;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_DISP
+ : Mips::fixup_Mips_GOT_DISP;
break;
case MCSymbolRefExpr::VK_Mips_GPREL:
FixupKind = Mips::fixup_Mips_GPREL16;
break;
case MCSymbolRefExpr::VK_Mips_GOT_CALL:
- FixupKind = Mips::fixup_Mips_CALL16;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_CALL16
+ : Mips::fixup_Mips_CALL16;
break;
case MCSymbolRefExpr::VK_Mips_GOT16:
- FixupKind = Mips::fixup_Mips_GOT_Global;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
+ : Mips::fixup_Mips_GOT_Global;
break;
case MCSymbolRefExpr::VK_Mips_GOT:
- FixupKind = Mips::fixup_Mips_GOT_Local;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
+ : Mips::fixup_Mips_GOT_Local;
break;
case MCSymbolRefExpr::VK_Mips_ABS_HI:
- FixupKind = Mips::fixup_Mips_HI16;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_HI16
+ : Mips::fixup_Mips_HI16;
break;
case MCSymbolRefExpr::VK_Mips_ABS_LO:
- FixupKind = Mips::fixup_Mips_LO16;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_LO16
+ : Mips::fixup_Mips_LO16;
break;
case MCSymbolRefExpr::VK_Mips_TLSGD:
FixupKind = Mips::fixup_Mips_TLSGD;
@@ -333,19 +416,23 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
FixupKind = Mips::fixup_Mips_TLSLDM;
break;
case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
- FixupKind = Mips::fixup_Mips_DTPREL_HI;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+ : Mips::fixup_Mips_DTPREL_HI;
break;
case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
- FixupKind = Mips::fixup_Mips_DTPREL_LO;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+ : Mips::fixup_Mips_DTPREL_LO;
break;
case MCSymbolRefExpr::VK_Mips_GOTTPREL:
FixupKind = Mips::fixup_Mips_GOTTPREL;
break;
case MCSymbolRefExpr::VK_Mips_TPREL_HI:
- FixupKind = Mips::fixup_Mips_TPREL_HI;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+ : Mips::fixup_Mips_TPREL_HI;
break;
case MCSymbolRefExpr::VK_Mips_TPREL_LO:
- FixupKind = Mips::fixup_Mips_TPREL_LO;
+ FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+ : Mips::fixup_Mips_TPREL_LO;
break;
case MCSymbolRefExpr::VK_Mips_HIGHER:
FixupKind = Mips::fixup_Mips_HIGHER;
@@ -406,6 +493,17 @@ MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
return (OffBits & 0xFFFF) | RegBits;
}
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ // Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
+ assert(MI.getOperand(OpNo).isReg());
+ unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) << 16;
+ unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups);
+
+ return (OffBits & 0x0FFF) | RegBits;
+}
+
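The packing above follows the usual microMIPS load/store layout: the 5-bit base-register encoding sits in bits 20-16 and a signed 12-bit offset in bits 11-0. A self-contained sketch under that assumption (the helper is hypothetical, and BaseReg stands for the already-encoded register number that getMachineOpValue would normally produce):

#include <cstdint>
#include <cstdio>

// Pack (encoded base register, signed 12-bit offset) the same way as
// getMemEncodingMMImm12: (offset & 0xFFF) | (reg << 16).
static uint32_t packMMImm12(unsigned BaseReg, int32_t Offset) {
  return (static_cast<uint32_t>(Offset) & 0x0FFF) |
         (static_cast<uint32_t>(BaseReg) << 16);
}

int main() {
  // $sp is register 29; an offset of -8 wraps into the low 12 bits.
  std::printf("0x%08x\n", packMMImm12(29, -8)); // 0x001d0ff8
  return 0;
}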
unsigned
MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const {
@@ -427,5 +525,13 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
return Position + Size - 1;
}
+unsigned
+MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ assert(MI.getOperand(OpNo).isImm());
+ // The immediate is encoded as 'immediate - 1'.
+ return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) - 1;
+}
+
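The '- 1' above matches the usual LSA semantics, where the shift amount ranges over 1..4 and the 2-bit field stores one less than that value; a tiny sketch under that assumption (the helper name is hypothetical):

#include <cassert>
#include <cstdio>

// Mirror getLSAImmEncoding: shift amounts 1..4 are stored as 0..3.
static unsigned encodeLSAImm(unsigned ShiftAmount) {
  assert(ShiftAmount >= 1 && ShiftAmount <= 4 && "lsa shift amount is 1..4");
  return ShiftAmount - 1;
}

int main() {
  for (unsigned S = 1; S <= 4; ++S)
    std::printf("sa=%u -> field=%u\n", S, encodeLSAImm(S));
  return 0;
}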
#include "MipsGenMCCodeEmitter.inc"
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 837fabe..5548aaa 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -11,17 +11,21 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsELFStreamer.h"
#include "MipsMCTargetDesc.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MipsMCAsmInfo.h"
+#include "MipsTargetStreamer.h"
#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
@@ -125,14 +129,23 @@ static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
}
static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
- MCContext &Ctx, MCAsmBackend &MAB,
- raw_ostream &_OS,
- MCCodeEmitter *_Emitter,
- bool RelaxAll,
- bool NoExecStack) {
- Triple TheTriple(TT);
-
- return createMipsELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+ MCContext &Context, MCAsmBackend &MAB,
+ raw_ostream &OS, MCCodeEmitter *Emitter,
+ bool RelaxAll, bool NoExecStack) {
+ MipsTargetELFStreamer *S = new MipsTargetELFStreamer();
+ return createELFStreamer(Context, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool useDwarfDirectory, MCInstPrinter *InstPrint,
+ MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
+ MipsTargetAsmStreamer *S = new MipsTargetAsmStreamer(OS);
+
+ return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+ useDwarfDirectory, InstPrint, CE, TAB,
+ ShowInst);
}
extern "C" void LLVMInitializeMipsTargetMC() {
@@ -183,6 +196,12 @@ extern "C" void LLVMInitializeMipsTargetMC() {
TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget,
createMCStreamer);
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmStreamer(TheMipsTarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheMipselTarget, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer);
+
// Register the asm backend.
TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
createMipsAsmBackendEB32);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 71954a4..eabebfe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -42,14 +42,14 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT,
- StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT,
- StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT,
- StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT,
- StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
uint8_t OSABI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
new file mode 100644
index 0000000..5e90bbc
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -0,0 +1,67 @@
+//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> PrintHackDirectives("print-hack-directives",
+ cl::init(false), cl::Hidden);
+
+// pin vtable to this file
+void MipsTargetStreamer::anchor() {}
+
+MipsTargetAsmStreamer::MipsTargetAsmStreamer(formatted_raw_ostream &OS)
+ : OS(OS) {}
+
+void MipsTargetAsmStreamer::emitMipsHackELFFlags(unsigned Flags) {
+ if (!PrintHackDirectives)
+ return;
+
+ OS << "\t.mips_hack_elf_flags 0x";
+ OS.write_hex(Flags);
+ OS << '\n';
+}
+void MipsTargetAsmStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
+ if (!PrintHackDirectives)
+ return;
+
+ OS << "\t.mips_hack_stocg ";
+ OS << Sym->getName();
+ OS << ", ";
+ OS << Val;
+ OS << '\n';
+}
+
+MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
+ return static_cast<MCELFStreamer &>(*Streamer);
+}
+
+void MipsTargetELFStreamer::emitMipsHackELFFlags(unsigned Flags) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCA.setELFHeaderEFlags(Flags);
+}
+
+// Set a symbol's STO flags
+void MipsTargetELFStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
+ MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Sym);
+ // The "other" values are stored in the last 6 bits of the second byte
+ // The traditional defines for STO values assume the full byte and thus
+ // the shift to pack it.
+ MCELF::setOther(Data, Val >> 2);
+}
diff --git a/lib/Target/Mips/MSA.txt b/lib/Target/Mips/MSA.txt
new file mode 100644
index 0000000..d1c4193
--- /dev/null
+++ b/lib/Target/Mips/MSA.txt
@@ -0,0 +1,78 @@
+Code Generation Notes for MSA
+=============================
+
+Intrinsics are lowered to SelectionDAG nodes where possible in order to enable
+optimisation, reduce the size of the ISel matcher, and reduce repetition in
+the implementation. In a small number of cases, this can cause different
+(semantically equivalent) instructions to be used in place of the requested
+instruction, even when no optimisation has taken place.
+
+Instructions
+============
+
+This section describes any quirks of instruction selection for MSA. For
+example, two instructions might be equally valid for some given IR and one is
+chosen in preference to the other.
+
+bclri.b:
+ It is not possible to emit bclri.b since andi.b covers exactly the
+ same cases. andi.b should use fractionally less power than bclri.b in
+ most hardware implementations so it is used in preference to bclri.b.
+
+vshf.w:
+ It is not possible to emit vshf.w when the shuffle description is
+ constant since shf.w covers exactly the same cases. shf.w is used
+ instead. It is also impossible for the shuffle description to be
+ unknown at compile-time due to the definition of shufflevector in
+ LLVM IR.
+
+vshf.[bhwd]:
+ When the shuffle description describes a splat operation, splat.[bhwd]
+ instructions will be selected instead of vshf.[bhwd]. Unlike the ilv*,
+ and pck* instructions, this is matched from MipsISD::VSHF instead of
+ a special-case MipsISD node.
+
+ilvl.d, pckev.d:
+ It is not possible to emit ilvl.d, or pckev.d since ilvev.d covers the
+ same shuffle. ilvev.d will be emitted instead.
+
+ilvr.d, ilvod.d, pckod.d:
+ It is not possible to emit ilvr.d, or pckod.d since ilvod.d covers the
+ same shuffle. ilvod.d will be emitted instead.
+
+splat.[bhwd]:
+ The intrinsic will work as expected. However, unlike other intrinsics
+ it lowers directly to MipsISD::VSHF instead of using common IR.
+
+splati.w:
+ It is not possible to emit splati.w since shf.w covers the same cases.
+ shf.w will be emitted instead.
+
+copy_s.w:
+ On MIPS32, the copy_u.d intrinsic will emit this instruction instead of
+ copy_u.w. This is semantically equivalent since the general-purpose
+ register file is 32 bits wide.
+
+binsri.[bhwd], binsli.[bhwd]:
+ These two operations are equivalent to each other with the operands
+ swapped and condition inverted. The compiler may use either one as
+ appropriate.
+ Furthermore, the compiler may use bsel.[bhwd] for some masks that do
+ not survive the legalization process (this is a bug and will be fixed).
+
+bmnz.v, bmz.v, bsel.v:
+ These three operations differ only in the operand that is tied to the
+ result.
+ It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
+ the same operation and will be emitted instead.
+ In future, the compiler may choose between these three instructions
+ according to register allocation.
+
+bmnzi.b, bmzi.b:
+ Like their non-immediate counterparts, bmnzi.b and bmzi.b are the same
+ operation with the operands swapped. bmnzi.b will (currently) be emitted
+ for both cases.
+
+bseli.v:
+ Unlike the non-immediate versions, bseli.v is distinguishable from
+ bmnzi.b and bmzi.b and can be emitted.
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 665b4d2..c12a32e 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -39,8 +39,8 @@ class SLTI_FM_MM<bits<6> op> : MMArch {
bits<32> Inst;
let Inst{31-26} = op;
- let Inst{25-21} = rs;
- let Inst{20-16} = rt;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
let Inst{15-0} = imm16;
}
@@ -110,3 +110,195 @@ class LW_FM_MM<bits<6> op> : MMArch {
let Inst{20-16} = addr{20-16};
let Inst{15-0} = addr{15-0};
}
+
+class LWL_FM_MM<bits<4> funct> {
+ bits<5> rt;
+ bits<21> addr;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x18;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = addr{20-16};
+ let Inst{15-12} = funct;
+ let Inst{11-0} = addr{11-0};
+}
+
+class CMov_F_I_FM_MM<bits<7> func> : MMArch {
+ bits<5> rd;
+ bits<5> rs;
+ bits<3> fcc;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x15;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-13} = fcc;
+ let Inst{12-6} = func;
+ let Inst{5-0} = 0x3b;
+}
+
+class MTLO_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class MFLO_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = 0x00;
+ let Inst{20-16} = rd;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class CLO_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rd;
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class SEB_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rd;
+ bits<5> rt;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class EXT_FM_MM<bits<6> funct> : MMArch {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> pos;
+ bits<5> size;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-11} = size;
+ let Inst{10-6} = pos;
+ let Inst{5-0} = funct;
+}
+
+class J_FM_MM<bits<6> op> : MMArch {
+ bits<26> target;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-0} = target;
+}
+
+class JR_FM_MM<bits<8> funct> : MMArch {
+ bits<5> rs;
+
+ bits<32> Inst;
+
+ let Inst{31-21} = 0x00;
+ let Inst{20-16} = rs;
+ let Inst{15-14} = 0x0;
+ let Inst{13-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class JALR_FM_MM<bits<10> funct> : MMArch {
+ bits<5> rs;
+ bits<5> rd;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rd;
+ let Inst{20-16} = rs;
+ let Inst{15-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class BEQ_FM_MM<bits<6> op> : MMArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = op;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+class BGEZ_FM_MM<bits<5> funct> : MMArch {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+class BGEZAL_FM_MM<bits<5> funct> : MMArch {
+ bits<5> rs;
+ bits<16> offset;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = offset;
+}
+
+class TEQ_FM_MM<bits<6> funct> : MMArch {
+ bits<5> rs;
+ bits<5> rt;
+ bits<4> code_;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x00;
+ let Inst{25-21} = rt;
+ let Inst{20-16} = rs;
+ let Inst{15-12} = code_;
+ let Inst{11-6} = funct;
+ let Inst{5-0} = 0x3c;
+}
+
+class TEQI_FM_MM<bits<5> funct> : MMArch {
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = funct;
+ let Inst{20-16} = rs;
+ let Inst{15-0} = imm16;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 249d712..d9507fa 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,4 +1,51 @@
-let isCodeGenOnly = 1 in {
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
+
+def simm12 : Operand<i32> {
+ let DecoderMethod = "DecodeSimm12";
+}
+
+def mem_mm_12 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops GPR32, simm12);
+ let EncoderMethod = "getMemEncodingMMImm12";
+ let ParserMatchClass = MipsMemAsmOperand;
+ let OperandType = "OPERAND_MEMORY";
+}
+
+def jmptarget_mm : Operand<OtherVT> {
+ let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def calltarget_mm : Operand<iPTR> {
+ let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def brtarget_mm : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValueMM";
+ let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeBranchTargetMM";
+}
+
+let canFoldAsLoad = 1 in
+class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+ Operand MemOpnd> :
+ InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode addrimm12:$addr, RO:$src))],
+ NoItinerary, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+ string Constraints = "$src = $rt";
+}
+
+class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+ Operand MemOpnd>:
+ InstSE<(outs), (ins RO:$rt, MemOpnd:$addr),
+ !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, addrimm12:$addr)], NoItinerary, FrmI> {
+ let DecoderMethod = "DecodeMemMMImm12";
+}
+
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
/// Arithmetic Instructions (ALU Immediate)
def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd>,
ADDI_FM_MM<0xc>;
@@ -32,17 +79,21 @@ let isCodeGenOnly = 1 in {
def XOR_MM : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IIAlu, xor>,
ADD_FM_MM<0, 0x310>;
def NOR_MM : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
- def MULT_MM : MMRel, Mult<"mult", IIImul, GPR32Opnd, [HI, LO]>,
+ def MULT_MM : MMRel, Mult<"mult", IIImul, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x22c>;
- def MULTu_MM : MMRel, Mult<"multu", IIImul, GPR32Opnd, [HI, LO]>,
+ def MULTu_MM : MMRel, Mult<"multu", IIImul, GPR32Opnd, [HI0, LO0]>,
MULT_FM_MM<0x26c>;
+ def SDIV_MM : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM_MM<0x2ac>;
+ def UDIV_MM : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM_MM<0x2ec>;
/// Shift Instructions
- def SLL_MM : MMRel, shift_rotate_imm<"sll", shamt, GPR32Opnd>,
+ def SLL_MM : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd>,
SRA_FM_MM<0, 0>;
- def SRL_MM : MMRel, shift_rotate_imm<"srl", shamt, GPR32Opnd>,
+ def SRL_MM : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd>,
SRA_FM_MM<0x40, 0>;
- def SRA_MM : MMRel, shift_rotate_imm<"sra", shamt, GPR32Opnd>,
+ def SRA_MM : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd>,
SRA_FM_MM<0x80, 0>;
def SLLV_MM : MMRel, shift_rotate_reg<"sllv", GPR32Opnd>,
SRLV_FM_MM<0x10, 0>;
@@ -50,18 +101,119 @@ let isCodeGenOnly = 1 in {
SRLV_FM_MM<0x50, 0>;
def SRAV_MM : MMRel, shift_rotate_reg<"srav", GPR32Opnd>,
SRLV_FM_MM<0x90, 0>;
- def ROTR_MM : MMRel, shift_rotate_imm<"rotr", shamt, GPR32Opnd>,
+ def ROTR_MM : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd>,
SRA_FM_MM<0xc0, 0>;
def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd>,
SRLV_FM_MM<0xd0, 0>;
/// Load and Store Instructions - aligned
- defm LB_MM : LoadM<"lb", GPR32Opnd, sextloadi8>, MMRel, LW_FM_MM<0x7>;
- defm LBu_MM : LoadM<"lbu", GPR32Opnd, zextloadi8>, MMRel, LW_FM_MM<0x5>;
- defm LH_MM : LoadM<"lh", GPR32Opnd, sextloadi16>, MMRel, LW_FM_MM<0xf>;
- defm LHu_MM : LoadM<"lhu", GPR32Opnd, zextloadi16>, MMRel, LW_FM_MM<0xd>;
- defm LW_MM : LoadM<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
- defm SB_MM : StoreM<"sb", GPR32Opnd, truncstorei8>, MMRel, LW_FM_MM<0x6>;
- defm SH_MM : StoreM<"sh", GPR32Opnd, truncstorei16>, MMRel, LW_FM_MM<0xe>;
- defm SW_MM : StoreM<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
+ let DecoderMethod = "DecodeMemMMImm16" in {
+ def LB_MM : Load<"lb", GPR32Opnd>, MMRel, LW_FM_MM<0x7>;
+ def LBu_MM : Load<"lbu", GPR32Opnd>, MMRel, LW_FM_MM<0x5>;
+ def LH_MM : Load<"lh", GPR32Opnd>, MMRel, LW_FM_MM<0xf>;
+ def LHu_MM : Load<"lhu", GPR32Opnd>, MMRel, LW_FM_MM<0xd>;
+ def LW_MM : Load<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
+ def SB_MM : Store<"sb", GPR32Opnd>, MMRel, LW_FM_MM<0x6>;
+ def SH_MM : Store<"sh", GPR32Opnd>, MMRel, LW_FM_MM<0xe>;
+ def SW_MM : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
+ }
+
+ /// Load and Store Instructions - unaligned
+ def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12>,
+ LWL_FM_MM<0x0>;
+ def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12>,
+ LWL_FM_MM<0x1>;
+ def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12>,
+ LWL_FM_MM<0x8>;
+ def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
+ LWL_FM_MM<0x9>;
+
+ /// Move Conditional
+ def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
+ NoItinerary>, ADD_FM_MM<0, 0x58>;
+ def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
+ NoItinerary>, ADD_FM_MM<0, 0x18>;
+ def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIAlu>,
+ CMov_F_I_FM_MM<0x25>;
+ def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIAlu>,
+ CMov_F_I_FM_MM<0x5>;
+
+ /// Move to/from HI/LO
+ def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
+ MTLO_FM_MM<0x0b5>;
+ def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
+ MTLO_FM_MM<0x0f5>;
+ def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
+ MFLO_FM_MM<0x035>;
+ def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
+ MFLO_FM_MM<0x075>;
+
+ /// Multiply Add/Sub Instructions
+ def MADD_MM : MMRel, MArithR<"madd", 1>, MULT_FM_MM<0x32c>;
+ def MADDU_MM : MMRel, MArithR<"maddu", 1>, MULT_FM_MM<0x36c>;
+ def MSUB_MM : MMRel, MArithR<"msub">, MULT_FM_MM<0x3ac>;
+ def MSUBU_MM : MMRel, MArithR<"msubu">, MULT_FM_MM<0x3ec>;
+
+ /// Count Leading
+ def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>;
+ def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>;
+
+ /// Sign Ext In Register Instructions.
+ def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM_MM<0x0ac>;
+ def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM_MM<0x0ec>;
+
+ /// Word Swap Bytes Within Halfwords
+ def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>;
+
+ def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>,
+ EXT_FM_MM<0x2c>;
+ def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>,
+ EXT_FM_MM<0x0c>;
+
+ /// Jump Instructions
+ let DecoderMethod = "DecodeJumpTargetMM" in {
+ def J_MM : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+ J_FM_MM<0x35>;
+ def JAL_MM : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+ def TAILCALL_MM : MMRel, JumpFJ<calltarget_mm, "j", MipsTailCall, imm,
+ "tcall">, J_FM_MM<0x3d>, IsTailCall;
+ }
+ def JR_MM : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
+ def JALR_MM : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+ def TAILCALL_R_MM : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>,
+ JR_FM_MM<0x3c>, IsTailCall;
+ def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>;
+
+ /// Branch Instructions
+ def BEQ_MM : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
+ BEQ_FM_MM<0x25>;
+ def BNE_MM : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
+ BEQ_FM_MM<0x2d>;
+ def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
+ BGEZ_FM_MM<0x2>;
+ def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
+ BGEZ_FM_MM<0x6>;
+ def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
+ BGEZ_FM_MM<0x4>;
+ def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
+ BGEZ_FM_MM<0x0>;
+ def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
+ BGEZAL_FM_MM<0x03>;
+ def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
+ BGEZAL_FM_MM<0x01>;
+
+ /// Trap Instructions
+ def TEQ_MM : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
+ def TGE_MM : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM_MM<0x08>;
+ def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM_MM<0x10>;
+ def TLT_MM : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM_MM<0x20>;
+ def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM_MM<0x28>;
+ def TNE_MM : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM_MM<0x30>;
+
+ def TEQI_MM : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM_MM<0x0e>;
+ def TGEI_MM : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM_MM<0x09>;
+ def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM_MM<0x0b>;
+ def TLTI_MM : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM_MM<0x08>;
+ def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM_MM<0x0a>;
+ def TNEI_MM : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM_MM<0x0c>;
}
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index b88c0d2..e796deb 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -28,7 +28,6 @@ namespace llvm {
FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
JITCodeEmitter &JCE);
FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
- FunctionPass *createMipsOptimizeMathLibCalls(MipsTargetMachine &TM);
} // end namespace llvm;
#endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 2595e41..b8e3f39 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -78,6 +78,8 @@ def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
"Mips DSP-R2 ASE", [FeatureDSP]>;
+def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+
def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
"microMips mode">;
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 54fdb78..8ce2ced 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -20,7 +20,7 @@ namespace llvm {
class Mips16FrameLowering : public MipsFrameLowering {
public:
explicit Mips16FrameLowering(const MipsSubtarget &STI)
- : MipsFrameLowering(STI, 8) {}
+ : MipsFrameLowering(STI, STI.stackAlignment()) {}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 7e456aa..81bf18c 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -16,6 +16,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
#include <string>
static void inlineAsmOut
@@ -321,6 +322,37 @@ static void assureFPCallStub(Function &F, Module *M,
}
//
+// Functions that are llvm intrinsics and don't need helpers.
+//
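+// The table below is kept in lexicographic order because isIntrinsicInline
+// relies on std::binary_search over it.
+//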
+static const char *IntrinsicInline[] =
+ {"fabs",
+ "fabsf",
+ "llvm.ceil.f32", "llvm.ceil.f64",
+ "llvm.copysign.f32", "llvm.copysign.f64",
+ "llvm.cos.f32", "llvm.cos.f64",
+ "llvm.exp.f32", "llvm.exp.f64",
+ "llvm.exp2.f32", "llvm.exp2.f64",
+ "llvm.fabs.f32", "llvm.fabs.f64",
+ "llvm.floor.f32", "llvm.floor.f64",
+ "llvm.fma.f32", "llvm.fma.f64",
+ "llvm.log.f32", "llvm.log.f64",
+ "llvm.log10.f32", "llvm.log10.f64",
+ "llvm.nearbyint.f32", "llvm.nearbyint.f64",
+ "llvm.pow.f32", "llvm.pow.f64",
+ "llvm.powi.f32", "llvm.powi.f64",
+ "llvm.rint.f32", "llvm.rint.f64",
+ "llvm.round.f32", "llvm.round.f64",
+ "llvm.sin.f32", "llvm.sin.f64",
+ "llvm.sqrt.f32", "llvm.sqrt.f64",
+ "llvm.trunc.f32", "llvm.trunc.f64",
+ };
+
+static bool isIntrinsicInline(Function *F) {
+ return std::binary_search(
+ IntrinsicInline, array_endof(IntrinsicInline),
+ F->getName());
+}
+//
// Returns of float, double and complex need to be handled with a helper
// function.
//
@@ -372,7 +404,7 @@ static bool fixupFPReturnAndCall
// helper functions
if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
Function *F_ = CI->getCalledFunction();
- if (F_ && needsFPHelperFromSig(*F_)) {
+ if (F_ && !isIntrinsicInline(F_) && needsFPHelperFromSig(*F_)) {
assureFPCallStub(*F_, M, Subtarget);
Modified=true;
}
@@ -390,7 +422,7 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
std::string Name = F->getName();
std::string SectionName = ".mips16.fn." + Name;
std::string StubName = "__fn_stub_" + Name;
- std::string LocalName = "__fn_local_" + Name;
+ std::string LocalName = "$$__fn_local_" + Name;
Function *FStub = Function::Create
(F->getFunctionType(),
Function::InternalLinkage, StubName, M);
@@ -405,19 +437,36 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
IAH.Out(" .set macro");
if (PicMode) {
IAH.Out(".set noreorder");
- IAH.Out(".cpload $$2");
+ IAH.Out(".cpload $$25");
IAH.Out(".set reorder");
IAH.Out(".reloc 0,R_MIPS_NONE," + Name);
IAH.Out("la $$25," + LocalName);
}
- else
- IAH.Out("la $$25, " + Name);
+ else {
+ IAH.Out(".set reorder");
+ IAH.Out("la $$25," + Name);
+ }
swapFPIntParams(PV, M, IAH, LE, false);
IAH.Out("jr $$25");
IAH.Out(LocalName + " = " + Name);
new UnreachableInst(FStub->getContext(), BB);
}
+//
+// remove the use-soft-float attribute
+//
+static void removeUseSoftFloat(Function &F) {
+ AttributeSet A;
+ DEBUG(errs() << "removing -use-soft-float\n");
+ A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+ "use-soft-float", "false");
+ F.removeAttributes(AttributeSet::FunctionIndex, A);
+ if (F.hasFnAttribute("use-soft-float")) {
+ DEBUG(errs() << "still has -use-soft-float\n");
+ }
+ F.addAttributes(AttributeSet::FunctionIndex, A);
+}
+
namespace llvm {
//
@@ -441,6 +490,11 @@ bool Mips16HardFloat::runOnModule(Module &M) {
DEBUG(errs() << "Run on Module Mips16HardFloat\n");
bool Modified = false;
for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->hasFnAttribute("nomips16") &&
+ F->hasFnAttribute("use-soft-float")) {
+ removeUseSoftFloat(*F);
+ continue;
+ }
if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
F->hasFnAttribute("nomips16")) continue;
Modified |= fixupFPReturnAndCall(*F, &M, Subtarget);
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 0caa277..4948f40 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -80,10 +80,11 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
V1 = RegInfo.createVirtualRegister(RC);
V2 = RegInfo.createVirtualRegister(RC);
- BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
- .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
- BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
- .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+ BuildMI(MBB, I, DL, TII.get(Mips::GotPrologue16), V0).
+ addReg(V1, RegState::Define).
+ addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI).
+ addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+
BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
.addReg(V1).addReg(V2);
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 6ed1d9e..61d8bb8 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -90,6 +90,7 @@ static const Mips16Libcall HardFloatLibCalls[] = {
};
static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
+ {"__fixunsdfsi", "__mips16_call_stub_2" },
{"ceil", "__mips16_call_stub_df_2"},
{"ceilf", "__mips16_call_stub_sf_1"},
{"copysign", "__mips16_call_stub_df_10"},
@@ -144,6 +145,11 @@ Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
computeRegisterProperties();
}
@@ -168,57 +174,57 @@ Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case Mips::SelBneZ:
return emitSel16(Mips::BnezRxImm16, MI, BB);
case Mips::SelTBteqZCmpi:
- return emitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB);
+ return emitSeliT16(Mips::Bteqz16, Mips::CmpiRxImmX16, MI, BB);
case Mips::SelTBteqZSlti:
- return emitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB);
+ return emitSeliT16(Mips::Bteqz16, Mips::SltiRxImmX16, MI, BB);
case Mips::SelTBteqZSltiu:
- return emitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB);
+ return emitSeliT16(Mips::Bteqz16, Mips::SltiuRxImmX16, MI, BB);
case Mips::SelTBtneZCmpi:
- return emitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB);
+ return emitSeliT16(Mips::Btnez16, Mips::CmpiRxImmX16, MI, BB);
case Mips::SelTBtneZSlti:
- return emitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB);
+ return emitSeliT16(Mips::Btnez16, Mips::SltiRxImmX16, MI, BB);
case Mips::SelTBtneZSltiu:
- return emitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB);
+ return emitSeliT16(Mips::Btnez16, Mips::SltiuRxImmX16, MI, BB);
case Mips::SelTBteqZCmp:
- return emitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+ return emitSelT16(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
case Mips::SelTBteqZSlt:
- return emitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+ return emitSelT16(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
case Mips::SelTBteqZSltu:
- return emitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+ return emitSelT16(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
case Mips::SelTBtneZCmp:
- return emitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+ return emitSelT16(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
case Mips::SelTBtneZSlt:
- return emitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+ return emitSelT16(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
case Mips::SelTBtneZSltu:
- return emitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+ return emitSelT16(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
case Mips::BteqzT8CmpX16:
- return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+ return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
case Mips::BteqzT8SltX16:
- return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+ return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
case Mips::BteqzT8SltuX16:
// TBD: figure out a way to get this or remove the instruction
// altogether.
- return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+ return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
case Mips::BtnezT8CmpX16:
- return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+ return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
case Mips::BtnezT8SltX16:
- return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+ return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
case Mips::BtnezT8SltuX16:
// TBD: figure out a way to get this or remove the instruction
// altogether.
- return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+ return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
- Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+ Mips::Bteqz16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
- Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+ Mips::Bteqz16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
- Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+ Mips::Bteqz16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
- Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
+ Mips::Btnez16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
- Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
+ Mips::Btnez16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
- Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
+ Mips::Btnez16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
break;
case Mips::SltCCRxRy16:
return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
@@ -418,6 +424,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
SelectionDAG &DAG = CLI.DAG;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
const char* Mips16HelperFunction = 0;
bool NeedMips16Helper = false;
@@ -473,7 +481,10 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
if (NeedMips16Helper) {
RegsToPass.push_front(std::make_pair(V0Reg, Callee));
JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
- JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT);
+ ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
+ JumpTarget = getAddrGlobal(S, JumpTarget.getValueType(), DAG,
+ MipsII::MO_GOT, Chain,
+ FuncInfo->callPtrInfo(S->getSymbol()));
} else
RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
}
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 05e70ab..000ea28 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -10,7 +10,6 @@
// This file contains the Mips16 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
-#include <stdio.h>
#include "Mips16InstrInfo.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MipsMachineFunction.h"
@@ -20,10 +19,12 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include <cctype>
using namespace llvm;
@@ -36,7 +37,7 @@ static cl::opt<bool> NeverUseSaveRestore(
Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
- : MipsInstrInfo(tm, Mips::BimmX16),
+ : MipsInstrInfo(tm, Mips::Bimm16),
RI(*tm.getSubtargetImpl()) {}
const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
@@ -77,11 +78,11 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (Mips::GPR32RegClass.contains(DestReg) &&
Mips::CPU16RegsRegClass.contains(SrcReg))
Opc = Mips::Move32R16;
- else if ((SrcReg == Mips::HI) &&
+ else if ((SrcReg == Mips::HI0) &&
(Mips::CPU16RegsRegClass.contains(DestReg)))
Opc = Mips::Mfhi16, SrcReg = 0;
- else if ((SrcReg == Mips::LO) &&
+ else if ((SrcReg == Mips::LO0) &&
(Mips::CPU16RegsRegClass.contains(DestReg)))
Opc = Mips::Mflo16, SrcReg = 0;
@@ -151,13 +152,17 @@ unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
default: llvm_unreachable("Illegal opcode!");
case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+ case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
+ case Mips::BnezRxImm16: return Mips::BeqzRxImm16;
case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+ case Mips::Btnez16: return Mips::Bteqz16;
case Mips::BtnezX16: return Mips::BteqzX16;
case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+ case Mips::Bteqz16: return Mips::Btnez16;
case Mips::BteqzX16: return Mips::BtnezX16;
case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
@@ -439,6 +444,9 @@ Mips16InstrInfo::basicLoadImmediate(
unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 ||
+ Opc == Mips::Bimm16 ||
+ Opc == Mips::Bteqz16 || Opc == Mips::Btnez16 ||
+ Opc == Mips::BeqzRxImm16 || Opc == Mips::BnezRxImm16 ||
Opc == Mips::BnezRxImmX16 || Opc == Mips::BteqzX16 ||
Opc == Mips::BteqzT8CmpX16 || Opc == Mips::BteqzT8CmpiX16 ||
Opc == Mips::BteqzT8SltX16 || Opc == Mips::BteqzT8SltuX16 ||
@@ -473,7 +481,6 @@ const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) {
return new Mips16InstrInfo(TM);
}
-#include <stdio.h>
bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
int64_t Amount) {
switch (Opcode) {
@@ -493,6 +500,49 @@ bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
return isInt<16>(Amount);
return isInt<15>(Amount);
}
- printf("Unexpected opcode %i \n", Opcode);
llvm_unreachable("unexpected Opcode in validImmediate");
}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// We implement the special case of the .space directive taking only an
+/// integer argument, which is the size in bytes. This is used for creating
+/// inline code spacing for testing purposes using inline assembly.
+///
+unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const {
+
+ // Count the number of instructions in the asm.
+ bool atInsnStart = true;
+ unsigned Length = 0;
+ for (; *Str; ++Str) {
+ if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+ strlen(MAI.getSeparatorString())) == 0)
+ atInsnStart = true;
+ if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+ if (strncmp(Str, ".space", 6)==0) {
+ char *EStr; int Sz;
+ Sz = strtol(Str+6, &EStr, 10);
+ while (isspace(*EStr)) ++EStr;
+ if (*EStr=='\0') {
+ DEBUG(dbgs() << "parsed .space " << Sz << '\n');
+ return Sz;
+ }
+ }
+ Length += MAI.getMaxInstLength();
+ atInsnStart = false;
+ }
+ if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+ strlen(MAI.getCommentString())) == 0)
+ atInsnStart = false;
+ }
+
+ return Length;
+}
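For a sense of how the .space special case above gets exercised: each inline-asm blob is measured on its own, so a blob containing only ".space 64" is sized as exactly 64 bytes while an ordinary instruction is charged MAI.getMaxInstLength() bytes. A hypothetical test-style fragment (the function name and constants are illustrative only):

// GNU-style inline assembly; compiles with clang/gcc targeting MIPS.
extern "C" void padded_region() {
  asm volatile(".space 64"); // measured as exactly 64 bytes
  asm volatile("nop");       // measured as one maximum-length instruction
}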
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 118d258..d9a594b 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -108,6 +108,8 @@ public:
void BuildAddiuSpImm
(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
+ unsigned getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const;
private:
virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index aef4e92..7441c78 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -32,6 +32,16 @@ def mem16_ea : Operand<i32> {
}
//
+// I-type instruction format
+//
+// This is only used by bimm. The actual assembly value is a 12-bit signed
+// number.
+//
+class FI16_ins<bits<5> op, string asmstr, InstrItinClass itin>:
+ FI16<op, (outs), (ins brtarget:$imm16),
+ !strconcat(asmstr, "\t$imm16 # 16 bit inst"), [], itin>;
+
+//
//
// I8 instruction format
//
@@ -41,7 +51,10 @@ class FI816_ins_base<bits<3> _func, string asmstr,
FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
[], itin>;
-
+class FI816_ins<bits<3> _func, string asmstr,
+ InstrItinClass itin>:
+ FI816_ins_base<_func, asmstr, "\t$imm # 16 bit inst", itin>;
+
class FI816_SP_ins<bits<3> _func, string asmstr,
InstrItinClass itin>:
FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
@@ -60,6 +73,11 @@ class FRI16_ins<bits<5> op, string asmstr,
InstrItinClass itin>:
FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+class FRI16_TCP_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+ !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin>;
+
class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
InstrItinClass itin>:
FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
@@ -172,6 +190,11 @@ class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
FEXT_RI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm),
!strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+class FEXT_RI16_TCP_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+ !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
class FEXT_2RI16_ins<bits<5> _op, string asmstr,
InstrItinClass itin>:
FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
@@ -220,7 +243,7 @@ class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
// EXT-SHIFT instruction format
//
class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
- FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
+ FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa),
!strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
//
@@ -343,6 +366,14 @@ class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
!strconcat(asmstr, "\t $rx"), [], itin> ;
+class FRR_SF16_ins
+ <bits<5> _funct, bits<3> _subfunc,
+ string asmstr, InstrItinClass itin>:
+ FRR_SF16<_funct, _subfunc, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_),
+ !strconcat(asmstr, "\t $rx"),
+ [], itin> {
+ let Constraints = "$rx_ = $rx";
+ }
//
// RRR-type instruction format
//
@@ -447,7 +478,7 @@ def Constant32:
MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>;
def LwConstant32:
- MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm),
+ MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm, imm32:$constid),
"lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
@@ -559,6 +590,14 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
//
def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+//
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch (Extended)
+// To do an unconditional PC-relative branch.
+//
+
+def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16;
+
// Format: B offset MIPS16e
// Purpose: Unconditional Branch
// To do an unconditional PC-relative branch.
@@ -591,6 +630,10 @@ def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>;
// Purpose: Branch on T Equal to Zero (Extended)
// To test special register T then do a PC-relative conditional branch.
//
+def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+ let Uses = [T8];
+}
+
def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
let Uses = [T8];
}
@@ -614,6 +657,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
// Purpose: Branch on T Not Equal to Zero (Extended)
// To test special register T then do a PC-relative conditional branch.
//
+
+def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 {
+ let Uses = [T8];
+}
+
def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 {
let Uses = [T8];
}
@@ -665,7 +713,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
// To divide 32-bit signed integers.
//
def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
- let Defs = [HI, LO];
+ let Defs = [HI0, LO0];
}
//
@@ -674,7 +722,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
// To divide 32-bit unsigned integers.
//
def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
- let Defs = [HI, LO];
+ let Defs = [HI0, LO0];
}
//
// Format: JAL target MIPS16e
@@ -684,10 +732,7 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
//
def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
- let isBranch = 1;
let hasDelaySlot = 0; // not true, but we add the nop for now
- let isTerminator=1;
- let isBarrier=1;
let isCall=1;
}
@@ -771,6 +816,10 @@ def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>;
//
def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> {
+ let isCodeGenOnly = 1;
+}
+
//
// Format: LW ry, offset(rx) MIPS16e
// Purpose: Load Word (Extended)
@@ -784,10 +833,13 @@ def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{
// Purpose: Load Word (SP-Relative, Extended)
// To load an SP-relative word from memory as a signed value.
//
-def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10110, "lw", IILoad>, MayLoad{
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", IILoad>, MayLoad{
let Uses = [SP];
}
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
+
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
//
// Format: MOVE r32, rz MIPS16e
// Purpose: Move
@@ -808,7 +860,7 @@ def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
// To copy the special purpose HI register to a GPR.
//
def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
- let Uses = [HI];
+ let Uses = [HI0];
let neverHasSideEffects = 1;
}
@@ -818,7 +870,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
// To copy the special purpose LO register to a GPR.
//
def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
- let Uses = [LO];
+ let Uses = [LO0];
let neverHasSideEffects = 1;
}
@@ -828,13 +880,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
def MultRxRy16: FMULT16_ins<"mult", IIAlu> {
let isCommutable = 1;
let neverHasSideEffects = 1;
- let Defs = [HI, LO];
+ let Defs = [HI0, LO0];
}
def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
let isCommutable = 1;
let neverHasSideEffects = 1;
- let Defs = [HI, LO];
+ let Defs = [HI0, LO0];
}
//
@@ -845,7 +897,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
let isCommutable = 1;
let neverHasSideEffects = 1;
- let Defs = [HI, LO];
+ let Defs = [HI0, LO0];
}
//
@@ -856,7 +908,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
let isCommutable = 1;
let neverHasSideEffects = 1;
- let Defs = [HI, LO];
+ let Defs = [HI0, LO0];
}
//
@@ -951,6 +1003,22 @@ def SbRxRyOffMemX16:
FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIStore>, MayStore;
//
+// Format: SEB rx MIPS16e
+// Purpose: Sign-Extend Byte
+// Sign-extend least significant byte in register rx.
+//
+def SebRx16
+ : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>;
+
+//
+// Format: SEH rx MIPS16e
+// Purpose: Sign-Extend Halfword
+// Sign-extend least significant word in register rx.
+//
+def SehRx16
+ : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>;
+
+//
// The Sel(T) instructions are pseudos
// T means that they use T8 implicitly.
//
@@ -1075,7 +1143,7 @@ def ShRxRyOffMemX16:
//
// Format: SLL rx, ry, sa MIPS16e
// Purpose: Shift Word Left Logical (Extended)
-// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits.
+// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
//
def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
@@ -1171,7 +1239,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
// Format: SRA rx, ry, sa MIPS16e
// Purpose: Shift Word Right Arithmetic (Extended)
// To execute an arithmetic right-shift of a word by a fixed
-// number of bits—1 to 8 bits.
+// number of bits-1 to 8 bits.
//
def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
@@ -1189,7 +1257,7 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
// Format: SRL rx, ry, sa MIPS16e
// Purpose: Shift Word Right Logical (Extended)
// To execute a logical right-shift of a word by a fixed
-// number of bits—1 to 31 bits.
+// number of bits-1 to 31 bits.
//
def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
@@ -1330,9 +1398,7 @@ def: Mips16Pat<(i32 addr16:$addr),
// Large (>16 bit) immediate loads
-def : Mips16Pat<(i32 imm:$imm),
- (OrRxRxRy16 (SllX16 (LiRxImmX16 (HI16 imm:$imm)), 16),
- (LiRxImmX16 (LO16 imm:$imm)))>;
+def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
// Carry MipsPatterns
def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
@@ -1373,7 +1439,7 @@ def: Mips16Pat
def: Mips16Pat
<(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
- (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16)
+ (BeqzRxImm16 CPU16Regs:$rx, bb:$targ16)
>;
//
@@ -1435,7 +1501,7 @@ def: Mips16Pat
def: Mips16Pat
<(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
- (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+ (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
>;
//
@@ -1443,7 +1509,7 @@ def: Mips16Pat
//
def: Mips16Pat
<(brcond CPU16Regs:$rx, bb:$targ16),
- (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+ (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
>;
//
@@ -1473,7 +1539,7 @@ def: Mips16Pat
// (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
// >;
-def: UncondBranch16_pat<br, BimmX16>;
+def: UncondBranch16_pat<br, Bimm16>;
// Small immediates
def: Mips16Pat<(i32 immSExt16:$in),
@@ -1787,7 +1853,8 @@ def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
(AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
// hi/lo relocs
-
+def : Mips16Pat<(MipsHi tblockaddress:$in),
+ (SllX16 (LiRxImmX16 tblockaddress:$in), 16)>;
def : Mips16Pat<(MipsHi tglobaladdr:$in),
(SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
def : Mips16Pat<(MipsHi tjumptable:$in),
@@ -1795,6 +1862,8 @@ def : Mips16Pat<(MipsHi tjumptable:$in),
def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
(SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+
// wrapper_pic
class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
Mips16Pat<(MipsWrapper RC:$gp, node:$in),
@@ -1811,3 +1880,30 @@ def : Mips16Pat<(i32 (extloadi16 addr16:$src)),
def: Mips16Pat<(trap), (Break16)>;
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i8),
+ (SebRx16 CPU16Regs:$val)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i16),
+ (SehRx16 CPU16Regs:$val)>;
+
+def GotPrologue16:
+ MipsPseudo16<
+ (outs CPU16Regs:$rh, CPU16Regs:$rl),
+ (ins simm16:$immHi, simm16:$immLo),
+ ".align 2\n\tli\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+ // let PrintMethod = "printCPInstOperand";
+}
+
+// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+// the function. The first operand is the ID# for this instruction, the second
+// is the index into the MachineConstantPool that this is, the third is the
+// size in bytes of this constant pool entry.
+//
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+ i32imm:$size), "foo", []>;
+
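For orientation, the three operands declared above are consumed later in this patch by MipsAsmPrinter::EmitInstruction: operand 0 is the label ID, operand 1 the MachineConstantPool index, operand 2 the size in bytes. A standalone sketch of that decoding, using a made-up struct rather than the real MachineInstr API:

#include <cstdio>

// Hypothetical stand-in for a CONSTPOOL_ENTRY machine instruction; the real
// code reads the same three fields via MI->getOperand(0..2).
struct ConstPoolEntryMI {
  unsigned LabelId;  // operand 0: ID# used to form the $CPI label
  unsigned CPIdx;    // operand 1: index into the MachineConstantPool
  unsigned Size;     // operand 2: size of this entry in bytes
};

static void emitEntry(const ConstPoolEntryMI &MI) {
  std::printf("$CPI_%u: pool entry %u, %u bytes\n", MI.LabelId, MI.CPIdx, MI.Size);
}

int main() {
  ConstPoolEntryMI MI = {0, 0, 4};  // e.g. a 4-byte f32 constant
  emitEntry(MI);
  return 0;
}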
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index a752ab8..15ef654 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -15,9 +15,6 @@
// Mips Operand, Complex Patterns and Transformations Definitions.
//===----------------------------------------------------------------------===//
-// Instruction operand types
-def shamt_64 : Operand<i64>;
-
// Unsigned Operand
def uimm16_64 : Operand<i64> {
let PrintMethod = "printUnsignedImm";
@@ -34,36 +31,21 @@ def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
-let DecoderNamespace = "Mips64" in {
-
-multiclass Atomic2Ops64<PatFrag Op> {
- def NAME : Atomic2Ops<Op, GPR64, GPR32>, Requires<[NotN64, HasStdEnc]>;
- def _P8 : Atomic2Ops<Op, GPR64, GPR64>, Requires<[IsN64, HasStdEnc]>;
-}
-
-multiclass AtomicCmpSwap64<PatFrag Op> {
- def NAME : AtomicCmpSwap<Op, GPR64, GPR32>,
- Requires<[NotN64, HasStdEnc]>;
- def _P8 : AtomicCmpSwap<Op, GPR64, GPR64>,
- Requires<[IsN64, HasStdEnc]>;
-}
-}
-let usesCustomInserter = 1, Predicates = [HasStdEnc],
- DecoderNamespace = "Mips64" in {
- defm ATOMIC_LOAD_ADD_I64 : Atomic2Ops64<atomic_load_add_64>;
- defm ATOMIC_LOAD_SUB_I64 : Atomic2Ops64<atomic_load_sub_64>;
- defm ATOMIC_LOAD_AND_I64 : Atomic2Ops64<atomic_load_and_64>;
- defm ATOMIC_LOAD_OR_I64 : Atomic2Ops64<atomic_load_or_64>;
- defm ATOMIC_LOAD_XOR_I64 : Atomic2Ops64<atomic_load_xor_64>;
- defm ATOMIC_LOAD_NAND_I64 : Atomic2Ops64<atomic_load_nand_64>;
- defm ATOMIC_SWAP_I64 : Atomic2Ops64<atomic_swap_64>;
- defm ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap64<atomic_cmp_swap_64>;
+let usesCustomInserter = 1 in {
+ def ATOMIC_LOAD_ADD_I64 : Atomic2Ops<atomic_load_add_64, GPR64>;
+ def ATOMIC_LOAD_SUB_I64 : Atomic2Ops<atomic_load_sub_64, GPR64>;
+ def ATOMIC_LOAD_AND_I64 : Atomic2Ops<atomic_load_and_64, GPR64>;
+ def ATOMIC_LOAD_OR_I64 : Atomic2Ops<atomic_load_or_64, GPR64>;
+ def ATOMIC_LOAD_XOR_I64 : Atomic2Ops<atomic_load_xor_64, GPR64>;
+ def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
+ def ATOMIC_SWAP_I64 : Atomic2Ops<atomic_swap_64, GPR64>;
+ def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
}
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1 in {
- defm LOAD_AC128 : LoadM<"", ACRegs128>;
- defm STORE_AC128 : StoreM<"", ACRegs128>;
+ def LOAD_ACC128 : Load<"", ACC128>;
+ def STORE_ACC128 : Store<"", ACC128>;
}
//===----------------------------------------------------------------------===//
@@ -110,103 +92,101 @@ def NOR64 : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
}
/// Shift Instructions
-def DSLL : shift_rotate_imm<"dsll", shamt, GPR64Opnd, shl, immZExt6>,
+def DSLL : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, shl, immZExt6>,
SRA_FM<0x38, 0>;
-def DSRL : shift_rotate_imm<"dsrl", shamt, GPR64Opnd, srl, immZExt6>,
+def DSRL : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, srl, immZExt6>,
SRA_FM<0x3a, 0>;
-def DSRA : shift_rotate_imm<"dsra", shamt, GPR64Opnd, sra, immZExt6>,
+def DSRA : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, sra, immZExt6>,
SRA_FM<0x3b, 0>;
def DSLLV : shift_rotate_reg<"dsllv", GPR64Opnd, shl>, SRLV_FM<0x14, 0>;
def DSRLV : shift_rotate_reg<"dsrlv", GPR64Opnd, srl>, SRLV_FM<0x16, 0>;
def DSRAV : shift_rotate_reg<"dsrav", GPR64Opnd, sra>, SRLV_FM<0x17, 0>;
-def DSLL32 : shift_rotate_imm<"dsll32", shamt, GPR64Opnd>, SRA_FM<0x3c, 0>;
-def DSRL32 : shift_rotate_imm<"dsrl32", shamt, GPR64Opnd>, SRA_FM<0x3e, 0>;
-def DSRA32 : shift_rotate_imm<"dsra32", shamt, GPR64Opnd>, SRA_FM<0x3f, 0>;
+def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd>, SRA_FM<0x3c, 0>;
+def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 0>;
+def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd>, SRA_FM<0x3f, 0>;
// Rotate Instructions
let Predicates = [HasMips64r2, HasStdEnc] in {
- def DROTR : shift_rotate_imm<"drotr", shamt, GPR64Opnd, rotr, immZExt6>,
+ def DROTR : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, rotr, immZExt6>,
SRA_FM<0x3a, 1>;
def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, rotr>,
SRLV_FM<0x16, 1>;
+ def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 1>;
}
/// Load and Store Instructions
/// aligned
let isCodeGenOnly = 1 in {
-defm LB64 : LoadM<"lb", GPR64Opnd, sextloadi8, IILoad>, LW_FM<0x20>;
-defm LBu64 : LoadM<"lbu", GPR64Opnd, zextloadi8, IILoad>, LW_FM<0x24>;
-defm LH64 : LoadM<"lh", GPR64Opnd, sextloadi16, IILoad>, LW_FM<0x21>;
-defm LHu64 : LoadM<"lhu", GPR64Opnd, zextloadi16, IILoad>, LW_FM<0x25>;
-defm LW64 : LoadM<"lw", GPR64Opnd, sextloadi32, IILoad>, LW_FM<0x23>;
-defm SB64 : StoreM<"sb", GPR64Opnd, truncstorei8, IIStore>, LW_FM<0x28>;
-defm SH64 : StoreM<"sh", GPR64Opnd, truncstorei16, IIStore>, LW_FM<0x29>;
-defm SW64 : StoreM<"sw", GPR64Opnd, truncstorei32, IIStore>, LW_FM<0x2b>;
+def LB64 : Load<"lb", GPR64Opnd, sextloadi8, IILoad>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, IILoad>, LW_FM<0x24>;
+def LH64 : Load<"lh", GPR64Opnd, sextloadi16, IILoad>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, IILoad>, LW_FM<0x25>;
+def LW64 : Load<"lw", GPR64Opnd, sextloadi32, IILoad>, LW_FM<0x23>;
+def SB64 : Store<"sb", GPR64Opnd, truncstorei8, IIStore>, LW_FM<0x28>;
+def SH64 : Store<"sh", GPR64Opnd, truncstorei16, IIStore>, LW_FM<0x29>;
+def SW64 : Store<"sw", GPR64Opnd, truncstorei32, IIStore>, LW_FM<0x2b>;
}
-defm LWu : LoadM<"lwu", GPR64Opnd, zextloadi32, IILoad>, LW_FM<0x27>;
-defm LD : LoadM<"ld", GPR64Opnd, load, IILoad>, LW_FM<0x37>;
-defm SD : StoreM<"sd", GPR64Opnd, store, IIStore>, LW_FM<0x3f>;
+def LWu : Load<"lwu", GPR64Opnd, zextloadi32, IILoad>, LW_FM<0x27>;
+def LD : Load<"ld", GPR64Opnd, load, IILoad>, LW_FM<0x37>;
+def SD : Store<"sd", GPR64Opnd, store, IIStore>, LW_FM<0x3f>;
/// load/store left/right
let isCodeGenOnly = 1 in {
-defm LWL64 : LoadLeftRightM<"lwl", MipsLWL, GPR64Opnd>, LW_FM<0x22>;
-defm LWR64 : LoadLeftRightM<"lwr", MipsLWR, GPR64Opnd>, LW_FM<0x26>;
-defm SWL64 : StoreLeftRightM<"swl", MipsSWL, GPR64Opnd>, LW_FM<0x2a>;
-defm SWR64 : StoreLeftRightM<"swr", MipsSWR, GPR64Opnd>, LW_FM<0x2e>;
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, IILoad>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, IILoad>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, IIStore>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, IIStore>, LW_FM<0x2e>;
}
-defm LDL : LoadLeftRightM<"ldl", MipsLDL, GPR64Opnd>, LW_FM<0x1a>;
-defm LDR : LoadLeftRightM<"ldr", MipsLDR, GPR64Opnd>, LW_FM<0x1b>;
-defm SDL : StoreLeftRightM<"sdl", MipsSDL, GPR64Opnd>, LW_FM<0x2c>;
-defm SDR : StoreLeftRightM<"sdr", MipsSDR, GPR64Opnd>, LW_FM<0x2d>;
+def LDL : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, IILoad>, LW_FM<0x1a>;
+def LDR : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, IILoad>, LW_FM<0x1b>;
+def SDL : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, IIStore>, LW_FM<0x2c>;
+def SDR : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, IIStore>, LW_FM<0x2d>;
/// Load-linked, Store-conditional
-let Predicates = [NotN64, HasStdEnc] in {
- def LLD : LLBase<"lld", GPR64Opnd, mem>, LW_FM<0x34>;
- def SCD : SCBase<"scd", GPR64Opnd, mem>, LW_FM<0x3c>;
-}
-
-let Predicates = [IsN64, HasStdEnc], isCodeGenOnly = 1 in {
- def LLD_P8 : LLBase<"lld", GPR64Opnd, mem64>, LW_FM<0x34>;
- def SCD_P8 : SCBase<"scd", GPR64Opnd, mem64>, LW_FM<0x3c>;
-}
+def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>;
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>;
/// Jump and Branch Instructions
let isCodeGenOnly = 1 in {
-def JR64 : IndirectBranch<GPR64Opnd>, MTLO_FM<8>;
-def BEQ64 : CBranch<"beq", seteq, GPR64Opnd>, BEQ_FM<4>;
-def BNE64 : CBranch<"bne", setne, GPR64Opnd>, BEQ_FM<5>;
-def BGEZ64 : CBranchZero<"bgez", setge, GPR64Opnd>, BGEZ_FM<1, 1>;
-def BGTZ64 : CBranchZero<"bgtz", setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
-def BLEZ64 : CBranchZero<"blez", setle, GPR64Opnd>, BGEZ_FM<6, 0>;
-def BLTZ64 : CBranchZero<"bltz", setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+def JR64 : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
+def BEQ64 : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+def BNE64 : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
-def TAILCALL64_R : JumpFR<GPR64Opnd, MipsTailCall>, MTLO_FM<8>, IsTailCall;
+def TAILCALL64_R : JumpFR<"tcallr", GPR64Opnd, MipsTailCall>,
+ MTLO_FM<8>, IsTailCall;
}
/// Multiply and Divide Instructions.
-def DMULT : Mult<"dmult", IIImult, GPR64Opnd, [HI64, LO64]>,
+def DMULT : Mult<"dmult", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
MULT_FM<0, 0x1c>;
-def DMULTu : Mult<"dmultu", IIImult, GPR64Opnd, [HI64, LO64]>,
+def DMULTu : Mult<"dmultu", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
MULT_FM<0, 0x1d>;
-def PseudoDMULT : MultDivPseudo<DMULT, ACRegs128, GPR64Opnd, MipsMult,
+def PseudoDMULT : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
IIImult>;
-def PseudoDMULTu : MultDivPseudo<DMULTu, ACRegs128, GPR64Opnd, MipsMultu,
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
IIImult>;
-def DSDIV : Div<"ddiv", IIIdiv, GPR64Opnd, [HI64, LO64]>, MULT_FM<0, 0x1e>;
-def DUDIV : Div<"ddivu", IIIdiv, GPR64Opnd, [HI64, LO64]>, MULT_FM<0, 0x1f>;
-def PseudoDSDIV : MultDivPseudo<DSDIV, ACRegs128, GPR64Opnd, MipsDivRem,
+def DSDIV : Div<"ddiv", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1e>;
+def DUDIV : Div<"ddivu", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1f>;
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
IIIdiv, 0, 1, 1>;
-def PseudoDUDIV : MultDivPseudo<DUDIV, ACRegs128, GPR64Opnd, MipsDivRemU,
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
IIIdiv, 0, 1, 1>;
let isCodeGenOnly = 1 in {
-def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI64]>, MTLO_FM<0x11>;
-def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO64]>, MTLO_FM<0x13>;
-def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, [HI64]>, MFLO_FM<0x10>;
-def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, [LO64]>, MFLO_FM<0x12>;
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>;
/// Sign Ext In Register Instructions.
def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd>, SEB_FM<0x10, 0x20>;
@@ -221,21 +201,18 @@ def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>;
def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>;
def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>;
-def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd, mem_ea_64>, LW_FM<0x19>;
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
let isCodeGenOnly = 1 in
-def RDHWR64 : ReadHardware<GPR64Opnd, HW64RegsOpnd>, RDHWR_FM;
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
-def DEXT : ExtBase<"dext", GPR64Opnd>, EXT_FM<3>;
-let Pattern = []<dag> in {
- def DEXTU : ExtBase<"dextu", GPR64Opnd>, EXT_FM<2>;
- def DEXTM : ExtBase<"dextm", GPR64Opnd>, EXT_FM<1>;
-}
-def DINS : InsBase<"dins", GPR64Opnd>, EXT_FM<7>;
-let Pattern = []<dag> in {
- def DINSU : InsBase<"dinsu", GPR64Opnd>, EXT_FM<6>;
- def DINSM : InsBase<"dinsm", GPR64Opnd>, EXT_FM<5>;
-}
+def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>;
+def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>;
+def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>;
+
+def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>;
+def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>;
+def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
@@ -251,18 +228,12 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
//===----------------------------------------------------------------------===//
// extended loads
-let Predicates = [NotN64, HasStdEnc] in {
+let Predicates = [HasStdEnc] in {
def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
}
-let Predicates = [IsN64, HasStdEnc] in {
- def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64_P8 addr:$src)>;
-}
// hi/lo relocs
def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
@@ -316,7 +287,7 @@ defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
// truncate
def : MipsPat<(i32 (trunc GPR64:$src)),
(SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>,
- Requires<[IsN64, HasStdEnc]>;
+ Requires<[HasStdEnc]>;
// 32-to-64-bit extension
def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
@@ -330,10 +301,6 @@ def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
// bswap MipsPattern
def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
-// mflo/hi patterns.
-def : MipsPat<(i64 (ExtractLOHI ACRegs128:$ac, imm:$lohi_idx)),
- (EXTRACT_SUBREG ACRegs128:$ac, imm:$lohi_idx)>;
-
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
@@ -348,28 +315,16 @@ def : InstAlias<"dadd $rs, $rt, $imm",
0>;
/// Move between CPU and coprocessor registers
-let DecoderNamespace = "Mips64" in {
-def DMFC0_3OP64 : MFC3OP<(outs GPR64Opnd:$rt),
- (ins GPR64Opnd:$rd, uimm16:$sel),
- "dmfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 1>;
-def DMTC0_3OP64 : MFC3OP<(outs GPR64Opnd:$rd, uimm16:$sel),
- (ins GPR64Opnd:$rt),
- "dmtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 5>;
-def DMFC2_3OP64 : MFC3OP<(outs GPR64Opnd:$rt),
- (ins GPR64Opnd:$rd, uimm16:$sel),
- "dmfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 1>;
-def DMTC2_3OP64 : MFC3OP<(outs GPR64Opnd:$rd, uimm16:$sel),
- (ins GPR64Opnd:$rt),
- "dmtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 5>;
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
+def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>;
+def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>;
}
// Two operand (implicit 0 selector) versions:
-def : InstAlias<"dmfc0 $rt, $rd",
- (DMFC0_3OP64 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmtc0 $rt, $rd",
- (DMTC0_3OP64 GPR64Opnd:$rd, 0, GPR64Opnd:$rt), 0>;
-def : InstAlias<"dmfc2 $rt, $rd",
- (DMFC2_3OP64 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmtc2 $rt, $rd",
- (DMTC2_3OP64 GPR64Opnd:$rd, 0, GPR64Opnd:$rt), 0>;
+def : InstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h
index a094dda..cc09034 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.h
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -22,7 +22,7 @@ namespace llvm {
};
typedef SmallVector<Inst, 7 > InstSeq;
- /// Analyze - Get an instrucion sequence to load immediate Imm. The last
+ /// Analyze - Get an instruction sequence to load immediate Imm. The last
/// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
/// true;
const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
@@ -32,19 +32,19 @@ namespace llvm {
/// AddInstr - Add I to all instruction sequences in SeqLs.
void AddInstr(InstSeqLs &SeqLs, const Inst &I);
- /// GetInstSeqLsADDiu - Get instrucion sequences which end with an ADDiu to
+ /// GetInstSeqLsADDiu - Get instruction sequences which end with an ADDiu to
/// load immediate Imm
void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
- /// GetInstSeqLsORi - Get instrucion sequences which end with an ORi to
+ /// GetInstSeqLsORi - Get instruction sequences which end with an ORi to
/// load immediate Imm
void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
- /// GetInstSeqLsSLL - Get instrucion sequences which end with a SLL to
+ /// GetInstSeqLsSLL - Get instruction sequences which end with a SLL to
/// load immediate Imm
void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
- /// GetInstSeqLs - Get instrucion sequences to load immediate Imm.
+ /// GetInstSeqLs - Get instruction sequences to load immediate Imm.
void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
/// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
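These helpers build sequences such as LUi+ORi (or ADDiu+SLL+ORi) by splitting an immediate into 16-bit halves. A small sketch of the underlying arithmetic, with hypothetical hi16/lo16 helper names:

#include <cassert>
#include <cstdint>

// Split a 32-bit immediate the way a "load upper" + "or lower" pair would.
static uint32_t hi16(uint32_t Imm) { return Imm >> 16; }
static uint32_t lo16(uint32_t Imm) { return Imm & 0xffff; }

int main() {
  uint32_t Imm = 0x12345678u;
  // lui-style materialization: place hi16 in the upper half, OR in lo16.
  uint32_t Materialized = (hi16(Imm) << 16) | lo16(Imm);
  assert(Materialized == Imm);
  return 0;
}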
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 1dc3326..45c4398 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,11 +15,11 @@
#define DEBUG_TYPE "mips-asm-printer"
#include "InstPrinter/MipsInstPrinter.h"
#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsELFStreamer.h"
#include "Mips.h"
#include "MipsAsmPrinter.h"
#include "MipsInstrInfo.h"
#include "MipsMCInstLower.h"
+#include "MipsTargetStreamer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
@@ -33,8 +33,8 @@
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/TargetRegistry.h"
@@ -45,12 +45,17 @@
using namespace llvm;
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
+ return static_cast<MipsTargetStreamer &>(OutStreamer.getTargetStreamer());
+}
+
bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Initialize TargetLoweringObjectFile.
if (Subtarget->allowMixed16_32())
const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
.Initialize(OutContext, TM);
MipsFI = MF.getInfo<MipsFunctionInfo>();
+ MCP = MF.getConstantPool();
AsmPrinter::runOnMachineFunction(MF);
return true;
}
@@ -71,6 +76,39 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ // If we just ended a constant pool, mark it as such.
+ if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+ InConstantPool = false;
+ }
+ if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+ // CONSTPOOL_ENTRY - This instruction represents a floating
+ // constant pool in the function. The first operand is the ID#
+ // for this instruction, the second is the index into the
+ // MachineConstantPool that this is, the third is the size in
+ // bytes of this constant pool entry.
+ // The required alignment is specified on the basic block holding this MI.
+ //
+ unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+ unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
+
+ // If this is the first entry of the pool, mark it.
+ if (!InConstantPool) {
+ OutStreamer.EmitDataRegion(MCDR_DataRegion);
+ InConstantPool = true;
+ }
+
+ OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+ const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+ if (MCPE.isMachineConstantPoolEntry())
+ EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ else
+ EmitGlobalConstant(MCPE.Val.ConstVal);
+ return;
+ }
+
+
MachineBasicBlock::const_instr_iterator I = MI;
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -238,16 +276,15 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
}
if (Subtarget->inMicroMipsMode())
- if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
- MES->emitMipsSTOCG(*Subtarget, CurrentFnSym,
- (unsigned)ELF::STO_MIPS_MICROMIPS);
+ getTargetStreamer().emitMipsHackSTOCG(CurrentFnSym,
+ (unsigned)ELF::STO_MIPS_MICROMIPS);
OutStreamer.EmitLabel(CurrentFnSym);
}
/// EmitFunctionBodyStart - Targets can override this to emit stuff before
/// the first basic block in the function.
void MipsAsmPrinter::EmitFunctionBodyStart() {
- MCInstLowering.Initialize(Mang, &MF->getContext());
+ MCInstLowering.Initialize(&MF->getContext());
bool IsNakedFunction =
MF->getFunction()->
@@ -284,6 +321,12 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() {
}
OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
}
+ // Make sure to terminate any constant pools that were at the end
+ // of the function.
+ if (!InConstantPool)
+ return;
+ InConstantPool = false;
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
}
/// isBlockOnlyReachableByFallthough - Return true if the basic block has
@@ -418,6 +461,11 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return false;
}
}
+ case 'w':
+ // Print MSA registers for the 'f' constraint
+ // In LLVM, the 'w' modifier doesn't need to do anything.
+ // We can just call printOperand as normal.
+ break;
}
}
@@ -485,7 +533,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
return;
case MachineOperand::MO_GlobalAddress:
- O << *Mang->getSymbol(MO.getGlobal());
+ O << *getSymbol(MO.getGlobal());
break;
case MachineOperand::MO_BlockAddress: {
@@ -526,6 +574,15 @@ void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
printOperand(MI, opNum, O);
}
+void MipsAsmPrinter::printUnsignedImm8(const MachineInstr *MI, int opNum,
+ raw_ostream &O) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ if (MO.isImm())
+ O << (unsigned short int)(unsigned char)MO.getImm();
+ else
+ printOperand(MI, opNum, O);
+}
+
void MipsAsmPrinter::
printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
// Load/Store memory operands -- imm($reg)
@@ -587,15 +644,54 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
}
-void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+static void emitELFHeaderFlagsCG(MipsTargetStreamer &TargetStreamer,
+ const MipsSubtarget &Subtarget) {
+ // Update e_header flags
+ unsigned EFlags = 0;
- if (OutStreamer.hasRawTextSupport()) return;
+ // TODO: Need to add -mabicalls and -mno-abicalls flags.
+ // Currently we assume that -mabicalls is the default.
+ EFlags |= ELF::EF_MIPS_CPIC;
+ if (Subtarget.inMips16Mode())
+ EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
+ else
+ EFlags |= ELF::EF_MIPS_NOREORDER;
+
+ // Architecture
+ if (Subtarget.hasMips64r2())
+ EFlags |= ELF::EF_MIPS_ARCH_64R2;
+ else if (Subtarget.hasMips64())
+ EFlags |= ELF::EF_MIPS_ARCH_64;
+ else if (Subtarget.hasMips32r2())
+ EFlags |= ELF::EF_MIPS_ARCH_32R2;
+ else
+ EFlags |= ELF::EF_MIPS_ARCH_32;
+
+ if (Subtarget.inMicroMipsMode())
+ EFlags |= ELF::EF_MIPS_MICROMIPS;
+
+ // ABI
+ if (Subtarget.isABI_O32())
+ EFlags |= ELF::EF_MIPS_ABI_O32;
+
+ // Relocation Model
+ Reloc::Model RM = Subtarget.getRelocationModel();
+ if (RM == Reloc::PIC_ || RM == Reloc::Default)
+ EFlags |= ELF::EF_MIPS_PIC;
+ else if (RM == Reloc::Static)
+ ; // Do nothing for Reloc::Static
+ else
+ llvm_unreachable("Unsupported relocation model for e_flags");
+
+ TargetStreamer.emitMipsHackELFFlags(EFlags);
+}
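To trace the logic above for one concrete configuration (MIPS32r2, O32 ABI, PIC, neither MIPS16 nor microMIPS), here is a standalone sketch; the bit values are placeholders, not the real ELF e_flags encodings:

#include <cstdio>

// Placeholder flag bits for illustration only (not the llvm::ELF values).
enum : unsigned {
  CPIC = 1u << 0, NOREORDER = 1u << 1, ARCH_32R2 = 1u << 2,
  ABI_O32 = 1u << 3, PIC = 1u << 4
};

int main() {
  bool InMips16 = false, HasMips32r2 = true, IsO32 = true, IsPIC = true;
  unsigned EFlags = CPIC;                 // -mabicalls assumed the default
  if (!InMips16) EFlags |= NOREORDER;     // MIPS16 would set ARCH_ASE_M16 instead
  if (HasMips32r2) EFlags |= ARCH_32R2;
  if (IsO32) EFlags |= ABI_O32;
  if (IsPIC) EFlags |= PIC;
  std::printf("e_flags = %#x\n", EFlags); // CPIC|NOREORDER|ARCH_32R2|ABI_O32|PIC
  return 0;
}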
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
// Emit Mips ELF register info
Subtarget->getMReginfo().emitMipsReginfoSectionCG(
OutStreamer, getObjFileLowering(), *Subtarget);
- if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
- MES->emitELFHeaderFlagsCG(*Subtarget);
+ emitELFHeaderFlagsCG(getTargetStreamer(), *Subtarget);
}
void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 4d1d624..11c6acd 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -25,10 +25,12 @@ namespace llvm {
class MCStreamer;
class MachineInstr;
class MachineBasicBlock;
+class MipsTargetStreamer;
class Module;
class raw_ostream;
class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+ MipsTargetStreamer &getTargetStreamer();
void EmitInstrWithMacroNoAT(const MachineInstr *MI);
@@ -40,6 +42,16 @@ private:
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+ /// MCP - Keep a pointer to constantpool entries of the current
+ /// MachineFunction.
+ const MachineConstantPool *MCP;
+
+ /// InConstantPool - Maintain state when emitting a sequence of constant
+ /// pool entries so we can properly mark them as data regions.
+ bool InConstantPool;
+
+ bool UsingConstantPools;
+
public:
const MipsSubtarget *Subtarget;
@@ -47,8 +59,11 @@ public:
MipsMCInstLower MCInstLowering;
explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer), MCInstLowering(*this) {
+ : AsmPrinter(TM, Streamer), MCP(0), InConstantPool(false),
+ MCInstLowering(*this) {
Subtarget = &TM.getSubtarget<MipsSubtarget>();
+ UsingConstantPools =
+ (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
}
virtual const char *getPassName() const {
@@ -57,6 +72,12 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual void EmitConstantPool() LLVM_OVERRIDE {
+ if (!UsingConstantPools)
+ AsmPrinter::EmitConstantPool();
+ // we emit constant pools ourselves.
+ }
+
void EmitInstruction(const MachineInstr *MI);
void printSavedRegsBitmask(raw_ostream &O);
void printHex32(unsigned int Value, raw_ostream &O);
@@ -75,6 +96,7 @@ public:
raw_ostream &O);
void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+ void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O);
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index ac40b11..66391cb 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -26,8 +26,10 @@ def RetCC_MipsO32 : CallingConv<[
// f32 are returned in registers F0, F2
CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
- // f64 are returned in register D0, D1
- CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0, D1]>>>
+ // f64 arguments are returned in D0_64 and D1_64 in FP64bit mode or
+ // in D0 and D1 in FP32bit mode.
+ CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D1_64]>>>,
+ CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()", CCAssignToReg<[D0, D1]>>>
]>;
//===----------------------------------------------------------------------===//
@@ -149,7 +151,16 @@ def RetCC_MipsEABI : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_MipsO32_FastCC : CallingConv<[
// f64 arguments are passed in double-precision floating point registers.
- CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>,
+ CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()",
+ CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7,
+ D8, D9]>>>,
+ CCIfType<[f64], CCIfSubtarget<"isFP64bit()",
+ CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
+ D4_64, D5_64, D6_64, D7_64,
+ D8_64, D9_64, D10_64, D11_64,
+ D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64,
+ D19_64]>>>,
// Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
CCIfType<[f64], CCAssignToStack<8, 8>>
@@ -224,6 +235,9 @@ def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
(sequence "S%u", 7, 0))>;
+def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP,
+ (sequence "S%u", 7, 0))>;
+
def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64,
D23_64, D22_64, D21_64, RA_64, FP_64, GP_64,
(sequence "S%u_64", 7, 0))>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 813037e..ca4163d 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -105,11 +105,16 @@ private:
const MachineOperand &MO) const;
unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getJumpTargetOpValueMM(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getBranchTargetOpValueMM(const MachineInstr &MI,
+ unsigned OpNo) const;
unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
+ unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc,
int Offset) const;
@@ -186,6 +191,18 @@ unsigned MipsCodeEmitter::getJumpTargetOpValue(const MachineInstr &MI,
return 0;
}
+unsigned MipsCodeEmitter::getJumpTargetOpValueMM(const MachineInstr &MI,
+ unsigned OpNo) const {
+ llvm_unreachable("Unimplemented function.");
+ return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI,
+ unsigned OpNo) const {
+ llvm_unreachable("Unimplemented function.");
+ return 0;
+}
+
unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
unsigned OpNo) const {
MachineOperand MO = MI.getOperand(OpNo);
@@ -201,6 +218,12 @@ unsigned MipsCodeEmitter::getMemEncoding(const MachineInstr &MI,
return (getMachineOpValue(MI, MI.getOperand(OpNo+1)) & 0xFFFF) | RegBits;
}
+unsigned MipsCodeEmitter::getMemEncodingMMImm12(const MachineInstr &MI,
+ unsigned OpNo) const {
+ llvm_unreachable("Unimplemented function.");
+ return 0;
+}
+
unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
unsigned OpNo) const {
// size is encoded as size-1.
@@ -214,6 +237,12 @@ unsigned MipsCodeEmitter::getSizeInsEncoding(const MachineInstr &MI,
getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
}
+unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
+ unsigned OpNo) const {
+ llvm_unreachable("Unimplemented function.");
+ return 0;
+}
+
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
@@ -316,6 +345,14 @@ bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI,
BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO)
.addReg(Mips::ZERO).addImm(0);
break;
+ case Mips::B:
+ BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BEQ)).addReg(Mips::ZERO)
+ .addReg(Mips::ZERO).addOperand(MI->getOperand(0));
+ break;
+ case Mips::TRAP:
+ BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BREAK)).addImm(0)
+ .addImm(0);
+ break;
case Mips::JALRPseudo:
BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA)
.addReg(MI->getOperand(0).getReg());
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 39862b3..2de1430 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -19,7 +19,7 @@
class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
InstrItinClass Itin> :
InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
- !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR> {
+ !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR, opstr> {
let Constraints = "$F = $rd";
}
@@ -37,7 +37,7 @@ class CMov_F_I_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
InstSE<(outs RC:$rd), (ins RC:$rs, FCCRegsOpnd:$fcc, RC:$F),
!strconcat(opstr, "\t$rd, $rs, $fcc"),
[(set RC:$rd, (OpNode RC:$rs, FCCRegsOpnd:$fcc, RC:$F))],
- Itin, FrmFR> {
+ Itin, FrmFR, opstr> {
let Constraints = "$F = $rd";
}
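The "$F = $rd" constraint exists because these are conditional moves: the destination keeps its previous value when the condition fails, so the old value has to be passed in as the $F operand. A minimal C++ model of the movz/movn semantics (not the instruction definitions themselves):

#include <cassert>
#include <cstdint>

// movz rd, rs, rt : rd = (rt == 0) ? rs : rd (old value supplied as $F)
static uint32_t movz(uint32_t rs, uint32_t rt, uint32_t rd_old) {
  return rt == 0 ? rs : rd_old;
}
// movn rd, rs, rt : rd = (rt != 0) ? rs : rd
static uint32_t movn(uint32_t rs, uint32_t rt, uint32_t rd_old) {
  return rt != 0 ? rs : rd_old;
}

int main() {
  assert(movz(7, 0, 42) == 7);   // condition holds, take rs
  assert(movz(7, 1, 42) == 42);  // condition fails, keep old rd
  assert(movn(7, 1, 42) == 7);
  assert(movn(7, 0, 42) == 42);
  return 0;
}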
@@ -103,91 +103,94 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
}
// Instantiation of instructions.
-def MOVZ_I_I : CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, NoItinerary>,
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, IIArith>,
ADD_FM<0, 0xa>;
let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
- def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd,
- NoItinerary>, ADD_FM<0, 0xa>;
- def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM<0, 0xa>;
- def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd,
- NoItinerary>, ADD_FM<0, 0xa>;
+ def MOVZ_I_I64 : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, IIArith>,
+ ADD_FM<0, 0xa>;
+ def MOVZ_I64_I : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, IIArith>,
+ ADD_FM<0, 0xa>;
+ def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, IIArith>,
+ ADD_FM<0, 0xa>;
}
-def MOVN_I_I : CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM<0, 0xb>;
+def MOVN_I_I : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, IIArith>,
+ ADD_FM<0, 0xb>;
let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
- def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd,
- NoItinerary>, ADD_FM<0, 0xb>;
- def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd,
- NoItinerary>, ADD_FM<0, 0xb>;
- def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd,
- NoItinerary>, ADD_FM<0, 0xb>;
+ def MOVN_I_I64 : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, IIArith>,
+ ADD_FM<0, 0xb>;
+ def MOVN_I64_I : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, IIArith>,
+ ADD_FM<0, 0xb>;
+ def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, IIArith>,
+ ADD_FM<0, 0xb>;
}
-def MOVZ_I_S : CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVZ_I_S : CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, IIFmove>,
CMov_I_F_FM<18, 16>;
let isCodeGenOnly = 1 in
-def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, IIFmove>,
CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]>;
-def MOVN_I_S : CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVN_I_S : CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, IIFmove>,
CMov_I_F_FM<19, 16>;
let isCodeGenOnly = 1 in
-def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32RegsOpnd, IIFmove>,
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, IIFmove>,
CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]>;
let Predicates = [NotFP64bit, HasStdEnc] in {
- def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64RegsOpnd, IIFmove>,
+ def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
CMov_I_F_FM<18, 17>;
- def MOVN_I_D32 : CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64RegsOpnd, IIFmove>,
+ def MOVN_I_D32 : CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
CMov_I_F_FM<19, 17>;
}
-let Predicates = [IsFP64bit, HasStdEnc], isCodeGenOnly = 1 in {
- def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64RegsOpnd, IIFmove>,
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+ def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, IIFmove>,
CMov_I_F_FM<18, 17>;
- def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64RegsOpnd,
- IIFmove>, CMov_I_F_FM<18, 17>;
- def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64RegsOpnd, IIFmove>,
+ def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, IIFmove>,
CMov_I_F_FM<19, 17>;
- def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64RegsOpnd,
- IIFmove>, CMov_I_F_FM<19, 17>;
+ let isCodeGenOnly = 1 in {
+ def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd,
+ IIFmove>, CMov_I_F_FM<18, 17>;
+ def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd,
+ IIFmove>, CMov_I_F_FM<19, 17>;
+ }
}
-def MOVT_I : CMov_F_I_FT<"movt", GPR32Opnd, IIArith, MipsCMovFP_T>,
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIArith, MipsCMovFP_T>,
CMov_F_I_FM<1>;
let isCodeGenOnly = 1 in
def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, IIArith, MipsCMovFP_T>,
CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]>;
-def MOVF_I : CMov_F_I_FT<"movf", GPR32Opnd, IIArith, MipsCMovFP_F>,
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIArith, MipsCMovFP_F>,
CMov_F_I_FM<0>;
let isCodeGenOnly = 1 in
def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, IIArith, MipsCMovFP_F>,
CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]>;
-def MOVT_S : CMov_F_F_FT<"movt.s", FGR32RegsOpnd, IIFmove, MipsCMovFP_T>,
+def MOVT_S : CMov_F_F_FT<"movt.s", FGR32Opnd, IIFmove, MipsCMovFP_T>,
CMov_F_F_FM<16, 1>;
-def MOVF_S : CMov_F_F_FT<"movf.s", FGR32RegsOpnd, IIFmove, MipsCMovFP_F>,
+def MOVF_S : CMov_F_F_FT<"movf.s", FGR32Opnd, IIFmove, MipsCMovFP_F>,
CMov_F_F_FM<16, 0>;
let Predicates = [NotFP64bit, HasStdEnc] in {
- def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64RegsOpnd, IIFmove, MipsCMovFP_T>,
+ def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64Opnd, IIFmove, MipsCMovFP_T>,
CMov_F_F_FM<17, 1>;
- def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64RegsOpnd, IIFmove, MipsCMovFP_F>,
+ def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64Opnd, IIFmove, MipsCMovFP_F>,
CMov_F_F_FM<17, 0>;
}
-let Predicates = [IsFP64bit, HasStdEnc], isCodeGenOnly = 1 in {
- def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64RegsOpnd, IIFmove, MipsCMovFP_T>,
+
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+ def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, IIFmove, MipsCMovFP_T>,
CMov_F_F_FM<17, 1>;
- def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64RegsOpnd, IIFmove, MipsCMovFP_F>,
+ def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, IIFmove, MipsCMovFP_F>,
CMov_F_F_FM<17, 0>;
}
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index bda0167..c46bbac 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -9,9 +9,7 @@
//
//
// This pass is used to make PC-relative loads of constants.
-// For now, only Mips16 will use this. While it has the same name and
-// uses many ideas from the LLVM ARM Constant Island Pass, it's not intended
-// to reuse any of the code from the ARM version.
+// For now, only Mips16 will use this.
//
// Loading constants inline is expensive on Mips16 and it's in general better
// to place the constant nearby in code space and then it can be loaded with a
@@ -27,31 +25,244 @@
#include "Mips.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
using namespace llvm;
+STATISTIC(NumCPEs, "Number of constpool entries");
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
+ cl::desc("Align constant islands in code"));
+
+
+// Rather than running make-check tests with huge amounts of code, we force
+// the test to use this amount.
+//
+static cl::opt<int> ConstantIslandsSmallOffset(
+ "mips-constant-islands-small-offset",
+ cl::init(0),
+ cl::desc("Make small offsets be this amount for testing purposes"),
+ cl::Hidden);
+
+//
+// For testing purposes we tell it to not use relaxed load forms so that it
+// will split blocks.
+//
+static cl::opt<bool> NoLoadRelaxation(
+ "mips-constant-islands-no-load-relaxation",
+ cl::init(false),
+ cl::desc("Don't relax loads to long loads - for testing purposes"),
+ cl::Hidden);
+
+
namespace {
+
+
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
+ /// MipsConstantIslands - Due to limited PC-relative displacements, Mips
+ /// requires constant pool entries to be scattered among the instructions
+ /// inside a function. To do this, it completely ignores the normal LLVM
+ /// constant pool; instead, it places constants wherever it feels like with
+ /// special instructions.
+ ///
+ /// The terminology used in this pass includes:
+ /// Islands - Clumps of constants placed in the function.
+ /// Water - Potential places where an island could be formed.
+ /// CPE - A constant pool entry that has been placed somewhere, which
+ /// tracks a list of users.
+
class MipsConstantIslands : public MachineFunctionPass {
+ /// BasicBlockInfo - Information about the offset and size of a single
+ /// basic block.
+ struct BasicBlockInfo {
+ /// Offset - Distance from the beginning of the function to the beginning
+ /// of this basic block.
+ ///
+ /// Offsets are computed assuming worst case padding before an aligned
+ /// block. This means that subtracting basic block offsets always gives a
+ /// conservative estimate of the real distance which may be smaller.
+ ///
+ /// Because worst case padding is used, the computed offset of an aligned
+ /// block may not actually be aligned.
+ unsigned Offset;
+
+ /// Size - Size of the basic block in bytes. If the block contains
+ /// inline assembly, this is a worst case estimate.
+ ///
+ /// The size does not include any alignment padding whether from the
+ /// beginning of the block, or from an aligned jump table at the end.
+ unsigned Size;
+
+ // FIXME: ignore LogAlign for this patch
+ //
+ unsigned postOffset(unsigned LogAlign = 0) const {
+ unsigned PO = Offset + Size;
+ return PO;
+ }
+
+ BasicBlockInfo() : Offset(0), Size(0) {}
+
+ };
+
+ std::vector<BasicBlockInfo> BBInfo;
+
+ /// WaterList - A sorted list of basic blocks where islands could be placed
+ /// (i.e. blocks that don't fall through to the following block, due
+ /// to a return, unreachable, or unconditional branch).
+ std::vector<MachineBasicBlock*> WaterList;
+
+ /// NewWaterList - The subset of WaterList that was created since the
+ /// previous iteration by inserting unconditional branches.
+ SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+ typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+ /// CPUser - One user of a constant pool, keeping the machine instruction
+ /// pointer, the constant pool being referenced, and the max displacement
+ /// allowed from the instruction to the CP. The HighWaterMark records the
+ /// highest basic block where a new CPEntry can be placed. To ensure this
+ /// pass terminates, the CP entries are initially placed at the end of the
+ /// function and then move monotonically to lower addresses. The
+ /// exception to this rule is when the current CP entry for a particular
+ /// CPUser is out of range, but there is another CP entry for the same
+ /// constant value in range. We want to use the existing in-range CP
+ /// entry, but if it later moves out of range, the search for new water
+ /// should resume where it left off. The HighWaterMark is used to record
+ /// that point.
+ struct CPUser {
+ MachineInstr *MI;
+ MachineInstr *CPEMI;
+ MachineBasicBlock *HighWaterMark;
+ private:
+ unsigned MaxDisp;
+ unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
+ // with different displacements
+ unsigned LongFormOpcode;
+ public:
+ bool NegOk;
+ CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+ bool neg,
+ unsigned longformmaxdisp, unsigned longformopcode)
+ : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp),
+ LongFormMaxDisp(longformmaxdisp), LongFormOpcode(longformopcode),
+ NegOk(neg){
+ HighWaterMark = CPEMI->getParent();
+ }
+ /// getMaxDisp - Returns the maximum displacement supported by MI.
+ unsigned getMaxDisp() const {
+ unsigned xMaxDisp = ConstantIslandsSmallOffset?
+ ConstantIslandsSmallOffset: MaxDisp;
+ return xMaxDisp;
+ }
+ void setMaxDisp(unsigned val) {
+ MaxDisp = val;
+ }
+ unsigned getLongFormMaxDisp() const {
+ return LongFormMaxDisp;
+ }
+ unsigned getLongFormOpcode() const {
+ return LongFormOpcode;
+ }
+ };
+
+ /// CPUsers - Keep track of all of the machine instructions that use various
+ /// constant pools and their max displacement.
+ std::vector<CPUser> CPUsers;
+
+ /// CPEntry - One per constant pool entry, keeping the machine instruction
+ /// pointer, the constpool index, and the number of CPUser's which
+ /// reference this entry.
+ struct CPEntry {
+ MachineInstr *CPEMI;
+ unsigned CPI;
+ unsigned RefCount;
+ CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+ : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+ };
+
+ /// CPEntries - Keep track of all of the constant pool entry machine
+ /// instructions. For each original constpool index (i.e. those that
+ /// existed upon entry to this pass), it keeps a vector of entries.
+ /// Original elements are cloned as we go along; the clones are
+ /// put in the vector of the original element, but have distinct CPIs.
+ std::vector<std::vector<CPEntry> > CPEntries;
+
+ /// ImmBranch - One per immediate branch, keeping the machine instruction
+ /// pointer, conditional or unconditional, the max displacement,
+ /// and (if isCond is true) the corresponding unconditional branch
+ /// opcode.
+ struct ImmBranch {
+ MachineInstr *MI;
+ unsigned MaxDisp : 31;
+ bool isCond : 1;
+ int UncondBr;
+ ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+ : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+ };
+
+ /// ImmBranches - Keep track of all the immediate branch instructions.
+ ///
+ std::vector<ImmBranch> ImmBranches;
+
+ /// HasFarJump - True if any far jump instruction has been emitted during
+ /// the branch fix up pass.
+ bool HasFarJump;
+
+ const TargetMachine &TM;
+ bool IsPIC;
+ unsigned ABI;
+ const MipsSubtarget *STI;
+ const Mips16InstrInfo *TII;
+ MipsFunctionInfo *MFI;
+ MachineFunction *MF;
+ MachineConstantPool *MCP;
+
+ unsigned PICLabelUId;
+ bool PrescannedForConstants;
+
+ void initPICLabelUId(unsigned UId) {
+ PICLabelUId = UId;
+ }
+
+
+ unsigned createPICLabelUId() {
+ return PICLabelUId++;
+ }
+
public:
static char ID;
MipsConstantIslands(TargetMachine &tm)
: MachineFunctionPass(ID), TM(tm),
IsPIC(TM.getRelocationModel() == Reloc::PIC_),
- ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()) {}
+ ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+ STI(&TM.getSubtarget<MipsSubtarget>()), MF(0), MCP(0),
+ PrescannedForConstants(false){}
virtual const char *getPassName() const {
return "Mips Constant Islands";
@@ -59,26 +270,1264 @@ namespace {
bool runOnMachineFunction(MachineFunction &F);
+ void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+ CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+ unsigned getCPELogAlign(const MachineInstr *CPEMI);
+ void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+ unsigned getOffsetOf(MachineInstr *MI) const;
+ unsigned getUserOffset(CPUser&) const;
+ void dumpBBs();
+ void verify();
+
+ bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+ unsigned Disp, bool NegativeOK);
+ bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+ const CPUser &U);
+
+ bool isLongFormOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+ const CPUser &U);
+
+ void computeBlockSize(MachineBasicBlock *MBB);
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+ void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+ void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+ bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+ int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+ int findLongFormInRangeCPEntry(CPUser& U, unsigned UserOffset);
+ bool findAvailableWater(CPUser&U, unsigned UserOffset,
+ water_iterator &WaterIter);
+ void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+ MachineBasicBlock *&NewMBB);
+ bool handleConstantPoolUser(unsigned CPUserIndex);
+ void removeDeadCPEMI(MachineInstr *CPEMI);
+ bool removeUnusedCPEntries();
+ bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+ bool DoDump = false);
+ bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+ CPUser &U, unsigned &Growth);
+ bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+ bool fixupImmediateBr(ImmBranch &Br);
+ bool fixupConditionalBr(ImmBranch &Br);
+ bool fixupUnconditionalBr(ImmBranch &Br);
+
+ void prescanForConstants();
+
private:
- const TargetMachine &TM;
- bool IsPIC;
- unsigned ABI;
+
};
char MipsConstantIslands::ID = 0;
} // end of anonymous namespace
+
+bool MipsConstantIslands::isLongFormOffsetInRange
+ (unsigned UserOffset, unsigned TrialOffset,
+ const CPUser &U) {
+ return isOffsetInRange(UserOffset, TrialOffset,
+ U.getLongFormMaxDisp(), U.NegOk);
+}
+
+bool MipsConstantIslands::isOffsetInRange
+ (unsigned UserOffset, unsigned TrialOffset,
+ const CPUser &U) {
+ return isOffsetInRange(UserOffset, TrialOffset,
+ U.getMaxDisp(), U.NegOk);
+}
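The four-argument isOffsetInRange overload is not part of this hunk, so the following is only a guess at the usual shape of such a check (forward displacements up to MaxDisp, backward ones only when NegOk), shown as a standalone sketch:

#include <cassert>

// Guess at the unshown four-argument check: a trial (CPE) offset is reachable
// from a user offset when the forward distance fits in MaxDisp, or, if NegOk,
// when the backward distance does.
static bool offsetInRange(unsigned UserOffset, unsigned TrialOffset,
                          unsigned MaxDisp, bool NegOk) {
  if (TrialOffset >= UserOffset)
    return TrialOffset - UserOffset <= MaxDisp;
  return NegOk && UserOffset - TrialOffset <= MaxDisp;
}

int main() {
  assert(offsetInRange(100, 160, 64, false));   // 60 bytes forward, fits
  assert(!offsetInRange(100, 40, 64, false));   // backward, not allowed
  assert(offsetInRange(100, 40, 64, true));     // backward, allowed with NegOk
  return 0;
}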
+/// print block size and offset information - debugging
+void MipsConstantIslands::dumpBBs() {
+ DEBUG({
+ for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+ const BasicBlockInfo &BBI = BBInfo[J];
+ dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+ << format(" size=%#x\n", BBInfo[J].Size);
+ }
+ });
+}
/// createMipsConstantIslandPass - Returns a pass that places constant pool
/// entries (islands) close to their Mips16 users.
FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
return new MipsConstantIslands(tm);
}
-bool MipsConstantIslands::runOnMachineFunction(MachineFunction &F) {
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// The intention is for this to be a mips16 only pass for now
// FIXME:
- // if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode())
- // return false;
+ MF = &mf;
+ MCP = mf.getConstantPool();
+ DEBUG(dbgs() << "constant island machine function " << "\n");
+ if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode() ||
+ !MipsSubtarget::useConstantIslands()) {
+ return false;
+ }
+ TII = (const Mips16InstrInfo*)MF->getTarget().getInstrInfo();
+ MFI = MF->getInfo<MipsFunctionInfo>();
+ DEBUG(dbgs() << "constant island processing " << "\n");
+ //
+ // will need to make a predetermination of whether there are any constants
+ // we need to put in constant islands. TBD.
+ //
+ if (!PrescannedForConstants) prescanForConstants();
+
+ HasFarJump = false;
+ // This pass invalidates liveness information when it splits basic blocks.
+ MF->getRegInfo().invalidateLiveness();
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ MF->RenumberBlocks();
+
+ bool MadeChange = false;
+
+ // Perform the initial placement of the constant pool entries. To start with,
+ // we put them all at the end of the function.
+ std::vector<MachineInstr*> CPEMIs;
+ if (!MCP->isEmpty())
+ doInitialPlacement(CPEMIs);
+
+ /// The next UID to take is the first unused one.
+ initPICLabelUId(CPEMIs.size());
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block, the location of all the water, and finding all of the
+ // constant pool users.
+ initializeFunctionInfo(CPEMIs);
+ CPEMIs.clear();
+ DEBUG(dumpBBs());
+
+ /// Remove dead constant pool entries.
+ MadeChange |= removeUnusedCPEntries();
+
+ // Iteratively place constant pool entries and fix up branches until there
+ // is no change.
+ unsigned NoCPIters = 0, NoBRIters = 0;
+ (void)NoBRIters;
+ while (true) {
+ DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ bool CPChange = false;
+ for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+ CPChange |= handleConstantPoolUser(i);
+ if (CPChange && ++NoCPIters > 30)
+ report_fatal_error("Constant Island pass failed to converge!");
+ DEBUG(dumpBBs());
+
+ // Clear NewWaterList now. If we split a block for branches, it should
+ // appear as "new water" for the next iteration of constant pool placement.
+ NewWaterList.clear();
+
+ DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ bool BRChange = false;
+ for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+ BRChange |= fixupImmediateBr(ImmBranches[i]);
+ if (BRChange && ++NoBRIters > 30)
+ report_fatal_error("Branch Fix Up pass failed to converge!");
+ DEBUG(dumpBBs());
+ if (!CPChange && !BRChange)
+ break;
+ MadeChange = true;
+ }
+
+ DEBUG(dbgs() << '\n'; dumpBBs());
+
+ BBInfo.clear();
+ WaterList.clear();
+ CPUsers.clear();
+ CPEntries.clear();
+ ImmBranches.clear();
+ return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries. To start with, we put them all at the end of the function.
+void
+MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+ // Create the basic block to hold the CPE's.
+ MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+ MF->push_back(BB);
+
+
+ // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+ unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+ // Mark the basic block as required by the const-pool.
+ // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+ BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+ // The function needs to be as aligned as the basic blocks. The linker may
+ // move functions around based on their alignment.
+ MF->ensureAlignment(BB->getAlignment());
+
+ // Order the entries in BB by descending alignment. That ensures correct
+ // alignment of all entries as long as BB is sufficiently aligned. Keep
+ // track of the insertion point for each alignment. We are going to bucket
+ // sort the entries as they are created.
+ SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+ // Add all of the constants from the constant pool to the end block, use an
+ // identity mapping of CPI's to CPE's.
+ const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+ const DataLayout &TD = *MF->getTarget().getDataLayout();
+ for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+ unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+ assert(Size >= 4 && "Too small constant pool entry");
+ unsigned Align = CPs[i].getAlignment();
+ assert(isPowerOf2_32(Align) && "Invalid alignment");
+ // Verify that all constant pool entries are a multiple of their alignment.
+ // If not, we would have to pad them out so that instructions stay aligned.
+ assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+ // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+ unsigned LogAlign = Log2_32(Align);
+ MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+ MachineInstr *CPEMI =
+ BuildMI(*BB, InsAt, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+ .addImm(i).addConstantPoolIndex(i).addImm(Size);
+
+ CPEMIs.push_back(CPEMI);
+
+ // Ensure that future entries with higher alignment get inserted before
+ // CPEMI. This is bucket sort with iterators.
+ for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+ if (InsPoint[a] == InsAt)
+ InsPoint[a] = CPEMI;
+ // Add a new CPEntry, but no corresponding CPUser yet.
+ std::vector<CPEntry> CPEs;
+ CPEs.push_back(CPEntry(CPEMI, i));
+ CPEntries.push_back(CPEs);
+ ++NumCPEs;
+ DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+ << Size << ", align = " << Align <<'\n');
+ }
+ DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fall
+/// through into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+ // Get the next machine basic block in the function.
+ MachineFunction::iterator MBBI = MBB;
+ // Can't fall off end of function.
+ if (llvm::next(MBBI) == MBB->getParent()->end())
+ return false;
+
+ MachineBasicBlock *NextBB = llvm::next(MBBI);
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end(); I != E; ++I)
+ if (*I == NextBB)
+ return true;
+
return false;
}
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+MipsConstantIslands::CPEntry
+*MipsConstantIslands::findConstPoolEntry(unsigned CPI,
+ const MachineInstr *CPEMI) {
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ // Number of entries per constpool index should be small, just do a
+ // linear search.
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ if (CPEs[i].CPEMI == CPEMI)
+ return &CPEs[i];
+ }
+ return NULL;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI. Alignment is measured in log2(bytes) units.
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+ assert(CPEMI && CPEMI->getOpcode() == Mips::CONSTPOOL_ENTRY);
+
+ // Everything is 4-byte aligned unless AlignConstantIslands is set.
+ if (!AlignConstantIslands)
+ return 2;
+
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+ unsigned Align = MCP->getConstants()[CPI].getAlignment();
+ assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+ return Log2_32(Align);
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void MipsConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+ BBInfo.clear();
+ BBInfo.resize(MF->getNumBlockIDs());
+
+ // First thing, compute the size of all basic blocks, and see if the function
+ // has any inline assembly in it. If so, we have to be conservative about
+ // alignment assumptions, as we don't know for sure the size of any
+ // instructions in the inline assembly.
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+ computeBlockSize(I);
+
+ // Compute block offsets.
+ adjustBBOffsetsAfter(MF->begin());
+
+ // Now go back through the instructions and build up our data structures.
+ for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock &MBB = *MBBI;
+
+ // If this block doesn't fall through into the next MBB, then this is
+ // 'water' where a constant pool island could be placed.
+ if (!BBHasFallthrough(&MBB))
+ WaterList.push_back(&MBB);
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ int Opc = I->getOpcode();
+ if (I->isBranch()) {
+ bool isCond = false;
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ int UOpc = Opc;
+ switch (Opc) {
+ default:
+ continue; // Ignore other branches for now
+ case Mips::Bimm16:
+ Bits = 11;
+ Scale = 2;
+ isCond = false;
+ break;
+ case Mips::BimmX16:
+ Bits = 16;
+ Scale = 2;
+ isCond = false;
+ }
+ // Record this immediate branch.
+ unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
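+ // e.g. Bimm16: Bits = 11, Scale = 2 => MaxOffs = ((1 << 10) - 1) * 2 = 2046;
+ // BimmX16: Bits = 16, Scale = 2 => MaxOffs = ((1 << 15) - 1) * 2 = 65534.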
+ ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+ }
+
+ if (Opc == Mips::CONSTPOOL_ENTRY)
+ continue;
+
+ // Scan the instructions for constant pool operands.
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+ if (I->getOperand(op).isCPI()) {
+
+ // We found one. The addressing mode tells us the max displacement
+ // from the PC that this instruction permits.
+
+ // Basic size info comes from the TSFlags field.
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ bool NegOk = false;
+ unsigned LongFormBits = 0;
+ unsigned LongFormScale = 0;
+ unsigned LongFormOpcode = 0;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown addressing mode for CP reference!");
+ case Mips::LwRxPcTcp16:
+ Bits = 8;
+ Scale = 4;
+ LongFormOpcode = Mips::LwRxPcTcpX16;
+ LongFormBits = 16;
+ LongFormScale = 1;
+ break;
+ case Mips::LwRxPcTcpX16:
+ Bits = 16;
+ Scale = 1;
+ NegOk = true;
+ break;
+ }
+ // Remember that this is a user of a CP entry.
+ unsigned CPI = I->getOperand(op).getIndex();
+ MachineInstr *CPEMI = CPEMIs[CPI];
+ unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+ unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
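+ // e.g. LwRxPcTcp16: Bits = 8, Scale = 4 => MaxOffs = 255 * 4 = 1020 bytes,
+ // and its long form LwRxPcTcpX16: Bits = 16, Scale = 1 => 65535 bytes.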
+ CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk,
+ LongFormMaxOffs, LongFormOpcode));
+
+ // Increment corresponding CPEntry reference count.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Cannot find a corresponding CPEntry!");
+ CPE->RefCount++;
+
+ // Instructions can only use one CP entry; don't bother scanning the
+ // rest of the operands.
+ break;
+
+ }
+
+ }
+ }
+
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+ BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+ BBI.Size = 0;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I)
+ BBI.Size += TII->GetInstSizeInBytes(I);
+
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function. This offset changes as stuff is moved
+/// around inside the function.
+unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ Offset += TII->GetInstSizeInBytes(I);
+ }
+ return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+ const MachineBasicBlock *RHS) {
+ return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers. Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void MipsConstantIslands::updateForInsertedWaterBlock
+ (MachineBasicBlock *NewBB) {
+ // Renumber the MBB's to keep them consecutive.
+ NewBB->getParent()->RenumberBlocks(NewBB);
+
+ // Insert an entry into BBInfo to align it properly with the (newly
+ // renumbered) block numbers.
+ BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+ // Next, update WaterList. Specifically, we need to add NewBB as having
+ // available water after it.
+ water_iterator IP =
+ std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+ CompareMBBNumbers);
+ WaterList.insert(IP, NewBB);
+}
+
+unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
+ return getOffsetOf(U.MI);
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Updates data structures and renumbers blocks to
+/// account for this change, and returns the newly created block.
+MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
+ (MachineInstr *MI) {
+ MachineBasicBlock *OrigBB = MI->getParent();
+
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB =
+ MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+ MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+ MF->insert(MBBI, NewBB);
+
+ // Splice the instructions starting with MI over to NewBB.
+ NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+ // Add an unconditional branch from OrigBB to NewBB.
+ // Note the new unconditional branch is not being recorded.
+ // There doesn't seem to be meaningful DebugInfo available; this doesn't
+ // correspond to anything in the source.
+ BuildMI(OrigBB, DebugLoc(), TII->get(Mips::Bimm16)).addMBB(NewBB);
+ ++NumSplit;
+
+ // Update the CFG. All succs of OrigBB are now succs of NewBB.
+ NewBB->transferSuccessors(OrigBB);
+
+ // OrigBB branches to NewBB.
+ OrigBB->addSuccessor(NewBB);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ // This is almost the same as updateForInsertedWaterBlock, except that
+ // the Water goes after OrigBB, not NewBB.
+ MF->RenumberBlocks(NewBB);
+
+ // Insert an entry into BBInfo to align it properly with the (newly
+ // renumbered) block numbers.
+ BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+ // Next, update WaterList. Specifically, we need to add OrigBB as having
+ // available water after it (but not if it's already there, which happens
+ // when splitting before a conditional branch that is followed by an
+ // unconditional branch - in that case we want to insert NewBB).
+ water_iterator IP =
+ std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+ CompareMBBNumbers);
+ MachineBasicBlock* WaterBB = *IP;
+ if (WaterBB == OrigBB)
+ WaterList.insert(llvm::next(IP), NewBB);
+ else
+ WaterList.insert(IP, OrigBB);
+ NewWaterList.insert(OrigBB);
+
+ // Figure out how large the OrigBB is. As the first half of the original
+ // block, it cannot contain a tablejump. The size includes
+ // the new jump we added. (It should be possible to do this without
+ // recounting everything, but it's very confusing, and this is rarely
+ // executed.)
+ computeBlockSize(OrigBB);
+
+ // Figure out how large the NewBB is. As the second half of the original
+ // block, it may contain a tablejump.
+ computeBlockSize(NewBB);
+
+ // All BBOffsets following these blocks must be modified.
+ adjustBBOffsetsAfter(OrigBB);
+
+ return NewBB;
+}
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+ unsigned TrialOffset, unsigned MaxDisp,
+ bool NegativeOK) {
+ if (UserOffset <= TrialOffset) {
+ // User before the Trial.
+ if (TrialOffset - UserOffset <= MaxDisp)
+ return true;
+ } else if (NegativeOK) {
+ if (UserOffset - TrialOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
+ MachineBasicBlock* Water, CPUser &U,
+ unsigned &Growth) {
+ unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+ unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+ unsigned NextBlockOffset, NextBlockAlignment;
+ MachineFunction::const_iterator NextBlock = Water;
+ if (++NextBlock == MF->end()) {
+ NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+ NextBlockAlignment = 0;
+ } else {
+ NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+ NextBlockAlignment = NextBlock->getAlignment();
+ }
+ unsigned Size = U.CPEMI->getOperand(2).getImm();
+ unsigned CPEEnd = CPEOffset + Size;
+
+ // The CPE may be able to hide in the alignment padding before the next
+ // block. It may also cause more padding to be required if it is more aligned
+ // than the next block.
+ if (CPEEnd > NextBlockOffset) {
+ Growth = CPEEnd - NextBlockOffset;
+ // Compute the padding that would go at the end of the CPE to align the next
+ // block.
+ Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
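+ // e.g. if CPEEnd = 0x1046 and the next block needs 4-byte alignment
+ // (NextBlockAlignment = 2), OffsetToAlignment adds 2 bytes of padding.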
+
+ // If the CPE is to be inserted before the instruction, that will raise
+ // the offset of the instruction. Also account for unknown alignment padding
+ // in blocks between CPE and the user.
+ if (CPEOffset < UserOffset)
+ UserOffset += Growth;
+ } else
+ // CPE fits in existing padding.
+ Growth = 0;
+
+ return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between the specified MI
+/// and the specified constant pool entry instruction can fit in MI's
+/// displacement field.
+bool MipsConstantIslands::isCPEntryInRange
+ (MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI, unsigned MaxDisp,
+ bool NegOk, bool DoDump) {
+ unsigned CPEOffset = getOffsetOf(CPEMI);
+
+ if (DoDump) {
+ DEBUG({
+ unsigned Block = MI->getParent()->getNumber();
+ const BasicBlockInfo &BBI = BBInfo[Block];
+ dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+ << " max delta=" << MaxDisp
+ << format(" insn address=%#x", UserOffset)
+ << " in BB#" << Block << ": "
+ << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+ << format("CPE address=%#x offset=%+d: ", CPEOffset,
+ int(CPEOffset-UserOffset));
+ });
+ }
+
+ return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true if the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+ if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+ return false;
+ MachineBasicBlock *Succ = *MBB->succ_begin();
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ MachineInstr *PredMI = &Pred->back();
+ if (PredMI->getOpcode() == Mips::Bimm16)
+ return PredMI->getOperand(0).getMBB() == Succ;
+ return false;
+}
+#endif
+
+void MipsConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+ unsigned BBNum = BB->getNumber();
+ for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+ // Get the offset at the end of the layout predecessor.
+ unsigned Offset = BBInfo[i - 1].Offset + BBInfo[i - 1].Size;
+ BBInfo[i].Offset = Offset;
+ }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount. If the refcount
+/// becomes 0, remove the entry and instruction. Returns true if we removed
+/// the entry, false if we didn't.
+bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+ MachineInstr *CPEMI) {
+ // Find the old entry. Eliminate it if it is no longer used.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Unexpected!");
+ if (--CPE->RefCount == 0) {
+ removeDeadCPEMI(CPEMI);
+ CPE->CPEMI = NULL;
+ --NumCPEs;
+ return true;
+ }
+ return false;
+}
+
+/// findInRangeCPEntry - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+
+ // Check to see if the CPE is already in-range.
+ if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+ true)) {
+ DEBUG(dbgs() << "In range\n");
+ return 1;
+ }
+
+ // No. Look for previously created clones of the CPE that are in range.
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ // We already tried this one
+ if (CPEs[i].CPEMI == CPEMI)
+ continue;
+ // Removing CPEs can leave empty entries, skip
+ if (CPEs[i].CPEMI == NULL)
+ continue;
+ if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+ U.NegOk)) {
+ DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
+ // Point the CPUser node to the replacement
+ U.CPEMI = CPEs[i].CPEMI;
+ // Change the CPI in the instruction operand to refer to the clone.
+ for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+ if (UserMI->getOperand(j).isCPI()) {
+ UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+ break;
+ }
+ // Adjust the refcount of the clone...
+ CPEs[i].RefCount++;
+ // ...and the original. If we didn't remove the old entry, none of the
+ // addresses changed, so we don't need another pass.
+ return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+ }
+ }
+ return 0;
+}
+
+/// findLongFormInRangeCPEntry - see if the currently referenced CPE is in
+/// range; this version checks whether the longer form of the instruction
+/// can be used to satisfy things.
+/// If not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findLongFormInRangeCPEntry
+ (CPUser& U, unsigned UserOffset)
+{
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+
+ // Check to see if the CPE is already in-range.
+ if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
+ U.getLongFormMaxDisp(), U.NegOk,
+ true)) {
+ DEBUG(dbgs() << "In range\n");
+ UserMI->setDesc(TII->get(U.getLongFormOpcode()));
+ U.setMaxDisp(U.getLongFormMaxDisp());
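+ // Returning 2 rather than 1 tells the caller that code size changed: the
+ // widened instruction shifts later offsets, so another iteration is needed.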
+ return 2; // instruction is longer length now
+ }
+
+ // No. Look for previously created clones of the CPE that are in range.
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ // We already tried this one
+ if (CPEs[i].CPEMI == CPEMI)
+ continue;
+ // Removing CPEs can leave empty entries, skip
+ if (CPEs[i].CPEMI == NULL)
+ continue;
+ if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
+ U.getLongFormMaxDisp(), U.NegOk)) {
+ DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[i].CPI << "\n");
+ // Point the CPUser node to the replacement
+ U.CPEMI = CPEs[i].CPEMI;
+ // Change the CPI in the instruction operand to refer to the clone.
+ for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+ if (UserMI->getOperand(j).isCPI()) {
+ UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+ break;
+ }
+ // Adjust the refcount of the clone...
+ CPEs[i].RefCount++;
+ // ...and the original. If we didn't remove the old entry, none of the
+ // addresses changed, so we don't need another pass.
+ return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+ }
+ }
+ return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+ switch (Opc) {
+ case Mips::Bimm16:
+ return ((1<<10)-1)*2;
+ case Mips::BimmX16:
+ return ((1<<16)-1)*2;
+ default:
+ break;
+ }
+ return ((1<<16)-1)*2;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not. If it returns true, WaterIter
+/// is set to the WaterList entry.
+/// To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+ water_iterator &WaterIter) {
+ if (WaterList.empty())
+ return false;
+
+ unsigned BestGrowth = ~0u;
+ for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();;
+ --IP) {
+ MachineBasicBlock* WaterBB = *IP;
+ // Check if water is in range and is either at a lower address than the
+ // current "high water mark" or a new water block that was created since
+ // the previous iteration by inserting an unconditional branch. In the
+ // latter case, we want to allow resetting the high water mark back to
+ // this new water since we haven't seen it before. Inserting branches
+ // should be relatively uncommon and when it does happen, we want to be
+ // sure to take advantage of it for all the CPEs near that block, so that
+ // we don't insert more branches than necessary.
+ unsigned Growth;
+ if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+ (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+ NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+ // This is the least amount of required padding seen so far.
+ BestGrowth = Growth;
+ WaterIter = IP;
+ DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+ << " Growth=" << Growth << '\n');
+
+ // Keep looking unless it is perfect.
+ if (BestGrowth == 0)
+ return true;
+ }
+ if (IP == B)
+ break;
+ }
+ return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct. Otherwise the block is split to create a hole with an
+/// unconditional branch around it. In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
+ unsigned UserOffset,
+ MachineBasicBlock *&NewMBB) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPELogAlign = getCPELogAlign(CPEMI);
+ MachineBasicBlock *UserMBB = UserMI->getParent();
+ const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+ // If the block does not end in an unconditional branch already, and if the
+ // end of the block is within range, make new water there.
+ if (BBHasFallthrough(UserMBB)) {
+ // Size of the branch we are about to insert (a 2-byte Bimm16).
+ unsigned Delta = 2;
+ // Compute the offset where the CPE will begin.
+ unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+ if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+ DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+ << format(", expected CPE offset %#x\n", CPEOffset));
+ NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+ // Add an unconditional branch from UserMBB to fallthrough block. Record
+ // it for branch lengthening; this new branch will not get out of range,
+ // but if the preceding conditional branch is out of range, the targets
+ // will be exchanged, and the altered branch may be out of range, so the
+ // machinery has to know about it.
+ int UncondBr = Mips::Bimm16;
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+ unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+ ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+ MaxDisp, false, UncondBr));
+ BBInfo[UserMBB->getNumber()].Size += Delta;
+ adjustBBOffsetsAfter(UserMBB);
+ return;
+ }
+ }
+
+ // What a big block. Find a place within the block to split it.
+
+ // Try to split the block so it's fully aligned. Compute the latest split
+ // point where we can add a 4-byte branch instruction, and then align to
+ // LogAlign which is the largest possible alignment in the function.
+ unsigned LogAlign = MF->getAlignment();
+ assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+ unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+ DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
+
+ // The 4 in the following is for the unconditional branch we'll be inserting.
+ // Alignment of the island is handled inside isOffsetInRange.
+ BaseInsertOffset -= 4;
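+ // Illustrative: with UserOffset = 0x100 and MaxDisp = 1020 (LwRxPcTcp16),
+ // BaseInsertOffset becomes 0x100 + 1020 - 4 = 0x4f8.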
+
+ DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << LogAlign << '\n');
+
+ // This could point off the end of the block if we've already got constant
+ // pool entries following this block; only the last one is in the water list.
+ // Back past any possible branches (allow for a conditional and a maximally
+ // long unconditional).
+ if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+ BaseInsertOffset = UserBBI.postOffset() - 8;
+ DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ }
+ unsigned EndInsertOffset = BaseInsertOffset + 4 +
+ CPEMI->getOperand(2).getImm();
+ MachineBasicBlock::iterator MI = UserMI;
+ ++MI;
+ unsigned CPUIndex = CPUserIndex+1;
+ unsigned NumCPUsers = CPUsers.size();
+ //MachineInstr *LastIT = 0;
+ for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+ Offset < BaseInsertOffset;
+ Offset += TII->GetInstSizeInBytes(MI),
+ MI = llvm::next(MI)) {
+ assert(MI != UserMBB->end() && "Fell off end of block");
+ if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+ CPUser &U = CPUsers[CPUIndex];
+ if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+ // Shift insertion point by one unit of alignment so it is within reach.
+ BaseInsertOffset -= 1u << LogAlign;
+ EndInsertOffset -= 1u << LogAlign;
+ }
+ // This is overly conservative, as we don't account for CPEMIs being
+ // reused within the block, but it doesn't matter much. Also assume CPEs
+ // are added in order with alignment padding. We may eventually be able
+ // to pack the aligned CPEs better.
+ EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+ CPUIndex++;
+ }
+ }
+
+ --MI;
+ NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ // Compute this only once; it's expensive.
+ unsigned UserOffset = getUserOffset(U);
+
+ // See if the current entry is within range, or there is a clone of it
+ // in range.
+ int result = findInRangeCPEntry(U, UserOffset);
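+ // result == 1: an in-range entry (or clone) was found and nothing moved;
+ // result == 2: an entry was found but addresses changed, so iterate again.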
+ if (result == 1) return false;
+ else if (result == 2) return true;
+
+ // Look for water where we can place this CPE.
+ MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *NewMBB;
+ water_iterator IP;
+ if (findAvailableWater(U, UserOffset, IP)) {
+ DEBUG(dbgs() << "Found water in range\n");
+ MachineBasicBlock *WaterBB = *IP;
+
+ // If the original WaterList entry was "new water" on this iteration,
+ // propagate that to the new island. This is just keeping NewWaterList
+ // updated to match the WaterList, which will be updated below.
+ if (NewWaterList.erase(WaterBB))
+ NewWaterList.insert(NewIsland);
+
+ // The new CPE goes before the following block (NewMBB).
+ NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+
+ } else {
+ // No water found.
+ // We first see if a longer form of the instruction could have reached
+ // the constant; in that case we won't bother to split.
+ if (!NoLoadRelaxation) {
+ result = findLongFormInRangeCPEntry(U, UserOffset);
+ if (result != 0) return true;
+ }
+ DEBUG(dbgs() << "No water found\n");
+ createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+ // splitBlockBeforeInstr adds to WaterList, which is important when it is
+ // called while handling branches so that the water will be seen on the
+ // next iteration for constant pools, but in this context, we don't want
+ // it. Check for this so it will be removed from the WaterList.
+ // Also remove any entry from NewWaterList.
+ MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+ IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+ if (IP != WaterList.end())
+ NewWaterList.erase(WaterBB);
+
+ // We are adding new water. Update NewWaterList.
+ NewWaterList.insert(NewIsland);
+ }
+
+ // Remove the original WaterList entry; we want subsequent insertions in
+ // this vicinity to go after the one we're about to insert. This
+ // considerably reduces the number of times we have to move the same CPE
+ // more than once and is also important to ensure the algorithm terminates.
+ if (IP != WaterList.end())
+ WaterList.erase(IP);
+
+ // Okay, we know we can put an island before NewMBB now, do it!
+ MF->insert(NewMBB, NewIsland);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ updateForInsertedWaterBlock(NewIsland);
+
+ // Decrement the old entry, and remove it if refcount becomes 0.
+ decrementCPEReferenceCount(CPI, CPEMI);
+
+ // No existing clone of this CPE is within range.
+ // We will be generating a new clone. Get a UID for it.
+ unsigned ID = createPICLabelUId();
+
+ // Now that we have an island to add the CPE to, clone the original CPE and
+ // add it to the island.
+ U.HighWaterMark = NewIsland;
+ U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+ .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+ CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+ ++NumCPEs;
+
+ // Mark the basic block as aligned as required by the const-pool entry.
+ NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+ // Increase the size of the island block to account for the new entry.
+ BBInfo[NewIsland->getNumber()].Size += Size;
+ adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland)));
+
+ // Finally, change the CPI in the instruction operand to be ID.
+ for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+ if (UserMI->getOperand(i).isCPI()) {
+ UserMI->getOperand(i).setIndex(ID);
+ break;
+ }
+
+ DEBUG(dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+ return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+ MachineBasicBlock *CPEBB = CPEMI->getParent();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ CPEMI->eraseFromParent();
+ BBInfo[CPEBB->getNumber()].Size -= Size;
+ // All succeeding offsets have the current size value added in, fix this.
+ if (CPEBB->empty()) {
+ BBInfo[CPEBB->getNumber()].Size = 0;
+
+ // This block no longer needs to be aligned.
+ CPEBB->setAlignment(0);
+ } else
+ // Entries are sorted by descending alignment, so realign from the front.
+ CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+
+ adjustBBOffsetsAfter(CPEBB);
+ // An island has only one predecessor BB and one successor BB. Check if
+ // this BB's predecessor jumps directly to this BB's successor. This
+ // shouldn't happen currently.
+ assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+ // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool MipsConstantIslands::removeUnusedCPEntries() {
+ bool MadeChange = false;
+ for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+ std::vector<CPEntry> &CPEs = CPEntries[i];
+ for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+ if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+ removeDeadCPEMI(CPEs[j].CPEMI);
+ CPEs[j].CPEMI = NULL;
+ MadeChange = true;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between the specified MI and
+/// the specified BB can fit in MI's displacement field.
+bool MipsConstantIslands::isBBInRange
+ (MachineInstr *MI, MachineBasicBlock *DestBB, unsigned MaxDisp) {
+ unsigned PCAdj = 4;
+
+ unsigned BrOffset = getOffsetOf(MI) + PCAdj;
+ unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+ DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+ << " from BB#" << MI->getParent()->getNumber()
+ << " max delta=" << MaxDisp
+ << " from " << getOffsetOf(MI) << " to " << DestOffset
+ << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+ if (BrOffset <= DestOffset) {
+ // Branch before the Dest.
+ if (DestOffset-BrOffset <= MaxDisp)
+ return true;
+ } else {
+ if (BrOffset-DestOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+ // Check to see if the DestBB is already in-range.
+ if (isBBInRange(MI, DestBB, Br.MaxDisp))
+ return false;
+
+ if (!Br.isCond)
+ return fixupUnconditionalBr(Br);
+ return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. Relax it to the long-form
+/// unconditional branch (BimmX16), which has a larger displacement field.
+bool
+MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *MBB = MI->getParent();
+ // Widen the branch to its long form (BimmX16) to implement the far jump.
+ Br.MaxDisp = ((1 << 16)-1) * 2;
+ MI->setDesc(TII->get(Mips::BimmX16));
+ BBInfo[MBB->getNumber()].Size += 2;
+ adjustBBOffsetsAfter(MBB);
+ HasFarJump = true;
+ ++NumUBrFixed;
+
+ DEBUG(dbgs() << " Changed B to long jump " << *MI);
+
+ return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // blt L1
+ // =>
+ // bge L2
+ // b L1
+ // L2:
+ unsigned CCReg = 0; // FIXME
+ unsigned CC = 0; // FIXME
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+ ++NumCBrFixed;
+ if (BMI != MI) {
+ if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+ BMI->getOpcode() == Br.UncondBr) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beq L1
+ // b L2
+ // =>
+ // bne L2
+ // b L1
+ MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+ if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+ DEBUG(dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
+ BMI->getOperand(0).setMBB(DestBB);
+ MI->getOperand(0).setMBB(NewDest);
+ return true;
+ }
+ }
+ }
+
+ if (NeedSplit) {
+ splitBlockBeforeInstr(MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int delta = TII->GetInstSizeInBytes(&MBB->back());
+ BBInfo[MBB->getNumber()].Size -= delta;
+ MBB->back().eraseFromParent();
+ // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+ }
+ MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+
+ DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber()
+ << " also invert condition and change dest. to BB#"
+ << NextBB->getNumber() << "\n");
+
+ // Insert a new conditional branch and a new unconditional branch.
+ // Also update the ImmBranch as well as adding a new entry for the new branch.
+ BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+ .addMBB(NextBB).addImm(CC).addReg(CCReg);
+ Br.MI = &MBB->back();
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+ BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+ unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+ ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+ MI->eraseFromParent();
+ adjustBBOffsetsAfter(MBB);
+ return true;
+}
+
+
+void MipsConstantIslands::prescanForConstants() {
+ unsigned J = 0;
+ (void)J;
+ PrescannedForConstants = true;
+ for (MachineFunction::iterator B =
+ MF->begin(), E = MF->end(); B != E; ++B) {
+ for (MachineBasicBlock::instr_iterator I =
+ B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
+ switch(I->getDesc().getOpcode()) {
+ case Mips::LwConstant32: {
+ DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ J = I->getNumOperands();
+ DEBUG(dbgs() << "num operands " << J << "\n");
+ MachineOperand& Literal = I->getOperand(1);
+ if (Literal.isImm()) {
+ int64_t V = Literal.getImm();
+ DEBUG(dbgs() << "literal " << V << "\n");
+ Type *Int32Ty =
+ Type::getInt32Ty(MF->getFunction()->getContext());
+ const Constant *C = ConstantInt::get(Int32Ty, V);
+ unsigned index = MCP->getConstantPoolIndex(C, 4);
+ I->getOperand(2).ChangeToImmediate(index);
+ DEBUG(dbgs() << "constant island constant " << *I << "\n");
+ I->setDesc(TII->get(Mips::LwRxPcTcp16));
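+ // Drop the old literal operand and the operand after it; calling
+ // RemoveOperand(1) twice works because operands shift down after each removal.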
+ I->RemoveOperand(1);
+ I->RemoveOperand(1);
+ I->addOperand(MachineOperand::CreateCPI(index, 0));
+ I->addOperand(MachineOperand::CreateImm(4));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ }
+}
+
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index 526821a..d268384 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -256,236 +256,235 @@ class PREPEND_ENC : APPEND_FMT<0b00001>;
// Instruction desc.
class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCD,
- RegisterClass RCS, RegisterClass RCT = RCS> {
- dag OutOperandList = (outs RCD:$rd);
- dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS, RegisterOperand ROT = ROS> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
- list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
InstrItinClass Itinerary = itin;
}
class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCD,
- RegisterClass RCS = RCD> {
- dag OutOperandList = (outs RCD:$rd);
- dag InOperandList = (ins RCS:$rs);
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS = ROD> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
- list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs))];
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
InstrItinClass Itinerary = itin;
}
class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCS,
- RegisterClass RCT = RCS> {
+ InstrItinClass itin, RegisterOperand ROS,
+ RegisterOperand ROT = ROS> {
dag OutOperandList = (outs);
- dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
- list<dag> Pattern = [(OpNode RCS:$rs, RCT:$rt)];
+ list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)];
InstrItinClass Itinerary = itin;
}
class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCD,
- RegisterClass RCS, RegisterClass RCT = RCS> {
- dag OutOperandList = (outs RCD:$rd);
- dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS, RegisterOperand ROT = ROS> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
- list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
InstrItinClass Itinerary = itin;
}
class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCT,
- RegisterClass RCS = RCT> {
- dag OutOperandList = (outs RCT:$rt);
- dag InOperandList = (ins RCS:$rs, shamt:$sa, RCS:$src);
+ InstrItinClass itin, RegisterOperand ROT,
+ RegisterOperand ROS = ROT> {
+ dag OutOperandList = (outs ROT:$rt);
+ dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
- list<dag> Pattern = [(set RCT:$rt, (OpNode RCS:$src, RCS:$rs, immZExt5:$sa))];
+ list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
}
class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCD,
- RegisterClass RCT = RCD> {
- dag OutOperandList = (outs RCD:$rd);
- dag InOperandList = (ins RCT:$rt);
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROT = ROD> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROT:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
- list<dag> Pattern = [(set RCD:$rd, (OpNode RCT:$rt))];
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
InstrItinClass Itinerary = itin;
}
class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- ImmLeaf immPat, InstrItinClass itin, RegisterClass RC> {
- dag OutOperandList = (outs RC:$rd);
+ ImmLeaf immPat, InstrItinClass itin, RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
dag InOperandList = (ins uimm16:$imm);
string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
- list<dag> Pattern = [(set RC:$rd, (OpNode immPat:$imm))];
+ list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
InstrItinClass Itinerary = itin;
}
class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RC> {
- dag OutOperandList = (outs RC:$rd);
- dag InOperandList = (ins RC:$rt, GPR32:$rs_sa);
+ InstrItinClass itin, RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs_sa);
string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
- list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, GPR32:$rs_sa))];
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
InstrItinClass Itinerary = itin;
}
class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
SDPatternOperator ImmPat, InstrItinClass itin,
- RegisterClass RC> {
- dag OutOperandList = (outs RC:$rd);
- dag InOperandList = (ins RC:$rt, uimm16:$rs_sa);
+ RegisterOperand RO> {
+ dag OutOperandList = (outs RO:$rd);
+ dag InOperandList = (ins RO:$rt, uimm16:$rs_sa);
string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
- list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, ImmPat:$rs_sa))];
+ list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
InstrItinClass Itinerary = itin;
bit hasSideEffects = 1;
}
class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rd);
- dag InOperandList = (ins GPR32:$base, GPR32:$index);
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins PtrRC:$base, PtrRC:$index);
string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
- list<dag> Pattern = [(set GPR32:$rd,
- (OpNode GPR32:$base, GPR32:$index))];
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
InstrItinClass Itinerary = itin;
bit mayLoad = 1;
}
class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
- InstrItinClass itin, RegisterClass RCD,
- RegisterClass RCS = RCD, RegisterClass RCT = RCD> {
- dag OutOperandList = (outs RCD:$rd);
- dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ InstrItinClass itin, RegisterOperand ROD,
+ RegisterOperand ROS = ROD, RegisterOperand ROT = ROD> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROS:$rs, ROT:$rt);
string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
- list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+ list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
InstrItinClass Itinerary = itin;
}
class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
SDPatternOperator ImmOp, InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rt);
- dag InOperandList = (ins GPR32:$rs, shamt:$sa, GPR32:$src);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
- list<dag> Pattern = [(set GPR32:$rt,
- (OpNode GPR32:$src, GPR32:$rs, ImmOp:$sa))];
+ list<dag> Pattern = [(set GPR32Opnd:$rt,
+ (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
}
class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rt);
- dag InOperandList = (ins ACRegsDSP:$ac, GPR32:$shift_rs);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
}
class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rt);
- dag InOperandList = (ins ACRegsDSP:$ac, uimm16:$shift_rs);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
InstrItinClass Itinerary = itin;
}
class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
- dag OutOperandList = (outs ACRegsDSP:$ac);
- dag InOperandList = (ins simm16:$shift, ACRegsDSP:$acin);
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin);
string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
- list<dag> Pattern = [(set ACRegsDSP:$ac,
- (OpNode immSExt6:$shift, ACRegsDSP:$acin))];
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
}
class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
- dag OutOperandList = (outs ACRegsDSP:$ac);
- dag InOperandList = (ins GPR32:$rs, ACRegsDSP:$acin);
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
- list<dag> Pattern = [(set ACRegsDSP:$ac,
- (OpNode GPR32:$rs, ACRegsDSP:$acin))];
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
}
class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
- dag OutOperandList = (outs ACRegsDSP:$ac);
- dag InOperandList = (ins GPR32:$rs, ACRegsDSP:$acin);
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
- list<dag> Pattern = [(set ACRegsDSP:$ac,
- (OpNode GPR32:$rs, ACRegsDSP:$acin))];
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
}
class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rd);
+ dag OutOperandList = (outs GPR32Opnd:$rd);
dag InOperandList = (ins uimm16:$mask);
string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
- list<dag> Pattern = [(set GPR32:$rd, (OpNode immZExt10:$mask))];
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
InstrItinClass Itinerary = itin;
}
class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
dag OutOperandList = (outs);
- dag InOperandList = (ins GPR32:$rs, uimm16:$mask);
+ dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask);
string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
- list<dag> Pattern = [(OpNode GPR32:$rs, immZExt10:$mask)];
+ list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
InstrItinClass Itinerary = itin;
}
class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
- dag OutOperandList = (outs ACRegsDSP:$ac);
- dag InOperandList = (ins GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin);
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
- list<dag> Pattern = [(set ACRegsDSP:$ac,
- (OpNode GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin))];
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
string Constraints = "$acin = $ac";
}
class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs ACRegsDSP:$ac);
- dag InOperandList = (ins GPR32:$rs, GPR32:$rt);
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt);
string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
- list<dag> Pattern = [(set ACRegsDSP:$ac, (OpNode GPR32:$rs, GPR32:$rt))];
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
InstrItinClass Itinerary = itin;
- int AddedComplexity = 20;
bit isCommutable = 1;
}
class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs ACRegsDSP:$ac);
- dag InOperandList = (ins GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin);
+ dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
- list<dag> Pattern = [(set ACRegsDSP:$ac,
- (OpNode GPR32:$rs, GPR32:$rt, ACRegsDSP:$acin))];
+ list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+ (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
InstrItinClass Itinerary = itin;
- int AddedComplexity = 20;
string Constraints = "$acin = $ac";
}
-class MFHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rd);
- dag InOperandList = (ins RC:$ac);
+class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins RO:$ac);
string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
InstrItinClass Itinerary = itin;
}
-class MTHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
- dag OutOperandList = (outs RC:$ac);
- dag InOperandList = (ins GPR32:$rs);
+class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
+ dag OutOperandList = (outs RO:$ac);
+ dag InOperandList = (ins GPR32Opnd:$rs);
string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
InstrItinClass Itinerary = itin;
}
class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
- MipsPseudo<(outs GPR32:$dst), (ins), [(set GPR32:$dst, (OpNode))]> {
+ MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
bit usesCustomInserter = 1;
}
@@ -501,10 +500,10 @@ class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
InstrItinClass itin> {
- dag OutOperandList = (outs GPR32:$rt);
- dag InOperandList = (ins GPR32:$src, GPR32:$rs);
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins GPR32Opnd:$src, GPR32Opnd:$rs);
string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
- list<dag> Pattern = [(set GPR32:$rt, (OpNode GPR32:$src, GPR32:$rs))];
+ list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
InstrItinClass Itinerary = itin;
string Constraints = "$src = $rt";
}
@@ -515,209 +514,209 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
// Addition/subtraction
class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary,
- DSPRegs, DSPRegs>, IsCommutable,
+ DSPROpnd, DSPROpnd>, IsCommutable,
Defs<[DSPOutFlag20]>;
class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag20]>;
class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary,
- DSPRegs, DSPRegs>,
+ DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
- DSPRegs, DSPRegs>, IsCommutable,
+ DSPROpnd, DSPROpnd>, IsCommutable,
Defs<[DSPOutFlag20]>;
class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag20]>;
class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
- DSPRegs, DSPRegs>,
+ DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
- NoItinerary, GPR32, GPR32>,
+ NoItinerary, GPR32Opnd, GPR32Opnd>,
IsCommutable, Defs<[DSPOutFlag20]>;
class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
- NoItinerary, GPR32, GPR32>,
+ NoItinerary, GPR32Opnd, GPR32Opnd>,
Defs<[DSPOutFlag20]>;
class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
- GPR32, GPR32>, IsCommutable,
+ GPR32Opnd, GPR32Opnd>, IsCommutable,
Defs<[DSPCarry]>;
class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
- GPR32, GPR32>,
+ GPR32Opnd, GPR32Opnd>,
IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>;
class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
- GPR32, GPR32>;
+ GPR32Opnd, GPR32Opnd>;
class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
- NoItinerary, GPR32, DSPRegs>;
+ NoItinerary, GPR32Opnd, DSPROpnd>;
// Absolute value
class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
- NoItinerary, GPR32>,
+ NoItinerary, GPR32Opnd>,
Defs<[DSPOutFlag20]>;
// Precision reduce/expand
class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
int_mips_precrq_qb_ph,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPROpnd, DSPROpnd>;
class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
int_mips_precrq_ph_w,
- NoItinerary, DSPRegs, GPR32>;
+ NoItinerary, DSPROpnd, GPR32Opnd>;
class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
int_mips_precrq_rs_ph_w,
- NoItinerary, DSPRegs,
- GPR32>,
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>,
Defs<[DSPOutFlag22]>;
class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
int_mips_precrqu_s_qb_ph,
- NoItinerary, DSPRegs,
- DSPRegs>,
+ NoItinerary, DSPROpnd,
+ DSPROpnd>,
Defs<[DSPOutFlag22]>;
class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
int_mips_preceq_w_phl,
- NoItinerary, GPR32, DSPRegs>;
+ NoItinerary, GPR32Opnd, DSPROpnd>;
class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
int_mips_preceq_w_phr,
- NoItinerary, GPR32, DSPRegs>;
+ NoItinerary, GPR32Opnd, DSPROpnd>;
class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
int_mips_precequ_ph_qbl,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
int_mips_precequ_ph_qbr,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
int_mips_precequ_ph_qbla,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
int_mips_precequ_ph_qbra,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
int_mips_preceu_ph_qbl,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
int_mips_preceu_ph_qbr,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
int_mips_preceu_ph_qbla,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
int_mips_preceu_ph_qbra,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
// Shift
class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag22]>;
class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag22]>;
class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag22]>;
class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag22]>;
class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
- immZExt4, NoItinerary, DSPRegs>,
+ immZExt4, NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag22]>;
class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag22]>;
class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
- immZExt4, NoItinerary, DSPRegs>;
+ immZExt4, NoItinerary, DSPROpnd>;
class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
- immZExt5, NoItinerary, GPR32>,
+ immZExt5, NoItinerary, GPR32Opnd>,
Defs<[DSPOutFlag22]>;
class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
- NoItinerary, GPR32>,
+ NoItinerary, GPR32Opnd>,
Defs<[DSPOutFlag22]>;
class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
- immZExt5, NoItinerary, GPR32>;
+ immZExt5, NoItinerary, GPR32Opnd>;
class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
- NoItinerary, GPR32>;
+ NoItinerary, GPR32Opnd>;
// Multiplication
class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
int_mips_muleu_s_ph_qbl,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag21]>;
class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
int_mips_muleu_s_ph_qbr,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag21]>;
class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
int_mips_muleq_s_w_phl,
- NoItinerary, GPR32, DSPRegs>,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag21]>;
class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
int_mips_muleq_s_w_phr,
- NoItinerary, GPR32, DSPRegs>,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag21]>;
class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag21]>;
class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
@@ -737,10 +736,10 @@ class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
Defs<[DSPOutFlag16_19]>;
// Move from/to hi/lo.
-class MFHI_DESC : MFHI_DESC_BASE<"mfhi", HIRegsDSP, NoItinerary>;
-class MFLO_DESC : MFHI_DESC_BASE<"mflo", LORegsDSP, NoItinerary>;
-class MTHI_DESC : MTHI_DESC_BASE<"mthi", HIRegsDSP, NoItinerary>;
-class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LORegsDSP, NoItinerary>;
+class MFHI_DESC : MFHI_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, NoItinerary>;
+class MFLO_DESC : MFHI_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, NoItinerary>;
+class MTHI_DESC : MTHI_DESC_BASE<"mthi", HI32DSPOpnd, NoItinerary>;
+class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LO32DSPOpnd, NoItinerary>;
// Dot product with accumulate/subtract
class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
@@ -773,67 +772,67 @@ class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
// Comparison
class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
int_mips_cmpu_eq_qb, NoItinerary,
- DSPRegs>,
+ DSPROpnd>,
IsCommutable, Defs<[DSPCCond]>;
class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
int_mips_cmpu_lt_qb, NoItinerary,
- DSPRegs>, Defs<[DSPCCond]>;
+ DSPROpnd>, Defs<[DSPCCond]>;
class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
int_mips_cmpu_le_qb, NoItinerary,
- DSPRegs>, Defs<[DSPCCond]>;
+ DSPROpnd>, Defs<[DSPCCond]>;
class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
int_mips_cmpgu_eq_qb,
- NoItinerary, GPR32, DSPRegs>,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
IsCommutable;
class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
int_mips_cmpgu_lt_qb,
- NoItinerary, GPR32, DSPRegs>;
+ NoItinerary, GPR32Opnd, DSPROpnd>;
class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
int_mips_cmpgu_le_qb,
- NoItinerary, GPR32, DSPRegs>;
+ NoItinerary, GPR32Opnd, DSPROpnd>;
class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
IsCommutable, Defs<[DSPCCond]>;
class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPCCond]>;
class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPCCond]>;
// Misc
class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
- NoItinerary, GPR32>;
+ NoItinerary, GPR32Opnd>;
class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPROpnd, DSPROpnd>;
class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
- NoItinerary, DSPRegs, GPR32>;
+ NoItinerary, DSPROpnd, GPR32Opnd>;
class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
- NoItinerary, DSPRegs, GPR32>;
+ NoItinerary, DSPROpnd, GPR32Opnd>;
class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Uses<[DSPCCond]>;
class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Uses<[DSPCCond]>;
class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
@@ -905,97 +904,97 @@ class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
// MIPS DSP Rev 2
// Addition/subtraction
class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
- DSPRegs, DSPRegs>, IsCommutable,
+ DSPROpnd, DSPROpnd>, IsCommutable,
Defs<[DSPOutFlag20]>;
class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag20]>;
class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
- DSPRegs, DSPRegs>,
+ DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
Defs<[DSPOutFlag20]>;
class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
- NoItinerary, DSPRegs>, IsCommutable;
+ NoItinerary, DSPROpnd>, IsCommutable;
class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
- NoItinerary, DSPRegs>, IsCommutable;
+ NoItinerary, DSPROpnd>, IsCommutable;
class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
- NoItinerary, DSPRegs>, IsCommutable;
+ NoItinerary, DSPROpnd>, IsCommutable;
class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
- NoItinerary, DSPRegs>, IsCommutable;
+ NoItinerary, DSPROpnd>, IsCommutable;
class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
- NoItinerary, GPR32>, IsCommutable;
+ NoItinerary, GPR32Opnd>, IsCommutable;
class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
- NoItinerary, GPR32>, IsCommutable;
+ NoItinerary, GPR32Opnd>, IsCommutable;
class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
- NoItinerary, GPR32>;
+ NoItinerary, GPR32Opnd>;
class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
- NoItinerary, GPR32>;
+ NoItinerary, GPR32Opnd>;
// Comparison
class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
int_mips_cmpgdu_eq_qb,
- NoItinerary, GPR32, DSPRegs>,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
IsCommutable, Defs<[DSPCCond]>;
class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
int_mips_cmpgdu_lt_qb,
- NoItinerary, GPR32, DSPRegs>,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
Defs<[DSPCCond]>;
class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
int_mips_cmpgdu_le_qb,
- NoItinerary, GPR32, DSPRegs>,
+ NoItinerary, GPR32Opnd, DSPROpnd>,
Defs<[DSPCCond]>;
// Absolute
class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
- NoItinerary, DSPRegs>,
+ NoItinerary, DSPROpnd>,
Defs<[DSPOutFlag20]>;
// Multiplication
class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
- DSPRegs>, IsCommutable,
+ DSPROpnd>, IsCommutable,
Defs<[DSPOutFlag21]>;
class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
- NoItinerary, DSPRegs>, IsCommutable,
+ NoItinerary, DSPROpnd>, IsCommutable,
Defs<[DSPOutFlag21]>;
class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
- NoItinerary, GPR32>, IsCommutable,
+ NoItinerary, GPR32Opnd>, IsCommutable,
Defs<[DSPOutFlag21]>;
class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
- NoItinerary, GPR32>, IsCommutable,
+ NoItinerary, GPR32Opnd>, IsCommutable,
Defs<[DSPOutFlag21]>;
class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
- NoItinerary, DSPRegs, DSPRegs>,
+ NoItinerary, DSPROpnd, DSPROpnd>,
IsCommutable, Defs<[DSPOutFlag21]>;
// Dot product with accumulate/subtract
@@ -1026,36 +1025,36 @@ class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
// Precision reduce/expand
class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
int_mips_precr_qb_ph,
- NoItinerary, DSPRegs, DSPRegs>;
+ NoItinerary, DSPROpnd, DSPROpnd>;
class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
int_mips_precr_sra_ph_w,
- NoItinerary, DSPRegs,
- GPR32>;
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
int_mips_precr_sra_r_ph_w,
- NoItinerary, DSPRegs,
- GPR32>;
+ NoItinerary, DSPROpnd,
+ GPR32Opnd>;
// Shift
class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
- immZExt3, NoItinerary, DSPRegs>;
+ immZExt3, NoItinerary, DSPROpnd>;
class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
- NoItinerary, DSPRegs>;
+ NoItinerary, DSPROpnd>;
// Misc
class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
@@ -1240,24 +1239,24 @@ def PREPEND : PREPEND_ENC, PREPEND_DESC;
}
// Pseudos.
-let isPseudo = 1 in {
+let isPseudo = 1, isCodeGenOnly = 1 in {
// Pseudo instructions for loading and storing accumulator registers.
- defm LOAD_AC_DSP : LoadM<"load_ac_dsp", ACRegsDSPOpnd>;
- defm STORE_AC_DSP : StoreM<"store_ac_dsp", ACRegsDSPOpnd>;
+ def LOAD_ACC64DSP : Load<"", ACC64DSPOpnd>;
+ def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
// Pseudos for loading and storing ccond field of DSP control register.
- defm LOAD_CCOND_DSP : LoadM<"load_ccond_dsp", DSPCC>;
- defm STORE_CCOND_DSP : StoreM<"store_ccond_dsp", DSPCC>;
+ def LOAD_CCOND_DSP : Load<"load_ccond_dsp", DSPCC>;
+ def STORE_CCOND_DSP : Store<"store_ccond_dsp", DSPCC>;
}
// Pseudo CMP and PICK instructions.
class PseudoCMP<Instruction RealInst> :
- PseudoDSP<(outs DSPCC:$cmp), (ins DSPRegs:$rs, DSPRegs:$rt), []>,
- PseudoInstExpansion<(RealInst DSPRegs:$rs, DSPRegs:$rt)>, NeverHasSideEffects;
+ PseudoDSP<(outs DSPCC:$cmp), (ins DSPROpnd:$rs, DSPROpnd:$rt), []>,
+ PseudoInstExpansion<(RealInst DSPROpnd:$rs, DSPROpnd:$rt)>, NeverHasSideEffects;
class PseudoPICK<Instruction RealInst> :
- PseudoDSP<(outs DSPRegs:$rd), (ins DSPCC:$cmp, DSPRegs:$rs, DSPRegs:$rt), []>,
- PseudoInstExpansion<(RealInst DSPRegs:$rd, DSPRegs:$rs, DSPRegs:$rt)>,
+ PseudoDSP<(outs DSPROpnd:$rd), (ins DSPCC:$cmp, DSPROpnd:$rs, DSPROpnd:$rt), []>,
+ PseudoInstExpansion<(RealInst DSPROpnd:$rd, DSPROpnd:$rs, DSPROpnd:$rt)>,
NeverHasSideEffects;
def PseudoCMP_EQ_PH : PseudoCMP<CMP_EQ_PH>;
@@ -1270,6 +1269,8 @@ def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
def PseudoPICK_PH : PseudoPICK<PICK_PH>;
def PseudoPICK_QB : PseudoPICK<PICK_QB>;
+def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+
// Patterns.
class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
Pat<pattern, result>, Requires<[pred]>;
@@ -1279,19 +1280,19 @@ class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
(COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
-def : BitconvertPat<i32, v2i16, GPR32, DSPRegs>;
-def : BitconvertPat<i32, v4i8, GPR32, DSPRegs>;
-def : BitconvertPat<v2i16, i32, DSPRegs, GPR32>;
-def : BitconvertPat<v4i8, i32, DSPRegs, GPR32>;
+def : BitconvertPat<i32, v2i16, GPR32, DSPR>;
+def : BitconvertPat<i32, v4i8, GPR32, DSPR>;
+def : BitconvertPat<v2i16, i32, DSPR, GPR32>;
+def : BitconvertPat<v4i8, i32, DSPR, GPR32>;
def : DSPPat<(v2i16 (load addr:$a)),
- (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+ (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
def : DSPPat<(v4i8 (load addr:$a)),
- (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
-def : DSPPat<(store (v2i16 DSPRegs:$val), addr:$a),
- (SW (COPY_TO_REGCLASS DSPRegs:$val, GPR32), addr:$a)>;
-def : DSPPat<(store (v4i8 DSPRegs:$val), addr:$a),
- (SW (COPY_TO_REGCLASS DSPRegs:$val, GPR32), addr:$a)>;
+ (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(store (v2i16 DSPR:$val), addr:$a),
+ (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPR:$val), addr:$a),
+ (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
// Binary operations.
class DSPBinPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
@@ -1336,7 +1337,7 @@ class DSPSetCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
CondCode CC> :
DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
(ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
- (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPRegs)),
+ (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR)),
(ValTy ZERO)))>;
class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
@@ -1344,7 +1345,7 @@ class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
(ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
(ValTy ZERO),
- (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPRegs))))>;
+ (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR))))>;
class DSPSelectCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
CondCode CC> :
@@ -1384,12 +1385,12 @@ def : DSPSelectCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
// Extr patterns.
class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
- DSPPat<(i32 (OpNode GPR32:$rs, ACRegsDSP:$ac)),
- (Instr ACRegsDSP:$ac, GPR32:$rs)>;
+ DSPPat<(i32 (OpNode GPR32:$rs, ACC64DSP:$ac)),
+ (Instr ACC64DSP:$ac, GPR32:$rs)>;
class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
- DSPPat<(i32 (OpNode immZExt5:$shift, ACRegsDSP:$ac)),
- (Instr ACRegsDSP:$ac, immZExt5:$shift)>;
+ DSPPat<(i32 (OpNode immZExt5:$shift, ACC64DSP:$ac)),
+ (Instr ACC64DSP:$ac, immZExt5:$shift)>;
def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
@@ -1404,11 +1405,6 @@ def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
-// mflo/hi patterns.
-let AddedComplexity = 20 in
-def : DSPPat<(i32 (ExtractLOHI ACRegsDSP:$ac, imm:$lohi_idx)),
- (EXTRACT_SUBREG ACRegsDSP:$ac, imm:$lohi_idx)>;
-
// Indexed load patterns.
class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 545a38d..ffbd83b 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -421,8 +421,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
return false;
if (const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V))
- return !PSV->PseudoSourceValue::isConstant(0) &&
- (V != PseudoSourceValue::getStack());
+ return !PSV->isConstant(0) && V != PseudoSourceValue::getStack();
return true;
}
@@ -563,14 +562,13 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
RegDU.init(*Slot);
- if (searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler)) {
- MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
- MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
- ++UsefulSlots;
- return true;
- }
+ if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler))
+ return false;
- return false;
+ MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
+ MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+ ++UsefulSlots;
+ return true;
}
bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
@@ -584,14 +582,13 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
RegDU.setCallerSaved(*Slot);
- if (searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler)) {
- MBB.splice(llvm::next(Slot), &MBB, Filler);
- MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
- ++UsefulSlots;
- return true;
- }
+ if (!searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler))
+ return false;
- return false;
+ MBB.splice(llvm::next(Slot), &MBB, Filler);
+ MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+ ++UsefulSlots;
+ return true;
}
bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
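Both delay-slot hunks above make the same transformation: the success path that used to sit inside an if block now runs unconditionally after an early return on failure. A minimal self-contained sketch of that guard-clause rewrite, with illustrative names rather than the filler's own:

    #include <vector>

    // Before: the useful work is nested inside the success branch and the
    // failure return trails at the bottom of the function.
    bool fillSlotNested(std::vector<int> &Slots, int Candidate) {
      if (Candidate >= 0) {
        Slots.push_back(Candidate);   // the real work
        return true;
      }
      return false;
    }

    // After: bail out early on failure; the main path loses a nesting level.
    bool fillSlotEarlyReturn(std::vector<int> &Slots, int Candidate) {
      if (Candidate < 0)
        return false;

      Slots.push_back(Candidate);     // same work, now at the top level
      return true;
    }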
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 0002a5f..c417bd5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -69,6 +69,12 @@ bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
return false;
}
+bool MipsDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
llvm_unreachable("Unimplemented function.");
@@ -81,12 +87,83 @@ bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
return false;
}
+bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Offset, SDValue &Alias) {
llvm_unreachable("Unimplemented function.");
return false;
}
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+ llvm_unreachable("Unimplemented function.");
+ return false;
+}
+
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
@@ -98,6 +175,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
return NULL;
}
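The one functional change in the Select hunk above is the added Node->setNodeId(-1) for nodes that already carry a machine opcode; in this version of the SelectionDAG ISel interface, returning NULL means keep the node as is, and the negative id marks it as already selected so the selection machinery does not revisit it. A rough sketch of that guard in isolation (sketch only, not the full Mips selector):

    // Sketch: assumes the SDNode *Select(SDNode *) hook used by this LLVM
    // version, where returning NULL keeps the node unchanged.
    SDNode *MyDAGToDAGISel::Select(SDNode *Node) {
      if (Node->isMachineOpcode()) {
        Node->setNodeId(-1);    // mark as selected so it is not revisited
        return NULL;            // NULL == "use Node as is"
      }
      // ... target-specific selection would go here ...
      return SelectCode(Node);  // fall back to the TableGen-generated matcher
    }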
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index cf0f9c5..a4d9da5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -57,6 +57,11 @@ private:
virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ // Complex Pattern.
+ /// (reg + reg).
+ virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
/// Fall back on this function if all else fails.
virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
@@ -65,9 +70,42 @@ private:
virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Offset, SDValue &Alias);
+ /// \brief Select constant vector splats.
+ virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm1.
+ virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm2.
+ virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm3.
+ virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm4.
+ virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm5.
+ virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm6.
+ virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm8.
+ virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a simm5.
+ virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a power of 2.
+ virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is the inverse of a
+ /// power of 2.
+ virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// ending at the most significant bit
+ virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// starting at bit zero.
+ virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
virtual SDNode *Select(SDNode *N);
virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
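The header hunk only declares the splat-selection hooks; their bodies in MipsDAGToDAGISel are llvm_unreachable stubs, so the real implementations live in a subclass outside this diff. As a purely hypothetical illustration of what an override of one unsigned-immediate hook could look like (not code from this commit; it assumes the splat constant has already been reduced to a ConstantSDNode):

    #include "llvm/Support/MathExtras.h"   // for isUInt<N>

    // Hypothetical override: succeed only when the value fits in an
    // unsigned 5-bit field, and hand back the target constant the
    // instruction encoding expects.
    bool MyDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
      if (!C || !isUInt<5>(C->getZExtValue()))
        return false;

      Imm = CurDAG->getTargetConstant(C->getZExtValue(), N.getValueType());
      return true;
    }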
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index a62e84f..1e8250c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -20,6 +20,7 @@
#include "MipsTargetMachine.h"
#include "MipsTargetObjectFile.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -34,6 +35,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cctype>
using namespace llvm;
@@ -67,7 +69,7 @@ static const uint16_t Mips64DPRegs[8] = {
// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
if (!isShiftedMask_64(I))
- return false;
+ return false;
Size = CountPopulation_64(I);
Pos = countTrailingZeros(I);
@@ -79,72 +81,35 @@ SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
}
-static SDValue getTargetNode(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
- EVT Ty = Op.getValueType();
+SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
- if (GlobalAddressSDNode *N = dyn_cast<GlobalAddressSDNode>(Op))
- return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(Op), Ty, 0,
- Flag);
- if (ExternalSymbolSDNode *N = dyn_cast<ExternalSymbolSDNode>(Op))
- return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
- if (BlockAddressSDNode *N = dyn_cast<BlockAddressSDNode>(Op))
- return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
- if (JumpTableSDNode *N = dyn_cast<JumpTableSDNode>(Op))
- return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
- if (ConstantPoolSDNode *N = dyn_cast<ConstantPoolSDNode>(Op))
- return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
- N->getOffset(), Flag);
-
- llvm_unreachable("Unexpected node type.");
- return SDValue();
+SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
}
-static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
- SDLoc DL(Op);
- EVT Ty = Op.getValueType();
- SDValue Hi = getTargetNode(Op, DAG, MipsII::MO_ABS_HI);
- SDValue Lo = getTargetNode(Op, DAG, MipsII::MO_ABS_LO);
- return DAG.getNode(ISD::ADD, DL, Ty,
- DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
- DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
}
-SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
- bool HasMips64) const {
- SDLoc DL(Op);
- EVT Ty = Op.getValueType();
- unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
- SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
- getTargetNode(Op, DAG, GOTFlag));
- SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
- MachinePointerInfo::getGOT(), false, false, false,
- 0);
- unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
- SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(Op, DAG, LoFlag));
- return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
-}
-
-SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
+SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
unsigned Flag) const {
- SDLoc DL(Op);
- EVT Ty = Op.getValueType();
- SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
- getTargetNode(Op, DAG, Flag));
- return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Tgt,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
}
-SDValue MipsTargetLowering::getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
- unsigned HiFlag,
- unsigned LoFlag) const {
- SDLoc DL(Op);
- EVT Ty = Op.getValueType();
- SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(Op, DAG, HiFlag));
- Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
- SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
- getTargetNode(Op, DAG, LoFlag));
- return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Wrapper,
- MachinePointerInfo::getGOT(), false, false, false, 0);
+SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ N->getOffset(), Flag);
}
const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
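The getTargetNode hunk above trades a single static helper that probed the node with a chain of dyn_casts (and ended in llvm_unreachable) for one member overload per SDNode kind, so each caller binds the right variant at compile time. A minimal self-contained sketch of that overload-per-type shape, with toy types rather than the SelectionDAG ones:

    #include <cstdio>

    struct GlobalNode { const char *Name; };
    struct SymbolNode { const char *Sym; };

    // One overload per node kind: the compiler picks the right body, so no
    // run-time type test and no unreachable fallback are needed.
    void emitTargetNode(const GlobalNode &N) { std::printf("global %s\n", N.Name); }
    void emitTargetNode(const SymbolNode &N) { std::printf("symbol %s\n", N.Sym); }

    int main() {
      GlobalNode G = { "foo" };
      SymbolNode S = { "memcpy" };
      emitTargetNode(G);   // resolved statically
      emitTargetNode(S);
      return 0;
    }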
@@ -162,8 +127,9 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::CMovFP_T: return "MipsISD::CMovFP_T";
case MipsISD::CMovFP_F: return "MipsISD::CMovFP_F";
case MipsISD::TruncIntFP: return "MipsISD::TruncIntFP";
- case MipsISD::ExtractLOHI: return "MipsISD::ExtractLOHI";
- case MipsISD::InsertLOHI: return "MipsISD::InsertLOHI";
+ case MipsISD::MFHI: return "MipsISD::MFHI";
+ case MipsISD::MFLO: return "MipsISD::MFLO";
+ case MipsISD::MTLOHI: return "MipsISD::MTLOHI";
case MipsISD::Mult: return "MipsISD::Mult";
case MipsISD::Multu: return "MipsISD::Multu";
case MipsISD::MAdd: return "MipsISD::MAdd";
@@ -207,6 +173,30 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::SHRL_DSP: return "MipsISD::SHRL_DSP";
case MipsISD::SETCC_DSP: return "MipsISD::SETCC_DSP";
case MipsISD::SELECT_CC_DSP: return "MipsISD::SELECT_CC_DSP";
+ case MipsISD::VALL_ZERO: return "MipsISD::VALL_ZERO";
+ case MipsISD::VANY_ZERO: return "MipsISD::VANY_ZERO";
+ case MipsISD::VALL_NONZERO: return "MipsISD::VALL_NONZERO";
+ case MipsISD::VANY_NONZERO: return "MipsISD::VANY_NONZERO";
+ case MipsISD::VCEQ: return "MipsISD::VCEQ";
+ case MipsISD::VCLE_S: return "MipsISD::VCLE_S";
+ case MipsISD::VCLE_U: return "MipsISD::VCLE_U";
+ case MipsISD::VCLT_S: return "MipsISD::VCLT_S";
+ case MipsISD::VCLT_U: return "MipsISD::VCLT_U";
+ case MipsISD::VSMAX: return "MipsISD::VSMAX";
+ case MipsISD::VSMIN: return "MipsISD::VSMIN";
+ case MipsISD::VUMAX: return "MipsISD::VUMAX";
+ case MipsISD::VUMIN: return "MipsISD::VUMIN";
+ case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
+ case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
+ case MipsISD::VNOR: return "MipsISD::VNOR";
+ case MipsISD::VSHF: return "MipsISD::VSHF";
+ case MipsISD::SHF: return "MipsISD::SHF";
+ case MipsISD::ILVEV: return "MipsISD::ILVEV";
+ case MipsISD::ILVOD: return "MipsISD::ILVOD";
+ case MipsISD::ILVL: return "MipsISD::ILVL";
+ case MipsISD::ILVR: return "MipsISD::ILVR";
+ case MipsISD::PCKEV: return "MipsISD::PCKEV";
+ case MipsISD::PCKOD: return "MipsISD::PCKOD";
default: return NULL;
}
}
@@ -424,8 +414,8 @@ static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT Ty = N->getValueType(0);
- unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64;
- unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
+ unsigned LO = (Ty == MVT::i32) ? Mips::LO0 : Mips::LO0_64;
+ unsigned HI = (Ty == MVT::i32) ? Mips::HI0 : Mips::HI0_64;
unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
MipsISD::DivRemU16;
SDLoc DL(N);
@@ -566,7 +556,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
// Pattern match EXT.
// $dst = and ((sra or srl) $src , pos), (2**size - 1)
// => ext $dst, $src, size, pos
- if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+ if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
return SDValue();
SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
@@ -607,7 +597,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
// $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
// where mask1 = (2**size - 1) << pos, mask0 = ~mask1
// => ins $dst, $src, size, pos, $src1
- if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+ if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
return SDValue();
SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
@@ -779,13 +769,17 @@ static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
// Insert instruction "teq $divisor_reg, $zero, 7".
MachineBasicBlock::iterator I(MI);
MachineInstrBuilder MIB;
+ MachineOperand &Divisor = MI->getOperand(2);
MIB = BuildMI(MBB, llvm::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
- .addOperand(MI->getOperand(2)).addReg(Mips::ZERO).addImm(7);
+ .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+ .addReg(Mips::ZERO).addImm(7);
// Use the 32-bit sub-register if this is a 64-bit division.
if (Is64Bit)
MIB->getOperand(0).setSubReg(Mips::sub_32);
+ // Clear Divisor's kill flag.
+ Divisor.setIsKill(false);
return &MBB;
}
@@ -796,107 +790,75 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
default:
llvm_unreachable("Unexpected instr type to insert");
case Mips::ATOMIC_LOAD_ADD_I8:
- case Mips::ATOMIC_LOAD_ADD_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
case Mips::ATOMIC_LOAD_ADD_I16:
- case Mips::ATOMIC_LOAD_ADD_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
case Mips::ATOMIC_LOAD_ADD_I32:
- case Mips::ATOMIC_LOAD_ADD_I32_P8:
return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
case Mips::ATOMIC_LOAD_ADD_I64:
- case Mips::ATOMIC_LOAD_ADD_I64_P8:
return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
case Mips::ATOMIC_LOAD_AND_I8:
- case Mips::ATOMIC_LOAD_AND_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
case Mips::ATOMIC_LOAD_AND_I16:
- case Mips::ATOMIC_LOAD_AND_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
case Mips::ATOMIC_LOAD_AND_I32:
- case Mips::ATOMIC_LOAD_AND_I32_P8:
return emitAtomicBinary(MI, BB, 4, Mips::AND);
case Mips::ATOMIC_LOAD_AND_I64:
- case Mips::ATOMIC_LOAD_AND_I64_P8:
return emitAtomicBinary(MI, BB, 8, Mips::AND64);
case Mips::ATOMIC_LOAD_OR_I8:
- case Mips::ATOMIC_LOAD_OR_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
case Mips::ATOMIC_LOAD_OR_I16:
- case Mips::ATOMIC_LOAD_OR_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
case Mips::ATOMIC_LOAD_OR_I32:
- case Mips::ATOMIC_LOAD_OR_I32_P8:
return emitAtomicBinary(MI, BB, 4, Mips::OR);
case Mips::ATOMIC_LOAD_OR_I64:
- case Mips::ATOMIC_LOAD_OR_I64_P8:
return emitAtomicBinary(MI, BB, 8, Mips::OR64);
case Mips::ATOMIC_LOAD_XOR_I8:
- case Mips::ATOMIC_LOAD_XOR_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
case Mips::ATOMIC_LOAD_XOR_I16:
- case Mips::ATOMIC_LOAD_XOR_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
case Mips::ATOMIC_LOAD_XOR_I32:
- case Mips::ATOMIC_LOAD_XOR_I32_P8:
return emitAtomicBinary(MI, BB, 4, Mips::XOR);
case Mips::ATOMIC_LOAD_XOR_I64:
- case Mips::ATOMIC_LOAD_XOR_I64_P8:
return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
case Mips::ATOMIC_LOAD_NAND_I8:
- case Mips::ATOMIC_LOAD_NAND_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
case Mips::ATOMIC_LOAD_NAND_I16:
- case Mips::ATOMIC_LOAD_NAND_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
case Mips::ATOMIC_LOAD_NAND_I32:
- case Mips::ATOMIC_LOAD_NAND_I32_P8:
return emitAtomicBinary(MI, BB, 4, 0, true);
case Mips::ATOMIC_LOAD_NAND_I64:
- case Mips::ATOMIC_LOAD_NAND_I64_P8:
return emitAtomicBinary(MI, BB, 8, 0, true);
case Mips::ATOMIC_LOAD_SUB_I8:
- case Mips::ATOMIC_LOAD_SUB_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
case Mips::ATOMIC_LOAD_SUB_I16:
- case Mips::ATOMIC_LOAD_SUB_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
case Mips::ATOMIC_LOAD_SUB_I32:
- case Mips::ATOMIC_LOAD_SUB_I32_P8:
return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
case Mips::ATOMIC_LOAD_SUB_I64:
- case Mips::ATOMIC_LOAD_SUB_I64_P8:
return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
case Mips::ATOMIC_SWAP_I8:
- case Mips::ATOMIC_SWAP_I8_P8:
return emitAtomicBinaryPartword(MI, BB, 1, 0);
case Mips::ATOMIC_SWAP_I16:
- case Mips::ATOMIC_SWAP_I16_P8:
return emitAtomicBinaryPartword(MI, BB, 2, 0);
case Mips::ATOMIC_SWAP_I32:
- case Mips::ATOMIC_SWAP_I32_P8:
return emitAtomicBinary(MI, BB, 4, 0);
case Mips::ATOMIC_SWAP_I64:
- case Mips::ATOMIC_SWAP_I64_P8:
return emitAtomicBinary(MI, BB, 8, 0);
case Mips::ATOMIC_CMP_SWAP_I8:
- case Mips::ATOMIC_CMP_SWAP_I8_P8:
return emitAtomicCmpSwapPartword(MI, BB, 1);
case Mips::ATOMIC_CMP_SWAP_I16:
- case Mips::ATOMIC_CMP_SWAP_I16_P8:
return emitAtomicCmpSwapPartword(MI, BB, 2);
case Mips::ATOMIC_CMP_SWAP_I32:
- case Mips::ATOMIC_CMP_SWAP_I32_P8:
return emitAtomicCmpSwap(MI, BB, 4);
case Mips::ATOMIC_CMP_SWAP_I64:
- case Mips::ATOMIC_CMP_SWAP_I64_P8:
return emitAtomicCmpSwap(MI, BB, 8);
case Mips::PseudoSDIV:
case Mips::PseudoUDIV:
@@ -923,16 +885,16 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned LL, SC, AND, NOR, ZERO, BEQ;
if (Size == 4) {
- LL = IsN64 ? Mips::LL_P8 : Mips::LL;
- SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+ LL = Mips::LL;
+ SC = Mips::SC;
AND = Mips::AND;
NOR = Mips::NOR;
ZERO = Mips::ZERO;
BEQ = Mips::BEQ;
}
else {
- LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
- SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+ LL = Mips::LLD;
+ SC = Mips::SCD;
AND = Mips::AND64;
NOR = Mips::NOR64;
ZERO = Mips::ZERO_64;
@@ -958,8 +920,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
// Transfer the remainder of BB and its successor edges to exitMBB.
exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
+ llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
// thisMBB:
@@ -990,7 +951,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI->eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
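With the separate _P8 opcodes removed, emitAtomicBinary picks LL/SC (or LLD/SCD in the 64-bit case) directly and, as before, wraps the operation in a load-linked / store-conditional retry loop: ll, the binary op (or nor for nand), sc, then beq back to the loop head while the conditional store fails. A small self-contained model of what that loop computes for a 32-bit atomic add, using std::atomic as a stand-in for the ll/sc pair (conceptual only, not the code the expansion emits):

    #include <atomic>

    // Conceptual model of the expanded pseudo: retry until the conditional
    // store succeeds, and return the value that was in memory beforehand.
    // compare_exchange_weak failing plays the role of sc writing 0 when the
    // reservation established by ll was lost.
    int atomicLoadAdd32(std::atomic<int> &Mem, int Incr) {
      int OldVal = Mem.load();
      while (!Mem.compare_exchange_weak(OldVal, OldVal + Incr)) {
        // OldVal has been refreshed with the current contents; retry.
      }
      return OldVal;
    }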
@@ -1001,15 +962,13 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
unsigned Size, unsigned BinOpcode,
bool Nand) const {
assert((Size == 1 || Size == 2) &&
- "Unsupported size for EmitAtomicBinaryPartial.");
+ "Unsupported size for EmitAtomicBinaryPartial.");
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &RegInfo = MF->getRegInfo();
const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
- unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
- unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
unsigned Dest = MI->getOperand(0).getReg();
unsigned Ptr = MI->getOperand(1).getReg();
@@ -1106,7 +1065,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
// beq success,$0,loopMBB
BB = loopMBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+ BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
if (Nand) {
// and andres, oldval, incr2
// nor binopres, $0, andres
@@ -1120,7 +1079,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
// and newval, binopres, mask
BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
- } else {// atomic.swap
+ } else { // atomic.swap
// and newval, incr2, mask
BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
}
@@ -1129,7 +1088,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
.addReg(OldVal).addReg(Mask2);
BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
.addReg(MaskedOldVal0).addReg(NewVal);
- BuildMI(BB, DL, TII->get(SC), Success)
+ BuildMI(BB, DL, TII->get(Mips::SC), Success)
.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::BEQ))
.addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
@@ -1151,15 +1110,14 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
.addReg(SllRes).addImm(ShiftImm);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI->eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
+MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Size) const {
assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
MachineFunction *MF = BB->getParent();
@@ -1170,15 +1128,14 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
unsigned LL, SC, ZERO, BNE, BEQ;
if (Size == 4) {
- LL = IsN64 ? Mips::LL_P8 : Mips::LL;
- SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+ LL = Mips::LL;
+ SC = Mips::SC;
ZERO = Mips::ZERO;
BNE = Mips::BNE;
BEQ = Mips::BEQ;
- }
- else {
- LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
- SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+ } else {
+ LL = Mips::LLD;
+ SC = Mips::SCD;
ZERO = Mips::ZERO_64;
BNE = Mips::BNE64;
BEQ = Mips::BEQ64;
@@ -1233,7 +1190,7 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
BuildMI(BB, DL, TII->get(BEQ))
.addReg(Success).addReg(ZERO).addMBB(loop1MBB);
- MI->eraseFromParent(); // The instruction is gone now.
+ MI->eraseFromParent(); // The instruction is gone now.
return exitMBB;
}
@@ -1250,8 +1207,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
- unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
- unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
unsigned Dest = MI->getOperand(0).getReg();
unsigned Ptr = MI->getOperand(1).getReg();
@@ -1348,7 +1303,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
// and maskedoldval0,oldval,mask
// bne maskedoldval0,shiftedcmpval,sinkMBB
BB = loop1MBB;
- BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+ BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
.addReg(OldVal).addReg(Mask);
BuildMI(BB, DL, TII->get(Mips::BNE))
@@ -1364,7 +1319,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
.addReg(OldVal).addReg(Mask2);
BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
.addReg(MaskedOldVal1).addReg(ShiftedNewVal);
- BuildMI(BB, DL, TII->get(SC), Success)
+ BuildMI(BB, DL, TII->get(Mips::SC), Success)
.addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, DL, TII->get(Mips::BEQ))
.addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
@@ -1421,9 +1376,7 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, Addr);
}
-SDValue MipsTargetLowering::
-lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
-{
+SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
// The first operand is the chain, the second is the condition, the third is
// the block to branch to if the condition is true.
SDValue Chain = Op.getOperand(0);
@@ -1489,7 +1442,9 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
// FIXME there isn't actually debug info here
SDLoc DL(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ EVT Ty = Op.getValueType();
+ GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = N->getGlobal();
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
const MipsTargetObjectFile &TLOF =
@@ -1506,26 +1461,31 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
}
// %hi/%lo relocation
- return getAddrNonPIC(Op, DAG);
+ return getAddrNonPIC(N, Ty, DAG);
}
if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
- return getAddrLocal(Op, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, HasMips64);
if (LargeGOT)
- return getAddrGlobalLargeGOT(Op, DAG, MipsII::MO_GOT_HI16,
- MipsII::MO_GOT_LO16);
+ return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
+ MipsII::MO_GOT_LO16, DAG.getEntryNode(),
+ MachinePointerInfo::getGOT());
- return getAddrGlobal(Op, DAG,
- HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16);
+ return getAddrGlobal(N, Ty, DAG,
+ HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16,
+ DAG.getEntryNode(), MachinePointerInfo::getGOT());
}
SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
+ BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+ EVT Ty = Op.getValueType();
+
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
- return getAddrNonPIC(Op, DAG);
+ return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(Op, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, HasMips64);
}
SDValue MipsTargetLowering::
@@ -1612,10 +1572,13 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
SDValue MipsTargetLowering::
lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
{
+ JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+ EVT Ty = Op.getValueType();
+
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
- return getAddrNonPIC(Op, DAG);
+ return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(Op, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, HasMips64);
}
SDValue MipsTargetLowering::
@@ -1630,11 +1593,13 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
// SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
// SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
// ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
+ ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+ EVT Ty = Op.getValueType();
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
- return getAddrNonPIC(Op, DAG);
+ return getAddrNonPIC(N, Ty, DAG);
- return getAddrLocal(Op, DAG, HasMips64);
+ return getAddrLocal(N, Ty, DAG, HasMips64);
}
SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -1652,7 +1617,8 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV), false, false, 0);
}
-static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
EVT TyX = Op.getOperand(0).getValueType();
EVT TyY = Op.getOperand(1).getValueType();
SDValue Const1 = DAG.getConstant(1, MVT::i32);
@@ -1671,7 +1637,7 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
Const1);
- if (HasR2) {
+ if (HasExtractInsert) {
// ext E, Y, 31, 1 ; extract bit31 of Y
// ins X, E, 31, 1 ; insert extracted bit at bit31 of X
SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
@@ -1697,7 +1663,8 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
}
-static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
@@ -1708,7 +1675,7 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
- if (HasR2) {
+ if (HasExtractInsert) {
// ext E, Y, width(Y) - 1, 1 ; extract bit width(Y)-1 of Y
// ins X, E, width(X) - 1, 1 ; insert extracted bit at bit width(X)-1 of X
SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
@@ -1748,12 +1715,13 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
SDValue
MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->hasMips64())
- return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2());
+ return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert());
- return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2());
+ return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert());
}
-static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
SDLoc DL(Op);
@@ -1765,7 +1733,7 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
Const1);
// Clear MSB.
- if (HasR2)
+ if (HasExtractInsert)
Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32,
DAG.getRegister(Mips::ZERO, MVT::i32),
DAG.getConstant(31, MVT::i32), Const1, X);
@@ -1782,7 +1750,8 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
}
-static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG,
+ bool HasExtractInsert) {
SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
SDLoc DL(Op);
@@ -1790,7 +1759,7 @@ static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
// Clear MSB.
- if (HasR2)
+ if (HasExtractInsert)
Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64,
DAG.getRegister(Mips::ZERO_64, MVT::i64),
DAG.getConstant(63, MVT::i32), Const1, X);
@@ -1805,9 +1774,9 @@ static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
SDValue
MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64))
- return lowerFABS64(Op, DAG, Subtarget->hasMips32r2());
+ return lowerFABS64(Op, DAG, Subtarget->hasExtractInsert());
- return lowerFABS32(Op, DAG, Subtarget->hasMips32r2());
+ return lowerFABS32(Op, DAG, Subtarget->hasExtractInsert());
}
SDValue MipsTargetLowering::
@@ -2152,21 +2121,14 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
// For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
//===----------------------------------------------------------------------===//
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+ CCState &State, const uint16_t *F64Regs) {
- static const unsigned IntRegsSize=4, FloatRegsSize=2;
+ static const unsigned IntRegsSize = 4, FloatRegsSize = 2;
- static const uint16_t IntRegs[] = {
- Mips::A0, Mips::A1, Mips::A2, Mips::A3
- };
- static const uint16_t F32Regs[] = {
- Mips::F12, Mips::F14
- };
- static const uint16_t F64Regs[] = {
- Mips::D6, Mips::D7
- };
+ static const uint16_t IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+ static const uint16_t F32Regs[] = { Mips::F12, Mips::F14 };
// Do not process byval args here.
if (ArgFlags.isByVal())
@@ -2235,14 +2197,28 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
return false;
}
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static const uint16_t F64Regs[] = { Mips::D6, Mips::D7 };
+
+ return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ static const uint16_t F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+
+ return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
#include "MipsGenCallingConv.inc"
//===----------------------------------------------------------------------===//
// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
-static const unsigned O32IntRegsSize = 4;
-
// Return next O32 integer argument register.
static unsigned getNextIntArgReg(unsigned Reg) {
assert((Reg == Mips::A0) || (Reg == Mips::A2));
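The hunk above splits the O32 calling-convention helper: CC_MipsO32 now takes its F64 register table as a parameter, and the new CC_MipsO32_FP32 / CC_MipsO32_FP64 wrappers differ only in which static array they pass (D6/D7 for the FP32 register file, D12_64/D14_64 for FP64). A small self-contained sketch of that parameterisation pattern, with placeholder numbers instead of the real Mips register enums:

    #include <cstdint>

    // Shared worker: identical allocation logic, only the double-precision
    // register file differs between the two ABI flavours.
    static uint16_t allocateF64Arg(unsigned ArgNo, const uint16_t *F64Regs,
                                   unsigned NumRegs) {
      return ArgNo < NumRegs ? F64Regs[ArgNo] : 0;   // 0 == pass on the stack
    }

    static uint16_t allocateF64ArgFP32(unsigned ArgNo) {
      static const uint16_t F64Regs[] = { 6, 7 };    // stand-ins for D6, D7
      return allocateF64Arg(ArgNo, F64Regs, 2);
    }

    static uint16_t allocateF64ArgFP64(unsigned ArgNo) {
      static const uint16_t F64Regs[] = { 12, 14 };  // stand-ins for D12_64, D14_64
      return allocateF64Arg(ArgNo, F64Regs, 2);
    }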
@@ -2339,6 +2315,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
+ MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
// Analyze operands of the call, assigning locations to each operand.
@@ -2347,10 +2324,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
getTargetMachine(), ArgLocs, *DAG.getContext());
MipsCC::SpecialCallingConvType SpecialCallingConv =
getSpecialCallingConv(Callee);
- MipsCC MipsCCInfo(CallConv, IsO32, CCInfo, SpecialCallingConv);
+ MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo,
+ SpecialCallingConv);
MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
- getTargetMachine().Options.UseSoftFloat,
+ Subtarget->mipsSEUsesSoftFloat(),
Callee.getNode(), CLI.Args);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2467,32 +2445,40 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25
bool GlobalOrExternal = false, InternalLinkage = false;
SDValue CalleeLo;
+ EVT Ty = Callee.getValueType();
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
if (IsPICCall) {
- InternalLinkage = G->getGlobal()->hasInternalLinkage();
+ const GlobalValue *Val = G->getGlobal();
+ InternalLinkage = Val->hasInternalLinkage();
if (InternalLinkage)
- Callee = getAddrLocal(Callee, DAG, HasMips64);
+ Callee = getAddrLocal(G, Ty, DAG, HasMips64);
else if (LargeGOT)
- Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
- MipsII::MO_CALL_LO16);
+ Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
+ MipsII::MO_CALL_LO16, Chain,
+ FuncInfo->callPtrInfo(Val));
else
- Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+ Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+ FuncInfo->callPtrInfo(Val));
} else
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
MipsII::MO_NO_FLAG);
GlobalOrExternal = true;
}
else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const char *Sym = S->getSymbol();
+
if (!IsN64 && !IsPIC) // !N64 && static
- Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+ Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
MipsII::MO_NO_FLAG);
else if (LargeGOT)
- Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
- MipsII::MO_CALL_LO16);
+ Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
+ MipsII::MO_CALL_LO16, Chain,
+ FuncInfo->callPtrInfo(Sym));
else // N64 || PIC
- Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+ Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+ FuncInfo->callPtrInfo(Sym));
GlobalOrExternal = true;
}
@@ -2534,9 +2520,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), RVLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+ MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
- MipsCCInfo.analyzeCallResult(Ins, getTargetMachine().Options.UseSoftFloat,
+ MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(),
CallNode, RetTy);
// Copy all of the result registers out of their specified physreg.
@@ -2581,10 +2567,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+ MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
Function::const_arg_iterator FuncArg =
DAG.getMachineFunction().getFunction()->arg_begin();
- bool UseSoftFloat = getTargetMachine().Options.UseSoftFloat;
+ bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat();
MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
@@ -2613,21 +2599,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
// Arguments stored on registers
if (IsRegLoc) {
- EVT RegVT = VA.getLocVT();
+ MVT RegVT = VA.getLocVT();
unsigned ArgReg = VA.getLocReg();
- const TargetRegisterClass *RC;
-
- if (RegVT == MVT::i32)
- RC = Subtarget->inMips16Mode()? &Mips::CPU16RegsRegClass :
- &Mips::GPR32RegClass;
- else if (RegVT == MVT::i64)
- RC = &Mips::GPR64RegClass;
- else if (RegVT == MVT::f32)
- RC = &Mips::FGR32RegClass;
- else if (RegVT == MVT::f64)
- RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
- else
- llvm_unreachable("RegVT not supported by FormalArguments Lowering");
+ const TargetRegisterClass *RC = getRegClassFor(RegVT);
// Transform the arguments stored on
// physical registers into virtual ones
@@ -2677,9 +2651,11 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
// Create load nodes to retrieve arguments from the stack
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- InVals.push_back(DAG.getLoad(ValVT, DL, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI),
- false, false, false, 0));
+ SDValue Load = DAG.getLoad(ValVT, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ false, false, false, 0);
+ InVals.push_back(Load);
+ OutChains.push_back(Load.getValue(1));
}
}
@@ -2740,10 +2716,10 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
// CCState - Info about the registers and stack slot.
CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
*DAG.getContext());
- MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+ MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
// Analyze return values.
- MipsCCInfo.analyzeReturn(Outs, getTargetMachine().Options.UseSoftFloat,
+ MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(),
MF.getFunction()->getReturnType());
SDValue Flag;
@@ -2802,7 +2778,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
MipsTargetLowering::ConstraintType MipsTargetLowering::
getConstraintType(const std::string &Constraint) const
{
- // Mips specific constrainy
+ // Mips specific constraints
// GCC config/mips/constraints.md
//
// 'd' : An address register. Equivalent to r
@@ -2853,16 +2829,19 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
if (type->isIntegerTy())
weight = CW_Register;
break;
- case 'f':
- if (type->isFloatTy())
+ case 'f': // FPU or MSA register
+ if (Subtarget->hasMSA() && type->isVectorTy() &&
+ cast<VectorType>(type)->getBitWidth() == 128)
+ weight = CW_Register;
+ else if (type->isFloatTy())
weight = CW_Register;
break;
case 'c': // $25 for indirect jumps
case 'l': // lo register
case 'x': // hilo register pair
- if (type->isIntegerTy())
+ if (type->isIntegerTy())
weight = CW_SpecificReg;
- break;
+ break;
case 'I': // signed 16 bit immediate
case 'J': // integer zero
case 'K': // unsigned 16 bit immediate
@@ -2880,6 +2859,104 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
return weight;
}
+/// This is a helper function to parse a physical register string and split it
+/// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag
+/// that is returned indicates whether parsing was successful. The second flag
+/// is true if the numeric part exists.
+static std::pair<bool, bool>
+parsePhysicalReg(const StringRef &C, std::string &Prefix,
+ unsigned long long &Reg) {
+ if (C.front() != '{' || C.back() != '}')
+ return std::make_pair(false, false);
+
+ // Search for the first numeric character.
+ StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
+ I = std::find_if(B, E, std::ptr_fun(isdigit));
+
+ Prefix.assign(B, I - B);
+
+ // The second flag is set to false if no numeric characters were found.
+ if (I == E)
+ return std::make_pair(true, false);
+
+ // Parse the numeric characters.
+ return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
+ true);
+}
+
+std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
+parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterClass *RC;
+ std::string Prefix;
+ unsigned long long Reg;
+
+ std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
+
+ if (!R.first)
+ return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+ if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
+ // No numeric characters follow "hi" or "lo".
+ if (R.second)
+ return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+ RC = TRI->getRegClass(Prefix == "hi" ?
+ Mips::HI32RegClassID : Mips::LO32RegClassID);
+ return std::make_pair(*(RC->begin()), RC);
+ } else if (Prefix.compare(0, 4, "$msa") == 0) {
+ // Parse $msa(ir|csr|access|save|modify|request|map|unmap)
+
+ // No numeric characters follow the name.
+ if (R.second)
+ return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+
+ Reg = StringSwitch<unsigned long long>(Prefix)
+ .Case("$msair", Mips::MSAIR)
+ .Case("$msacsr", Mips::MSACSR)
+ .Case("$msaaccess", Mips::MSAAccess)
+ .Case("$msasave", Mips::MSASave)
+ .Case("$msamodify", Mips::MSAModify)
+ .Case("$msarequest", Mips::MSARequest)
+ .Case("$msamap", Mips::MSAMap)
+ .Case("$msaunmap", Mips::MSAUnmap)
+ .Default(0);
+
+ if (!Reg)
+ return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+
+ RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
+ return std::make_pair(Reg, RC);
+ }
+
+ if (!R.second)
+ return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+ if (Prefix == "$f") { // Parse $f0-$f31.
+ // If the size of FP registers is 64-bit or Reg is an even number, select
+ // the 64-bit register class. Otherwise, select the 32-bit register class.
+ if (VT == MVT::Other)
+ VT = (Subtarget->isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+
+ RC = getRegClassFor(VT);
+
+ if (RC == &Mips::AFGR64RegClass) {
+ assert(Reg % 2 == 0);
+ Reg >>= 1;
+ }
+ } else if (Prefix == "$fcc") // Parse $fcc0-$fcc7.
+ RC = TRI->getRegClass(Mips::FCCRegClassID);
+ else if (Prefix == "$w") { // Parse $w0-$w31.
+ RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
+ } else { // Parse $0-$31.
+ assert(Prefix == "$");
+ RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
+ }
+
+ assert(Reg < RC->getNumRegs());
+ return std::make_pair(*(RC->begin() + Reg), RC);
+}
+
/// Given a register class constraint, like 'r', if this corresponds directly
/// to an LLVM register class, return a register of 0 and the register class
/// pointer.
@@ -2902,10 +2979,18 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
return std::make_pair(0U, &Mips::GPR64RegClass);
// This will generate an error message
return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
- case 'f':
- if (VT == MVT::f32)
+ case 'f': // FPU or MSA register
+ if (VT == MVT::v16i8)
+ return std::make_pair(0U, &Mips::MSA128BRegClass);
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
+ return std::make_pair(0U, &Mips::MSA128HRegClass);
+ else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return std::make_pair(0U, &Mips::MSA128WRegClass);
+ else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+ return std::make_pair(0U, &Mips::MSA128DRegClass);
+ else if (VT == MVT::f32)
return std::make_pair(0U, &Mips::FGR32RegClass);
- if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
+ else if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
if (Subtarget->isFP64bit())
return std::make_pair(0U, &Mips::FGR64RegClass);
return std::make_pair(0U, &Mips::AFGR64RegClass);
@@ -2918,14 +3003,21 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
case 'l': // register suitable for indirect jump
if (VT == MVT::i32)
- return std::make_pair((unsigned)Mips::LO, &Mips::LORegsRegClass);
- return std::make_pair((unsigned)Mips::LO64, &Mips::LORegs64RegClass);
+ return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
+ return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
case 'x': // register suitable for indirect jump
// FIXME: Not triggering the use of both hi and lo
// This will generate an error message
return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
}
}
+
+ std::pair<unsigned, const TargetRegisterClass *> R;
+ R = parseRegForInlineAsmConstraint(Constraint, VT);
+
+ if (R.second)
+ return R;
+
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}
@@ -3024,8 +3116,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
-bool
-MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM, Type *Ty) const {
+bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
@@ -3089,13 +3181,13 @@ static bool isF128SoftLibCall(const char *CallSym) {
"log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
"truncl"};
- const char * const *End = LibCalls + array_lengthof(LibCalls);
+ const char *const *End = LibCalls + array_lengthof(LibCalls);
// Check that LibCalls is sorted alphabetically.
MipsTargetLowering::LTStr Comp;
#ifndef NDEBUG
- for (const char * const *I = LibCalls; I < End - 1; ++I)
+ for (const char *const *I = LibCalls; I < End - 1; ++I)
assert(Comp(*I, *(I + 1)));
#endif
@@ -3133,9 +3225,9 @@ MipsTargetLowering::MipsCC::SpecialCallingConvType
}
MipsTargetLowering::MipsCC::MipsCC(
- CallingConv::ID CC, bool IsO32_, CCState &Info,
- MipsCC::SpecialCallingConvType SpecialCallingConv_)
- : CCInfo(Info), CallConv(CC), IsO32(IsO32_),
+ CallingConv::ID CC, bool IsO32_, bool IsFP64_, CCState &Info,
+ MipsCC::SpecialCallingConvType SpecialCallingConv_)
+ : CCInfo(Info), CallConv(CC), IsO32(IsO32_), IsFP64(IsFP64_),
SpecialCallingConv(SpecialCallingConv_){
// Pre-allocate reserved argument area.
CCInfo.AllocateStack(reservedArgArea(), 1);
@@ -3249,11 +3341,10 @@ analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
analyzeReturn(Outs, IsSoftFloat, 0, RetTy);
}
-void
-MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
- MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags) {
+void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
+ MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags) {
assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
struct ByValArgInfo ByVal;
@@ -3291,11 +3382,11 @@ llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
if (SpecialCallingConv == Mips16RetHelperConv)
return CC_Mips16RetHelper;
- return IsO32 ? CC_MipsO32 : CC_MipsN;
+ return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN;
}
llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
- return IsO32 ? CC_MipsO32 : CC_MipsN_VarArg;
+ return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg;
}
const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const {
@@ -3473,17 +3564,15 @@ passByValArg(SDValue Chain, SDLoc DL,
DAG.getConstant(Offset, PtrTy));
SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
DAG.getIntPtrConstant(ByVal.Address));
- Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
- DAG.getConstant(MemCpySize, PtrTy), Alignment,
- /*isVolatile=*/false, /*AlwaysInline=*/false,
+ Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy),
+ Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
MachinePointerInfo(0), MachinePointerInfo(0));
MemOpChains.push_back(Chain);
}
-void
-MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
- const MipsCC &CC, SDValue Chain,
- SDLoc DL, SelectionDAG &DAG) const {
+void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+ const MipsCC &CC, SDValue Chain,
+ SDLoc DL, SelectionDAG &DAG) const {
unsigned NumRegs = CC.numIntArgRegs();
const uint16_t *ArgRegs = CC.intArgRegs();
const CCState &CCInfo = CC.getCCInfo();
@@ -3501,8 +3590,7 @@ MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
if (NumRegs == Idx)
VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
else
- VaArgOffset =
- (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+ VaArgOffset = (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
// Record the frame index of the first variable argument
// which is a value necessary to VASTART.
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 123a2a6..65f68f0 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -17,6 +17,7 @@
#include "Mips.h"
#include "MipsSubtarget.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"
@@ -69,10 +70,11 @@ namespace llvm {
EH_RETURN,
// Node used to extract integer from accumulator.
- ExtractLOHI,
+ MFHI,
+ MFLO,
// Node used to insert integers to accumulator.
- InsertLOHI,
+ MTLOHI,
// Mult nodes.
Mult,
@@ -152,6 +154,43 @@ namespace llvm {
SETCC_DSP,
SELECT_CC_DSP,
+ // Vector comparisons.
+ // These take a vector and return a boolean.
+ VALL_ZERO,
+ VANY_ZERO,
+ VALL_NONZERO,
+ VANY_NONZERO,
+
+ // These take a vector and return a vector bitmask.
+ VCEQ,
+ VCLE_S,
+ VCLE_U,
+ VCLT_S,
+ VCLT_U,
+
+ // Element-wise vector max/min.
+ VSMAX,
+ VSMIN,
+ VUMAX,
+ VUMIN,
+
+ // Vector Shuffle with mask as an operand
+ VSHF, // Generic shuffle
+ SHF, // 4-element set shuffle.
+ ILVEV, // Interleave even elements
+ ILVOD, // Interleave odd elements
+ ILVL, // Interleave left elements
+ ILVR, // Interleave right elements
+ PCKEV, // Pack even elements
+ PCKOD, // Pack odd elements
+
+ // Combined (XOR (OR $a, $b), -1)
+ VNOR,
+
+ // Extended vector element extraction
+ VEXTRACT_SEXT_ELT,
+ VEXTRACT_ZEXT_ELT,
+
// Load/Store Left/Right nodes.
LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
LWR,
@@ -211,12 +250,72 @@ namespace llvm {
protected:
SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
- SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) const;
-
- SDValue getAddrGlobal(SDValue Op, SelectionDAG &DAG, unsigned Flag) const;
-
- SDValue getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
- unsigned HiFlag, unsigned LoFlag) const;
+ // This method creates the following nodes, which are necessary for
+ // computing a local symbol's address:
+ //
+ // (add (load (wrapper $gp, %got(sym)), %lo(sym))
+ template<class NodeTy>
+ SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+ bool HasMips64) const {
+ SDLoc DL(N);
+ unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+ SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+ getTargetNode(N, Ty, DAG, GOTFlag));
+ SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+ MachinePointerInfo::getGOT(), false, false,
+ false, 0);
+ unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+ SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
+ getTargetNode(N, Ty, DAG, LoFlag));
+ return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a global symbol's address:
+ //
+ // (load (wrapper $gp, %got(sym)))
+ template<class NodeTy>
+ SDValue getAddrGlobal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag, SDValue Chain,
+ const MachinePointerInfo &PtrInfo) const {
+ SDLoc DL(N);
+ SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+ getTargetNode(N, Ty, DAG, Flag));
+ return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a global symbol's address in large-GOT mode:
+ //
+ // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
+ template<class NodeTy>
+ SDValue getAddrGlobalLargeGOT(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+ unsigned HiFlag, unsigned LoFlag,
+ SDValue Chain,
+ const MachinePointerInfo &PtrInfo) const {
+ SDLoc DL(N);
+ SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty,
+ getTargetNode(N, Ty, DAG, HiFlag));
+ Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+ SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+ getTargetNode(N, Ty, DAG, LoFlag));
+ return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo, false, false, false,
+ 0);
+ }
+
+ // This method creates the following nodes, which are necessary for
+ // computing a symbol's address in non-PIC mode:
+ //
+ // (add %hi(sym), %lo(sym))
+ template<class NodeTy>
+ SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
+ SDLoc DL(N);
+ SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+ SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+ return DAG.getNode(ISD::ADD, DL, Ty,
+ DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+ DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+ }
/// This function fills Ops, which is the list of operands that will later
/// be used when a function call node is created. It also generates
@@ -244,9 +343,8 @@ namespace llvm {
Mips16RetHelperConv, NoSpecialCallingConv
};
- MipsCC(
- CallingConv::ID CallConv, bool IsO32, CCState &Info,
- SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
+ MipsCC(CallingConv::ID CallConv, bool IsO32, bool IsFP64, CCState &Info,
+ SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -319,17 +417,39 @@ namespace llvm {
CCState &CCInfo;
CallingConv::ID CallConv;
- bool IsO32;
+ bool IsO32, IsFP64;
SpecialCallingConvType SpecialCallingConv;
SmallVector<ByValArgInfo, 2> ByValArgs;
};
protected:
+ SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
// Subtarget Info
const MipsSubtarget *Subtarget;
bool HasMips64, IsN64, IsO32;
private:
+ // Create a TargetGlobalAddress node.
+ SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetExternalSymbol node.
+ SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetBlockAddress node.
+ SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetJumpTable node.
+ SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+ // Create a TargetConstantPool node.
+ SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
MipsCC::SpecialCallingConvType getSpecialCallingConv(SDValue Callee) const;
// Lower Operand helpers
@@ -361,8 +481,6 @@ namespace llvm {
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
bool IsSRA) const;
- SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
@@ -433,6 +551,11 @@ namespace llvm {
ConstraintWeight getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const;
+ /// This function parses registers that appear in inline-asm constraints.
+ /// It returns pair (0, 0) on failure.
+ std::pair<unsigned, const TargetRegisterClass *>
+ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const;
+
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint,
MVT VT) const;
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index b992e77..9f7ce9a 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -99,9 +99,9 @@ class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
SDPatternOperator OpNode = null_frag> {
- def _D32 : ADDS_FT<opstr, AFGR64RegsOpnd, Itin, IsComm, OpNode>,
+ def _D32 : ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
Requires<[NotFP64bit, HasStdEnc]>;
- def _D64 : ADDS_FT<opstr, FGR64RegsOpnd, Itin, IsComm, OpNode>,
+ def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>,
Requires<[IsFP64bit, HasStdEnc]> {
string DecoderNamespace = "Mips64";
}
@@ -115,18 +115,18 @@ class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
multiclass ABSS_M<string opstr, InstrItinClass Itin,
SDPatternOperator OpNode= null_frag> {
- def _D32 : ABSS_FT<opstr, AFGR64RegsOpnd, AFGR64RegsOpnd, Itin, OpNode>,
+ def _D32 : ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
Requires<[NotFP64bit, HasStdEnc]>;
- def _D64 : ABSS_FT<opstr, FGR64RegsOpnd, FGR64RegsOpnd, Itin, OpNode>,
+ def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
Requires<[IsFP64bit, HasStdEnc]> {
string DecoderNamespace = "Mips64";
}
}
multiclass ROUND_M<string opstr, InstrItinClass Itin> {
- def _D32 : ABSS_FT<opstr, FGR32RegsOpnd, AFGR64RegsOpnd, Itin>,
+ def _D32 : ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
Requires<[NotFP64bit, HasStdEnc]>;
- def _D64 : ABSS_FT<opstr, FGR32RegsOpnd, FGR64RegsOpnd, Itin>,
+ def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>,
Requires<[IsFP64bit, HasStdEnc]> {
let DecoderNamespace = "Mips64";
}
@@ -143,16 +143,16 @@ class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
[(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
- Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
- InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> {
let DecoderMethod = "DecodeFMem";
let mayLoad = 1;
}
class SW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
- Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
- InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ SDPatternOperator OpNode= null_frag> :
+ InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> {
let DecoderMethod = "DecodeFMem";
let mayStore = 1;
@@ -171,19 +171,19 @@ class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
[(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
Itin, FrmFR>;
-class LWXC1_FT<string opstr, RegisterOperand DRC, RegisterOperand PRC,
+class LWXC1_FT<string opstr, RegisterOperand DRC,
InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
- InstSE<(outs DRC:$fd), (ins PRC:$base, PRC:$index),
+ InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
!strconcat(opstr, "\t$fd, ${index}(${base})"),
- [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI> {
+ [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin, FrmFI> {
let AddedComplexity = 20;
}
-class SWXC1_FT<string opstr, RegisterOperand DRC, RegisterOperand PRC,
+class SWXC1_FT<string opstr, RegisterOperand DRC,
InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
- InstSE<(outs), (ins DRC:$fs, PRC:$base, PRC:$index),
+ InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
!strconcat(opstr, "\t$fs, ${index}(${base})"),
- [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI> {
+ [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin, FrmFI> {
let AddedComplexity = 20;
}
@@ -231,24 +231,24 @@ multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt> {
def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC>, C_COND_FM<fmt, 15>;
}
-defm S : C_COND_M<"s", FGR32RegsOpnd, 16>;
-defm D32 : C_COND_M<"d", AFGR64RegsOpnd, 17>,
+defm S : C_COND_M<"s", FGR32Opnd, 16>;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17>,
Requires<[NotFP64bit, HasStdEnc]>;
let DecoderNamespace = "Mips64" in
-defm D64 : C_COND_M<"d", FGR64RegsOpnd, 17>, Requires<[IsFP64bit, HasStdEnc]>;
+defm D64 : C_COND_M<"d", FGR64Opnd, 17>, Requires<[IsFP64bit, HasStdEnc]>;
//===----------------------------------------------------------------------===//
// Floating Point Instructions
//===----------------------------------------------------------------------===//
-def ROUND_W_S : ABSS_FT<"round.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def ROUND_W_S : ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0xc, 16>;
-def TRUNC_W_S : ABSS_FT<"trunc.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def TRUNC_W_S : ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0xd, 16>;
-def CEIL_W_S : ABSS_FT<"ceil.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CEIL_W_S : ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0xe, 16>;
-def FLOOR_W_S : ABSS_FT<"floor.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def FLOOR_W_S : ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0xf, 16>;
-def CVT_W_S : ABSS_FT<"cvt.w.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CVT_W_S : ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x24, 16>;
defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>;
@@ -258,71 +258,71 @@ defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>;
defm CVT_W : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>;
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def ROUND_L_S : ABSS_FT<"round.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x8, 16>;
- def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0x8, 17>;
- def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x9, 16>;
- def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0x9, 17>;
- def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0xa, 16>;
- def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0xa, 17>;
- def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0xb, 16>;
- def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0xb, 17>;
}
-def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x20, 20>;
-def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x25, 16>;
-def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0x25, 17>;
let Predicates = [NotFP64bit, HasStdEnc] in {
- def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32RegsOpnd, AFGR64RegsOpnd, IIFcvt>,
+ def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, IIFcvt>,
ABSS_FM<0x20, 17>;
- def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x21, 20>;
- def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x21, 16>;
}
let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0x20, 17>;
- def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0x20, 21>;
- def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x21, 20>;
- def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64RegsOpnd, FGR32RegsOpnd, IIFcvt>,
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
ABSS_FM<0x21, 16>;
- def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64RegsOpnd, FGR64RegsOpnd, IIFcvt>,
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, IIFcvt>,
ABSS_FM<0x21, 21>;
}
let isPseudo = 1, isCodeGenOnly = 1 in {
- def PseudoCVT_S_W : ABSS_FT<"", FGR32RegsOpnd, GPR32Opnd, IIFcvt>;
- def PseudoCVT_D32_W : ABSS_FT<"", AFGR64RegsOpnd, GPR32Opnd, IIFcvt>;
- def PseudoCVT_S_L : ABSS_FT<"", FGR64RegsOpnd, GPR64Opnd, IIFcvt>;
- def PseudoCVT_D64_W : ABSS_FT<"", FGR64RegsOpnd, GPR32Opnd, IIFcvt>;
- def PseudoCVT_D64_L : ABSS_FT<"", FGR64RegsOpnd, GPR64Opnd, IIFcvt>;
+ def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, IIFcvt>;
+ def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, IIFcvt>;
+ def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
+ def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, IIFcvt>;
+ def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
}
let Predicates = [NoNaNsFPMath, HasStdEnc] in {
- def FABS_S : ABSS_FT<"abs.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt, fabs>,
+ def FABS_S : ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, IIFcvt, fabs>,
ABSS_FM<0x5, 16>;
- def FNEG_S : ABSS_FT<"neg.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFcvt, fneg>,
+ def FNEG_S : ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, IIFcvt, fneg>,
ABSS_FM<0x7, 16>;
defm FABS : ABSS_M<"abs.d", IIFcvt, fabs>, ABSS_FM<0x5, 17>;
defm FNEG : ABSS_M<"neg.d", IIFcvt, fneg>, ABSS_FM<0x7, 17>;
}
-def FSQRT_S : ABSS_FT<"sqrt.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFsqrtSingle,
+def FSQRT_S : ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, IIFsqrtSingle,
fsqrt>, ABSS_FM<0x4, 16>;
defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
@@ -334,164 +334,134 @@ defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
/// Move Control Registers From/To CPU Registers
def CFC1 : MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, IIFmove>, MFC1_FM<2>;
def CTC1 : MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, IIFmove>, MFC1_FM<6>;
-def MFC1 : MFC1_FT<"mfc1", GPR32Opnd, FGR32RegsOpnd, IIFmoveC1, bitconvert>,
+def MFC1 : MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, IIFmoveC1, bitconvert>,
MFC1_FM<0>;
-def MTC1 : MTC1_FT<"mtc1", FGR32RegsOpnd, GPR32Opnd, IIFmoveC1, bitconvert>,
+def MTC1 : MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, IIFmoveC1, bitconvert>,
MFC1_FM<4>;
-def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64RegsOpnd, IIFmoveC1,
+def MFHC1 : MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, IIFmoveC1>,
+ MFC1_FM<3>;
+def MTHC1 : MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, IIFmoveC1>,
+ MFC1_FM<7>;
+def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, IIFmoveC1,
bitconvert>, MFC1_FM<1>;
-def DMTC1 : MTC1_FT<"dmtc1", FGR64RegsOpnd, GPR64Opnd, IIFmoveC1,
+def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, IIFmoveC1,
bitconvert>, MFC1_FM<5>;
-def FMOV_S : ABSS_FT<"mov.s", FGR32RegsOpnd, FGR32RegsOpnd, IIFmove>,
+def FMOV_S : ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, IIFmove>,
ABSS_FM<0x6, 16>;
-def FMOV_D32 : ABSS_FT<"mov.d", AFGR64RegsOpnd, AFGR64RegsOpnd, IIFmove>,
+def FMOV_D32 : ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, IIFmove>,
ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>;
-def FMOV_D64 : ABSS_FT<"mov.d", FGR64RegsOpnd, FGR64RegsOpnd, IIFmove>,
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, IIFmove>,
ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> {
let DecoderNamespace = "Mips64";
}
/// Floating Point Memory Instructions
-let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
- def LWC1_P8 : LW_FT<"lwc1", FGR32RegsOpnd, IIFLoad, mem64, load>,
- LW_FM<0x31>;
- def SWC1_P8 : SW_FT<"swc1", FGR32RegsOpnd, IIFStore, mem64, store>,
- LW_FM<0x39>;
- def LDC164_P8 : LW_FT<"ldc1", FGR64RegsOpnd, IIFLoad, mem64, load>,
- LW_FM<0x35> {
- let isCodeGenOnly =1;
- }
- def SDC164_P8 : SW_FT<"sdc1", FGR64RegsOpnd, IIFStore, mem64, store>,
- LW_FM<0x3d> {
- let isCodeGenOnly =1;
- }
+let Predicates = [HasStdEnc] in {
+ def LWC1 : LW_FT<"lwc1", FGR32Opnd, IIFLoad, load>, LW_FM<0x31>;
+ def SWC1 : SW_FT<"swc1", FGR32Opnd, IIFStore, store>, LW_FM<0x39>;
}
-let Predicates = [NotN64, HasStdEnc] in {
- def LWC1 : LW_FT<"lwc1", FGR32RegsOpnd, IIFLoad, mem, load>, LW_FM<0x31>;
- def SWC1 : SW_FT<"swc1", FGR32RegsOpnd, IIFStore, mem, store>, LW_FM<0x39>;
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+ def LDC164 : LW_FT<"ldc1", FGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
+ def SDC164 : SW_FT<"sdc1", FGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
}
-let Predicates = [NotN64, HasMips64, HasStdEnc],
- DecoderNamespace = "Mips64" in {
- def LDC164 : LW_FT<"ldc1", FGR64RegsOpnd, IIFLoad, mem, load>, LW_FM<0x35>;
- def SDC164 : SW_FT<"sdc1", FGR64RegsOpnd, IIFStore, mem, store>, LW_FM<0x3d>;
+let Predicates = [NotFP64bit, HasStdEnc] in {
+ def LDC1 : LW_FT<"ldc1", AFGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
+ def SDC1 : SW_FT<"sdc1", AFGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
}
-let Predicates = [NotN64, NotMips64, HasStdEnc] in {
- let isPseudo = 1, isCodeGenOnly = 1 in {
- def PseudoLDC1 : LW_FT<"", AFGR64RegsOpnd, IIFLoad, mem, load>;
- def PseudoSDC1 : SW_FT<"", AFGR64RegsOpnd, IIFStore, mem, store>;
- }
- def LDC1 : LW_FT<"ldc1", AFGR64RegsOpnd, IIFLoad, mem>, LW_FM<0x35>;
- def SDC1 : SW_FT<"sdc1", AFGR64RegsOpnd, IIFStore, mem>, LW_FM<0x3d>;
+/// Cop2 Memory Instructions
+let Predicates = [HasStdEnc] in {
+ def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>;
+ def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>;
+ def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>;
+ def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>;
}
// Indexed loads and stores.
let Predicates = [HasFPIdx, HasStdEnc] in {
- def LWXC1 : LWXC1_FT<"lwxc1", FGR32RegsOpnd, GPR32Opnd, IIFLoad, load>,
- LWXC1_FM<0>;
- def SWXC1 : SWXC1_FT<"swxc1", FGR32RegsOpnd, GPR32Opnd, IIFStore, store>,
- SWXC1_FM<8>;
-}
-
-let Predicates = [HasMips32r2, NotMips64, HasStdEnc] in {
- def LDXC1 : LWXC1_FT<"ldxc1", AFGR64RegsOpnd, GPR32Opnd, IIFLoad, load>,
- LWXC1_FM<1>;
- def SDXC1 : SWXC1_FT<"sdxc1", AFGR64RegsOpnd, GPR32Opnd, IIFStore, store>,
- SWXC1_FM<9>;
+ def LWXC1 : LWXC1_FT<"lwxc1", FGR32Opnd, IIFLoad, load>, LWXC1_FM<0>;
+ def SWXC1 : SWXC1_FT<"swxc1", FGR32Opnd, IIFStore, store>, SWXC1_FM<8>;
}
-let Predicates = [HasMips64, NotN64, HasStdEnc], DecoderNamespace="Mips64" in {
- def LDXC164 : LWXC1_FT<"ldxc1", FGR64RegsOpnd, GPR32Opnd, IIFLoad, load>,
- LWXC1_FM<1>;
- def SDXC164 : SWXC1_FT<"sdxc1", FGR64RegsOpnd, GPR32Opnd, IIFStore, store>,
- SWXC1_FM<9>;
+let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc] in {
+ def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
+ def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
}
-// n64
-let Predicates = [IsN64, HasStdEnc], isCodeGenOnly=1 in {
- def LWXC1_P8 : LWXC1_FT<"lwxc1", FGR32RegsOpnd, GPR64Opnd, IIFLoad, load>,
- LWXC1_FM<0>;
- def LDXC164_P8 : LWXC1_FT<"ldxc1", FGR64RegsOpnd, GPR64Opnd, IIFLoad,
- load>, LWXC1_FM<1>;
- def SWXC1_P8 : SWXC1_FT<"swxc1", FGR32RegsOpnd, GPR64Opnd, IIFStore,
- store>, SWXC1_FM<8>;
- def SDXC164_P8 : SWXC1_FT<"sdxc1", FGR64RegsOpnd, GPR64Opnd, IIFStore,
- store>, SWXC1_FM<9>;
+let Predicates = [HasFPIdx, IsFP64bit, HasStdEnc],
+ DecoderNamespace="Mips64" in {
+ def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
+ def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
}
// Load/store doubleword indexed unaligned.
-let Predicates = [NotMips64, HasStdEnc] in {
- def LUXC1 : LWXC1_FT<"luxc1", AFGR64RegsOpnd, GPR32Opnd, IIFLoad>,
- LWXC1_FM<0x5>;
- def SUXC1 : SWXC1_FT<"suxc1", AFGR64RegsOpnd, GPR32Opnd, IIFStore>,
- SWXC1_FM<0xd>;
+let Predicates = [NotFP64bit, HasStdEnc] in {
+ def LUXC1 : LWXC1_FT<"luxc1", AFGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
+ def SUXC1 : SWXC1_FT<"suxc1", AFGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
}
-let Predicates = [HasMips64, HasStdEnc],
- DecoderNamespace="Mips64" in {
- def LUXC164 : LWXC1_FT<"luxc1", FGR64RegsOpnd, GPR32Opnd, IIFLoad>,
- LWXC1_FM<0x5>;
- def SUXC164 : SWXC1_FT<"suxc1", FGR64RegsOpnd, GPR32Opnd, IIFStore>,
- SWXC1_FM<0xd>;
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace="Mips64" in {
+ def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
+ def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
}
/// Floating-point Arithmetic
-def FADD_S : ADDS_FT<"add.s", FGR32RegsOpnd, IIFadd, 1, fadd>,
+def FADD_S : ADDS_FT<"add.s", FGR32Opnd, IIFadd, 1, fadd>,
ADDS_FM<0x00, 16>;
defm FADD : ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>;
-def FDIV_S : ADDS_FT<"div.s", FGR32RegsOpnd, IIFdivSingle, 0, fdiv>,
+def FDIV_S : ADDS_FT<"div.s", FGR32Opnd, IIFdivSingle, 0, fdiv>,
ADDS_FM<0x03, 16>;
defm FDIV : ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>;
-def FMUL_S : ADDS_FT<"mul.s", FGR32RegsOpnd, IIFmulSingle, 1, fmul>,
+def FMUL_S : ADDS_FT<"mul.s", FGR32Opnd, IIFmulSingle, 1, fmul>,
ADDS_FM<0x02, 16>;
defm FMUL : ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>;
-def FSUB_S : ADDS_FT<"sub.s", FGR32RegsOpnd, IIFadd, 0, fsub>,
+def FSUB_S : ADDS_FT<"sub.s", FGR32Opnd, IIFadd, 0, fsub>,
ADDS_FM<0x01, 16>;
defm FSUB : ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>;
let Predicates = [HasMips32r2, HasStdEnc] in {
- def MADD_S : MADDS_FT<"madd.s", FGR32RegsOpnd, IIFmulSingle, fadd>,
+ def MADD_S : MADDS_FT<"madd.s", FGR32Opnd, IIFmulSingle, fadd>,
MADDS_FM<4, 0>;
- def MSUB_S : MADDS_FT<"msub.s", FGR32RegsOpnd, IIFmulSingle, fsub>,
+ def MSUB_S : MADDS_FT<"msub.s", FGR32Opnd, IIFmulSingle, fsub>,
MADDS_FM<5, 0>;
}
let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in {
- def NMADD_S : NMADDS_FT<"nmadd.s", FGR32RegsOpnd, IIFmulSingle, fadd>,
+ def NMADD_S : NMADDS_FT<"nmadd.s", FGR32Opnd, IIFmulSingle, fadd>,
MADDS_FM<6, 0>;
- def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32RegsOpnd, IIFmulSingle, fsub>,
+ def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32Opnd, IIFmulSingle, fsub>,
MADDS_FM<7, 0>;
}
let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in {
- def MADD_D32 : MADDS_FT<"madd.d", AFGR64RegsOpnd, IIFmulDouble, fadd>,
+ def MADD_D32 : MADDS_FT<"madd.d", AFGR64Opnd, IIFmulDouble, fadd>,
MADDS_FM<4, 1>;
- def MSUB_D32 : MADDS_FT<"msub.d", AFGR64RegsOpnd, IIFmulDouble, fsub>,
+ def MSUB_D32 : MADDS_FT<"msub.d", AFGR64Opnd, IIFmulDouble, fsub>,
MADDS_FM<5, 1>;
}
let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in {
- def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64RegsOpnd, IIFmulDouble, fadd>,
+ def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64Opnd, IIFmulDouble, fadd>,
MADDS_FM<6, 1>;
- def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64RegsOpnd, IIFmulDouble, fsub>,
+ def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64Opnd, IIFmulDouble, fsub>,
MADDS_FM<7, 1>;
}
let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in {
- def MADD_D64 : MADDS_FT<"madd.d", FGR64RegsOpnd, IIFmulDouble, fadd>,
+ def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, IIFmulDouble, fadd>,
MADDS_FM<4, 1>;
- def MSUB_D64 : MADDS_FT<"msub.d", FGR64RegsOpnd, IIFmulDouble, fsub>,
+ def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, IIFmulDouble, fsub>,
MADDS_FM<5, 1>;
}
let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
isCodeGenOnly=1 in {
- def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64RegsOpnd, IIFmulDouble, fadd>,
+ def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, IIFmulDouble, fadd>,
MADDS_FM<6, 1>;
- def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64RegsOpnd, IIFmulDouble, fsub>,
+ def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, IIFmulDouble, fsub>,
MADDS_FM<7, 1>;
}
@@ -542,20 +512,27 @@ def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
// This pseudo instr gets expanded into 2 mtc1 instrs after register
// allocation.
-def BuildPairF64 :
- PseudoSE<(outs AFGR64RegsOpnd:$dst),
- (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
- [(set AFGR64RegsOpnd:$dst,
- (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+class BuildPairF64Base<RegisterOperand RO> :
+ PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
+ [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+
+def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>,
+ Requires<[NotFP64bit, HasStdEnc]>;
+def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>,
+ Requires<[IsFP64bit, HasStdEnc]>;
// This pseudo instr gets expanded into 2 mfc1 instrs after register
// allocation.
// if n is 0, lower part of src is extracted.
// if n is 1, higher part of src is extracted.
-def ExtractElementF64 :
- PseudoSE<(outs GPR32Opnd:$dst), (ins AFGR64RegsOpnd:$src, i32imm:$n),
- [(set GPR32Opnd:$dst,
- (MipsExtractElementF64 AFGR64RegsOpnd:$src, imm:$n))]>;
+class ExtractElementF64Base<RegisterOperand RO> :
+ PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
+ [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>;
+
+def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>,
+ Requires<[NotFP64bit, HasStdEnc]>;
+def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>,
+ Requires<[IsFP64bit, HasStdEnc]>;
//===----------------------------------------------------------------------===//
// InstAliases.
@@ -571,18 +548,18 @@ def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_S_W GPR32Opnd:$src)>;
-def : MipsPat<(MipsTruncIntFP FGR32RegsOpnd:$src),
- (TRUNC_W_S FGR32RegsOpnd:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+ (TRUNC_W_S FGR32Opnd:$src)>;
let Predicates = [NotFP64bit, HasStdEnc] in {
def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_D32_W GPR32Opnd:$src)>;
- def : MipsPat<(MipsTruncIntFP AFGR64RegsOpnd:$src),
- (TRUNC_W_D32 AFGR64RegsOpnd:$src)>;
- def : MipsPat<(f32 (fround AFGR64RegsOpnd:$src)),
- (CVT_S_D32 AFGR64RegsOpnd:$src)>;
- def : MipsPat<(f64 (fextend FGR32RegsOpnd:$src)),
- (CVT_D32_S FGR32RegsOpnd:$src)>;
+ def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+ (TRUNC_W_D32 AFGR64Opnd:$src)>;
+ def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
+ (CVT_S_D32 AFGR64Opnd:$src)>;
+ def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+ (CVT_D32_S FGR32Opnd:$src)>;
}
let Predicates = [IsFP64bit, HasStdEnc] in {
@@ -592,44 +569,37 @@ let Predicates = [IsFP64bit, HasStdEnc] in {
def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
(PseudoCVT_D64_W GPR32Opnd:$src)>;
def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
- (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_32)>;
+ (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>;
def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
(PseudoCVT_D64_L GPR64Opnd:$src)>;
- def : MipsPat<(MipsTruncIntFP FGR64RegsOpnd:$src),
- (TRUNC_W_D64 FGR64RegsOpnd:$src)>;
- def : MipsPat<(MipsTruncIntFP FGR32RegsOpnd:$src),
- (TRUNC_L_S FGR32RegsOpnd:$src)>;
- def : MipsPat<(MipsTruncIntFP FGR64RegsOpnd:$src),
- (TRUNC_L_D64 FGR64RegsOpnd:$src)>;
+ def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (TRUNC_W_D64 FGR64Opnd:$src)>;
+ def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+ (TRUNC_L_S FGR32Opnd:$src)>;
+ def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+ (TRUNC_L_D64 FGR64Opnd:$src)>;
- def : MipsPat<(f32 (fround FGR64RegsOpnd:$src)),
- (CVT_S_D64 FGR64RegsOpnd:$src)>;
- def : MipsPat<(f64 (fextend FGR32RegsOpnd:$src)),
- (CVT_D64_S FGR32RegsOpnd:$src)>;
+ def : MipsPat<(f32 (fround FGR64Opnd:$src)),
+ (CVT_S_D64 FGR64Opnd:$src)>;
+ def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+ (CVT_D64_S FGR32Opnd:$src)>;
}
// Patterns for loads/stores with a reg+imm operand.
let AddedComplexity = 40 in {
- let Predicates = [IsN64, HasStdEnc] in {
- def : LoadRegImmPat<LWC1_P8, f32, load>;
- def : StoreRegImmPat<SWC1_P8, f32>;
- def : LoadRegImmPat<LDC164_P8, f64, load>;
- def : StoreRegImmPat<SDC164_P8, f64>;
- }
-
- let Predicates = [NotN64, HasStdEnc] in {
+ let Predicates = [HasStdEnc] in {
def : LoadRegImmPat<LWC1, f32, load>;
def : StoreRegImmPat<SWC1, f32>;
}
- let Predicates = [NotN64, HasMips64, HasStdEnc] in {
+ let Predicates = [IsFP64bit, HasStdEnc] in {
def : LoadRegImmPat<LDC164, f64, load>;
def : StoreRegImmPat<SDC164, f64>;
}
- let Predicates = [NotN64, NotMips64, HasStdEnc] in {
- def : LoadRegImmPat<PseudoLDC1, f64, load>;
- def : StoreRegImmPat<PseudoSDC1, f64>;
+ let Predicates = [NotFP64bit, HasStdEnc] in {
+ def : LoadRegImmPat<LDC1, f64, load>;
+ def : StoreRegImmPat<SDC1, f64>;
}
}
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 1322784..737a018 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -183,7 +183,7 @@ class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
// Format J instruction class in Mips : <|opcode|address|>
//===----------------------------------------------------------------------===//
-class FJ<bits<6> op>
+class FJ<bits<6> op> : StdArch
{
bits<26> target;
@@ -272,7 +272,7 @@ class SRLV_FM<bits<6> funct, bit rotate> : StdArch {
let Inst{5-0} = funct;
}
-class BEQ_FM<bits<6> op> {
+class BEQ_FM<bits<6> op> : StdArch {
bits<5> rs;
bits<5> rt;
bits<16> offset;
@@ -285,7 +285,7 @@ class BEQ_FM<bits<6> op> {
let Inst{15-0} = offset;
}
-class BGEZ_FM<bits<6> op, bits<5> funct> {
+class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
bits<5> rs;
bits<16> offset;
@@ -297,17 +297,6 @@ class BGEZ_FM<bits<6> op, bits<5> funct> {
let Inst{15-0} = offset;
}
-class B_FM {
- bits<16> offset;
-
- bits<32> Inst;
-
- let Inst{31-26} = 4;
- let Inst{25-21} = 0;
- let Inst{20-16} = 0;
- let Inst{15-0} = offset;
-}
-
class SLTI_FM<bits<6> op> : StdArch {
bits<5> rt;
bits<5> rs;
@@ -321,7 +310,7 @@ class SLTI_FM<bits<6> op> : StdArch {
let Inst{15-0} = imm16;
}
-class MFLO_FM<bits<6> funct> {
+class MFLO_FM<bits<6> funct> : StdArch {
bits<5> rd;
bits<32> Inst;
@@ -333,7 +322,7 @@ class MFLO_FM<bits<6> funct> {
let Inst{5-0} = funct;
}
-class MTLO_FM<bits<6> funct> {
+class MTLO_FM<bits<6> funct> : StdArch {
bits<5> rs;
bits<32> Inst;
@@ -344,7 +333,7 @@ class MTLO_FM<bits<6> funct> {
let Inst{5-0} = funct;
}
-class SEB_FM<bits<5> funct, bits<6> funct2> {
+class SEB_FM<bits<5> funct, bits<6> funct2> : StdArch {
bits<5> rd;
bits<5> rt;
@@ -358,7 +347,7 @@ class SEB_FM<bits<5> funct, bits<6> funct2> {
let Inst{5-0} = funct2;
}
-class CLO_FM<bits<6> funct> {
+class CLO_FM<bits<6> funct> : StdArch {
bits<5> rd;
bits<5> rs;
bits<5> rt;
@@ -374,7 +363,7 @@ class CLO_FM<bits<6> funct> {
let rt = rd;
}
-class LUI_FM {
+class LUI_FM : StdArch {
bits<5> rt;
bits<16> imm16;
@@ -386,7 +375,7 @@ class LUI_FM {
let Inst{15-0} = imm16;
}
-class JALR_FM {
+class JALR_FM : StdArch {
bits<5> rd;
bits<5> rs;
@@ -400,7 +389,7 @@ class JALR_FM {
let Inst{5-0} = 9;
}
-class BGEZAL_FM<bits<5> funct> {
+class BGEZAL_FM<bits<5> funct> : StdArch {
bits<5> rs;
bits<16> offset;
@@ -435,7 +424,7 @@ class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
let Inst{5-0} = funct;
}
-class EXT_FM<bits<6> funct> {
+class EXT_FM<bits<6> funct> : StdArch {
bits<5> rt;
bits<5> rs;
bits<5> pos;
@@ -465,7 +454,7 @@ class RDHWR_FM {
let Inst{5-0} = 0x3b;
}
-class TEQ_FM<bits<6> funct> {
+class TEQ_FM<bits<6> funct> : StdArch {
bits<5> rs;
bits<5> rt;
bits<10> code_;
@@ -479,6 +468,17 @@ class TEQ_FM<bits<6> funct> {
let Inst{5-0} = funct;
}
+class TEQI_FM<bits<5> funct> : StdArch {
+ bits<5> rs;
+ bits<16> imm16;
+
+ bits<32> Inst;
+
+ let Inst{31-26} = 1;
+ let Inst{25-21} = rs;
+ let Inst{20-16} = funct;
+ let Inst{15-0} = imm16;
+}
//===----------------------------------------------------------------------===//
// System calls format <op|code_|funct>
//===----------------------------------------------------------------------===//
@@ -520,6 +520,24 @@ class ER_FM<bits<6> funct>
let Inst{5-0} = funct;
}
+
+//===----------------------------------------------------------------------===//
+// Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
+//===----------------------------------------------------------------------===//
+
+class EI_FM<bits<1> sc>
+{
+ bits<32> Inst;
+ bits<5> rt;
+ let Inst{31-26} = 0x10;
+ let Inst{25-21} = 0xb;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0xc;
+ let Inst{10-6} = 0;
+ let Inst{5} = sc;
+ let Inst{4-0} = 0;
+}
+
//===----------------------------------------------------------------------===//
//
// FLOATING POINT INSTRUCTION FORMATS
@@ -701,7 +719,7 @@ class CMov_I_F_FM<bits<6> funct, bits<5> fmt> {
let Inst{5-0} = funct;
}
-class CMov_F_I_FM<bit tf> {
+class CMov_F_I_FM<bit tf> : StdArch {
bits<5> rd;
bits<5> rs;
bits<3> fcc;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index eae05a3..0ebad05 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -22,11 +22,14 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "MipsGenInstrInfo.inc"
using namespace llvm;
+// Pin the vtable to this file.
+void MipsInstrInfo::anchor() {}
+
MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr)
: MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
TM(tm), UncondBrOpc(UncondBr) {}
@@ -219,7 +222,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
// If there is only one terminator instruction, process it.
if (!SecondLastOpc) {
- // Unconditional branch
+ // Unconditional branch.
if (LastOpc == UncondBrOpc) {
TBB = LastInst->getOperand(0).getMBB();
return BT_Uncond;
@@ -271,6 +274,10 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
const char *AsmStr = MI->getOperand(0).getSymbolName();
return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
}
+ case Mips::CONSTPOOL_ENTRY:
+ // If this machine instr is a constant pool entry, its size is recorded as
+ // operand #2.
+ return MI->getOperand(2).getImm();
}
}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index b6480ef..d9ac961 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -27,6 +27,7 @@
namespace llvm {
class MipsInstrInfo : public MipsGenInstrInfo {
+ virtual void anchor();
protected:
MipsTargetMachine &TM;
unsigned UncondBrOpc;
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index b9e8895..ebdbaa4 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -23,11 +23,9 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
SDTCisInt<4>]>;
def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDT_ExtractLOHI : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, untyped>,
- SDTCisVT<2, i32>]>;
-def SDT_InsertLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
- SDTCisVT<1, i32>,
- SDTCisSameAs<1, 2>]>;
+def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
+def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+ SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
SDTCisSameAs<1, 2>]>;
def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
@@ -86,11 +84,12 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
[SDNPHasChain, SDNPSideEffect,
SDNPOptInGlue, SDNPOutGlue]>;
-// Node used to extract integer from LO/HI register.
-def ExtractLOHI : SDNode<"MipsISD::ExtractLOHI", SDT_ExtractLOHI>;
+// Nodes used to extract LO/HI registers.
+def MipsMFHI : SDNode<"MipsISD::MFHI", SDT_MFLOHI>;
+def MipsMFLO : SDNode<"MipsISD::MFLO", SDT_MFLOHI>;
// Node used to insert 32-bit integers to LOHI register pair.
-def InsertLOHI : SDNode<"MipsISD::InsertLOHI", SDT_InsertLOHI>;
+def MipsMTLOHI : SDNode<"MipsISD::MTLOHI", SDT_MTLOHI>;
// Mult nodes.
def MipsMult : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
@@ -115,7 +114,7 @@ def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
// Wrapper node patterns give the instruction selector a chance to replace
// target constant nodes that would otherwise remain unchanged with ADDiu
// nodes. Without these wrapper node patterns, the following conditional move
-// instrucion is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
// compiled:
// movn %got(d)($gp), %got(c)($gp), $4
// This instruction is illegal since movn can take only register operands.
@@ -182,6 +181,12 @@ def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">,
def HasStdEnc : Predicate<"Subtarget.hasStandardEncoding()">,
AssemblerPredicate<"!FeatureMips16">;
def NotDSP : Predicate<"!Subtarget.hasDSP()">;
+def InMicroMips : Predicate<"Subtarget.inMicroMipsMode()">,
+ AssemblerPredicate<"FeatureMicroMips">;
+def NotInMicroMips : Predicate<"!Subtarget.inMicroMipsMode()">,
+ AssemblerPredicate<"!FeatureMicroMips">;
+def IsLE : Predicate<"Subtarget.isLittle()">;
+def IsBE : Predicate<"!Subtarget.isLittle()">;
class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
let Predicates = [HasStdEnc];
@@ -242,7 +247,7 @@ def brtarget : Operand<OtherVT> {
def calltarget : Operand<iPTR> {
let EncoderMethod = "getJumpTargetOpValue";
}
-def calltarget64: Operand<i64>;
+
def simm16 : Operand<i32> {
let DecoderMethod= "DecodeSimm16";
}
@@ -256,48 +261,67 @@ def uimm20 : Operand<i32> {
def uimm10 : Operand<i32> {
}
-def simm16_64 : Operand<i64>;
-def shamt : Operand<i32>;
+def simm16_64 : Operand<i64> {
+ let DecoderMethod = "DecodeSimm16";
+}
// Unsigned Operand
+def uimm5 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+}
+
+def uimm6 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+}
+
def uimm16 : Operand<i32> {
let PrintMethod = "printUnsignedImm";
}
+def pcrel16 : Operand<i32> {
+}
+
def MipsMemAsmOperand : AsmOperandClass {
let Name = "Mem";
let ParserMethod = "parseMemOperand";
}
-// Address operand
-def mem : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR32, simm16);
- let EncoderMethod = "getMemEncoding";
- let ParserMatchClass = MipsMemAsmOperand;
- let OperandType = "OPERAND_MEMORY";
+def MipsInvertedImmoperand : AsmOperandClass {
+ let Name = "InvNum";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseInvNum";
+}
+
+def PtrRegAsmOperand : AsmOperandClass {
+ let Name = "PtrReg";
+ let ParserMethod = "parsePtrReg";
+}
+
+
+def InvertedImOperand : Operand<i32> {
+ let ParserMatchClass = MipsInvertedImmoperand;
}
-def mem64 : Operand<i64> {
+// Address operand
+def mem : Operand<iPTR> {
let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops GPR64, simm16_64);
+ let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncoding";
let ParserMatchClass = MipsMemAsmOperand;
let OperandType = "OPERAND_MEMORY";
}
-def mem_ea : Operand<i32> {
+def mem_ea : Operand<iPTR> {
let PrintMethod = "printMemOperandEA";
- let MIOperandInfo = (ops GPR32, simm16);
+ let MIOperandInfo = (ops ptr_rc, simm16);
let EncoderMethod = "getMemEncoding";
let OperandType = "OPERAND_MEMORY";
}
-def mem_ea_64 : Operand<i64> {
- let PrintMethod = "printMemOperandEA";
- let MIOperandInfo = (ops GPR64, simm16_64);
- let EncoderMethod = "getMemEncoding";
- let OperandType = "OPERAND_MEMORY";
+def PtrRC : Operand<iPTR> {
+ let MIOperandInfo = (ops ptr_rc);
+ let DecoderMethod = "DecodePtrRegisterClass";
+ let ParserMatchClass = PtrRegAsmOperand;
}
// size operand of ext instruction
@@ -370,6 +394,9 @@ def addr :
def addrRegImm :
ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
+def addrRegReg :
+ ComplexPattern<iPTR, 2, "selectAddrRegReg", [frameindex]>;
+
def addrDefault :
ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
@@ -404,9 +431,9 @@ class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
// Arithmetic Multiply ADD/SUB
class MArithR<string opstr, bit isComm = 0> :
InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- !strconcat(opstr, "\t$rs, $rt"), [], IIImult, FrmR> {
- let Defs = [HI, LO];
- let Uses = [HI, LO];
+ !strconcat(opstr, "\t$rs, $rt"), [], IIImult, FrmR, opstr> {
+ let Defs = [HI0, LO0];
+ let Uses = [HI0, LO0];
let isCommutable = isComm;
}
@@ -435,121 +462,66 @@ class shift_rotate_reg<string opstr, RegisterOperand RO,
// Load Upper Imediate
class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
- [], IIArith, FrmI>, IsAsCheapAsAMove {
+ [], IIArith, FrmI, opstr>, IsAsCheapAsAMove {
let neverHasSideEffects = 1;
let isReMaterializable = 1;
}
-class FMem<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin>: FFI<op, outs, ins, asmstr, pattern> {
- bits<21> addr;
- let Inst{25-21} = addr{20-16};
- let Inst{15-0} = addr{15-0};
- let DecoderMethod = "DecodeMem";
-}
-
// Memory Load/Store
-class Load<string opstr, SDPatternOperator OpNode, DAGOperand RO,
- InstrItinClass Itin, Operand MemOpnd, ComplexPattern Addr,
- string ofsuffix> :
- InstSE<(outs RO:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(set RO:$rt, (OpNode Addr:$addr))], NoItinerary, FrmI,
- !strconcat(opstr, ofsuffix)> {
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let canFoldAsLoad = 1;
let mayLoad = 1;
}
-class Store<string opstr, SDPatternOperator OpNode, DAGOperand RO,
- InstrItinClass Itin, Operand MemOpnd, ComplexPattern Addr,
- string ofsuffix> :
- InstSE<(outs), (ins RO:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(OpNode RO:$rt, Addr:$addr)], NoItinerary, FrmI,
- !strconcat(opstr, ofsuffix)> {
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+ InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+ InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
let DecoderMethod = "DecodeMem";
let mayStore = 1;
}
-multiclass LoadM<string opstr, DAGOperand RO,
- SDPatternOperator OpNode = null_frag,
- InstrItinClass Itin = NoItinerary,
- ComplexPattern Addr = addr> {
- def NAME : Load<opstr, OpNode, RO, Itin, mem, Addr, "">,
- Requires<[NotN64, HasStdEnc]>;
- def _P8 : Load<opstr, OpNode, RO, Itin, mem64, Addr, "_p8">,
- Requires<[IsN64, HasStdEnc]> {
- let DecoderNamespace = "Mips64";
- let isCodeGenOnly = 1;
- }
-}
-
-multiclass StoreM<string opstr, DAGOperand RO,
- SDPatternOperator OpNode = null_frag,
- InstrItinClass Itin = NoItinerary,
- ComplexPattern Addr = addr> {
- def NAME : Store<opstr, OpNode, RO, Itin, mem, Addr, "">,
- Requires<[NotN64, HasStdEnc]>;
- def _P8 : Store<opstr, OpNode, RO, Itin, mem64, Addr, "_p8">,
- Requires<[IsN64, HasStdEnc]> {
- let DecoderNamespace = "Mips64";
- let isCodeGenOnly = 1;
- }
-}
-
// Load/Store Left/Right
let canFoldAsLoad = 1 in
class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
- Operand MemOpnd> :
- InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+ InstrItinClass Itin> :
+ InstSE<(outs RO:$rt), (ins mem:$addr, RO:$src),
!strconcat(opstr, "\t$rt, $addr"),
- [(set RO:$rt, (OpNode addr:$addr, RO:$src))], NoItinerary, FrmI> {
+ [(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
let DecoderMethod = "DecodeMem";
string Constraints = "$src = $rt";
}
class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
- Operand MemOpnd>:
- InstSE<(outs), (ins RO:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
- [(OpNode RO:$rt, addr:$addr)], NoItinerary, FrmI> {
+ InstrItinClass Itin> :
+ InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+ [(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
let DecoderMethod = "DecodeMem";
}
-multiclass LoadLeftRightM<string opstr, SDNode OpNode, RegisterOperand RO> {
- def NAME : LoadLeftRight<opstr, OpNode, RO, mem>,
- Requires<[NotN64, HasStdEnc]>;
- def _P8 : LoadLeftRight<opstr, OpNode, RO, mem64>,
- Requires<[IsN64, HasStdEnc]> {
- let DecoderNamespace = "Mips64";
- let isCodeGenOnly = 1;
- }
-}
-
-multiclass StoreLeftRightM<string opstr, SDNode OpNode, RegisterOperand RO> {
- def NAME : StoreLeftRight<opstr, OpNode, RO, mem>,
- Requires<[NotN64, HasStdEnc]>;
- def _P8 : StoreLeftRight<opstr, OpNode, RO, mem64>,
- Requires<[IsN64, HasStdEnc]> {
- let DecoderNamespace = "Mips64";
- let isCodeGenOnly = 1;
- }
-}
-
// Conditional Branch
-class CBranch<string opstr, PatFrag cond_op, RegisterOperand RO> :
- InstSE<(outs), (ins RO:$rs, RO:$rt, brtarget:$offset),
+class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
+ RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
!strconcat(opstr, "\t$rs, $rt, $offset"),
[(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch,
- FrmI> {
+ FrmI, opstr> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
let Defs = [AT];
}
-class CBranchZero<string opstr, PatFrag cond_op, RegisterOperand RO> :
- InstSE<(outs), (ins RO:$rs, brtarget:$offset),
+class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
+ RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, opnd:$offset),
!strconcat(opstr, "\t$rs, $offset"),
- [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch, FrmI> {
+ [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch,
+ FrmI, opstr> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
@@ -572,9 +544,9 @@ class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
// Jump
class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
- SDPatternOperator targetoperator> :
+ SDPatternOperator targetoperator, string bopstr> :
InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
- [(operator targetoperator:$target)], IIBranch, FrmJ> {
+ [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> {
let isTerminator=1;
let isBarrier=1;
let hasDelaySlot = 1;
@@ -583,9 +555,9 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
}
// Unconditional branch
-class UncondBranch<string opstr> :
- InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"),
- [(br bb:$offset)], IIBranch, FrmI> {
+class UncondBranch<Instruction BEQInst> :
+ PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>,
+ PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
let isBranch = 1;
let isTerminator = 1;
let isBarrier = 1;
@@ -596,17 +568,20 @@ class UncondBranch<string opstr> :
// Base class for indirect branch and return instruction classes.
let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
-class JumpFR<RegisterOperand RO, SDPatternOperator operator = null_frag>:
- InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch, FrmR>;
+class JumpFR<string opstr, RegisterOperand RO,
+ SDPatternOperator operator = null_frag>:
+ InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch,
+ FrmR, opstr>;
// Indirect branch
-class IndirectBranch<RegisterOperand RO>: JumpFR<RO, brind> {
+class IndirectBranch<string opstr, RegisterOperand RO> :
+ JumpFR<opstr, RO, brind> {
let isBranch = 1;
let isIndirectBranch = 1;
}
// Return instruction
-class RetBase<RegisterOperand RO>: JumpFR<RO> {
+class RetBase<string opstr, RegisterOperand RO>: JumpFR<opstr, RO> {
let isReturn = 1;
let isCodeGenOnly = 1;
let hasCtrlDep = 1;
@@ -615,9 +590,9 @@ class RetBase<RegisterOperand RO>: JumpFR<RO> {
// Jump and Link (Call)
let isCall=1, hasDelaySlot=1, Defs = [RA] in {
- class JumpLink<string opstr> :
- InstSE<(outs), (ins calltarget:$target), !strconcat(opstr, "\t$target"),
- [(MipsJmpLink imm:$target)], IIBranch, FrmJ> {
+ class JumpLink<string opstr, DAGOperand opnd> :
+ InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+ [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> {
let DecoderMethod = "DecodeJumpTarget";
}
@@ -628,11 +603,11 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in {
class JumpLinkReg<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [], IIBranch, FrmR>;
+ [], IIBranch, FrmR, opstr>;
- class BGEZAL_FT<string opstr, RegisterOperand RO> :
- InstSE<(outs), (ins RO:$rs, brtarget:$offset),
- !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI>;
+ class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, opnd:$offset),
+ !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
}
@@ -660,6 +635,20 @@ class ER_FT<string opstr> :
InstSE<(outs), (ins),
opstr, [], NoItinerary, FrmOther>;
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt), (ins),
+ !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther>;
+
+// Wait
+class WAIT_FT<string opstr> :
+ InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther> {
+ let Inst{31-26} = 0x10;
+ let Inst{25} = 1;
+ let Inst{24-6} = 0;
+ let Inst{5-0} = 0x20;
+}
+
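The WAIT_FT class just added hard-codes every field of the instruction word: major opcode 0x10 in bits 31-26, bit 25 set, a zero implementation-defined code field, and function value 0x20. A minimal C++ sketch (illustrative only, not LLVM's MC emitter) combines those fields the same way and checks the resulting word:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Inst = 0;
      Inst |= 0x10u << 26;  // Inst{31-26} = 0x10
      Inst |= 1u << 25;     // Inst{25}    = 1
                            // Inst{24-6}  = 0 (left untouched)
      Inst |= 0x20u;        // Inst{5-0}   = 0x20
      assert(Inst == 0x42000020u);  // the assembled `wait` word
      return 0;
    }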
// Sync
let hasSideEffects = 1 in
class SYNC_FT :
@@ -669,8 +658,12 @@ class SYNC_FT :
let hasSideEffects = 1 in
class TEQ_FT<string opstr, RegisterOperand RO> :
InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
- !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary, FrmI>;
+ !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary,
+ FrmI, opstr>;
+class TEQI_FT<string opstr, RegisterOperand RO> :
+ InstSE<(outs), (ins RO:$rs, uimm16:$imm16),
+ !strconcat(opstr, "\t$rs, $imm16"), [], NoItinerary, FrmOther, opstr>;
// Mul, Div
class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
list<Register> DefRegs> :
@@ -698,10 +691,10 @@ class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
// Pseudo multiply add/sub instruction with explicit accumulator register
// operands.
class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
- : PseudoSE<(outs ACRegs:$ac),
- (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACRegs:$acin),
- [(set ACRegs:$ac,
- (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACRegs:$acin))],
+ : PseudoSE<(outs ACC64:$ac),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
+ [(set ACC64:$ac,
+ (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
IIImult>,
PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
string Constraints = "$acin = $ac";
@@ -710,25 +703,35 @@ class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
list<Register> DefRegs> :
InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
- [], itin, FrmR> {
+ [], itin, FrmR, opstr> {
let Defs = DefRegs;
}
// Move from Hi/Lo
-class MoveFromLOHI<string opstr, RegisterOperand RO, list<Register> UseRegs>:
- InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR> {
- let Uses = UseRegs;
+class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
+ : PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
+ [(set DstRC:$rd, (OpNode SrcRC:$hilo))], IIHiLo>;
+
+class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
+ InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR,
+ opstr> {
+ let Uses = [UseReg];
let neverHasSideEffects = 1;
}
+class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
+ : PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
+ [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))], IIHiLo>;
+
class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
- InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo, FrmR> {
+ InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo,
+ FrmR, opstr> {
let Defs = DefRegs;
let neverHasSideEffects = 1;
}
-class EffectiveAddress<string opstr, RegisterOperand RO, Operand Mem> :
- InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class EffectiveAddress<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
[(set RO:$rt, addr:$addr)], NoItinerary, FrmI> {
let isCodeGenOnly = 1;
let DecoderMethod = "DecodeMem";
@@ -737,26 +740,26 @@ class EffectiveAddress<string opstr, RegisterOperand RO, Operand Mem> :
// Count Leading Ones/Zeros in Word
class CountLeading0<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [(set RO:$rd, (ctlz RO:$rs))], IIArith, FrmR>,
+ [(set RO:$rd, (ctlz RO:$rs))], IIArith, FrmR, opstr>,
Requires<[HasBitCount, HasStdEnc]>;
class CountLeading1<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
- [(set RO:$rd, (ctlz (not RO:$rs)))], IIArith, FrmR>,
+ [(set RO:$rd, (ctlz (not RO:$rs)))], IIArith, FrmR, opstr>,
Requires<[HasBitCount, HasStdEnc]>;
// Sign Extend in Register.
class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO> :
InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
- [(set RO:$rd, (sext_inreg RO:$rt, vt))], IIseb, FrmR> {
+ [(set RO:$rd, (sext_inreg RO:$rt, vt))], IIseb, FrmR, opstr> {
let Predicates = [HasSEInReg, HasStdEnc];
}
// Subword Swap
class SubwordSwap<string opstr, RegisterOperand RO>:
InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
- NoItinerary, FrmR> {
+ NoItinerary, FrmR, opstr> {
let Predicates = [HasSwap, HasStdEnc];
let neverHasSideEffects = 1;
}
@@ -767,66 +770,60 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
IIArith, FrmR>;
// Ext and Ins
-class ExtBase<string opstr, RegisterOperand RO>:
- InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ext:$size),
+class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+ SDPatternOperator Op = null_frag>:
+ InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size),
!strconcat(opstr, " $rt, $rs, $pos, $size"),
- [(set RO:$rt, (MipsExt RO:$rs, imm:$pos, imm:$size))], NoItinerary,
- FrmR> {
+ [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], NoItinerary,
+ FrmR, opstr> {
let Predicates = [HasMips32r2, HasStdEnc];
}
-class InsBase<string opstr, RegisterOperand RO>:
- InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ins:$size, RO:$src),
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+ SDPatternOperator Op = null_frag>:
+ InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src),
!strconcat(opstr, " $rt, $rs, $pos, $size"),
- [(set RO:$rt, (MipsIns RO:$rs, imm:$pos, imm:$size, RO:$src))],
- NoItinerary, FrmR> {
+ [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
+ NoItinerary, FrmR, opstr> {
let Predicates = [HasMips32r2, HasStdEnc];
let Constraints = "$src = $rt";
}
// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
-class Atomic2Ops<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
- PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$incr),
- [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>;
-
-multiclass Atomic2Ops32<PatFrag Op> {
- def NAME : Atomic2Ops<Op, GPR32, GPR32>, Requires<[NotN64, HasStdEnc]>;
- def _P8 : Atomic2Ops<Op, GPR32, GPR64>, Requires<[IsN64, HasStdEnc]>;
-}
+class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
+ PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
+ [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
// Atomic Compare & Swap.
-class AtomicCmpSwap<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
- PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$cmp, DRC:$swap),
- [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>;
-
-multiclass AtomicCmpSwap32<PatFrag Op> {
- def NAME : AtomicCmpSwap<Op, GPR32, GPR32>,
- Requires<[NotN64, HasStdEnc]>;
- def _P8 : AtomicCmpSwap<Op, GPR32, GPR64>,
- Requires<[IsN64, HasStdEnc]>;
-}
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
+ PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
+ [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
-class LLBase<string opstr, RegisterOperand RO, Operand Mem> :
- InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class LLBase<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
[], NoItinerary, FrmI> {
let DecoderMethod = "DecodeMem";
let mayLoad = 1;
}
-class SCBase<string opstr, RegisterOperand RO, Operand Mem> :
- InstSE<(outs RO:$dst), (ins RO:$rt, Mem:$addr),
+class SCBase<string opstr, RegisterOperand RO> :
+ InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
!strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
let DecoderMethod = "DecodeMem";
let mayStore = 1;
let Constraints = "$rt = $dst";
}
-class MFC3OP<dag outs, dag ins, string asmstr> :
- InstSE<outs, ins, asmstr, [], NoItinerary, FrmFR>;
+class MFC3OP<string asmstr, RegisterOperand RO> :
+ InstSE<(outs RO:$rt, RO:$rd, uimm16:$sel), (ins),
+ !strconcat(asmstr, "\t$rt, $rd, $sel"), [], NoItinerary, FrmFR>;
-let isBarrier = 1, isTerminator = 1, isCodeGenOnly = 1 in
-def TRAP : InstSE<(outs), (ins), "break", [(trap)], NoItinerary, FrmOther> {
- let Inst = 0x0000000d;
+class TrapBase<Instruction RealInst>
+ : PseudoSE<(outs), (ins), [(trap)], NoItinerary>,
+ PseudoInstExpansion<(RealInst 0, 0)> {
+ let isBarrier = 1;
+ let isTerminator = 1;
+ let isCodeGenOnly = 1;
}
//===----------------------------------------------------------------------===//
@@ -845,38 +842,38 @@ def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
}
let usesCustomInserter = 1 in {
- defm ATOMIC_LOAD_ADD_I8 : Atomic2Ops32<atomic_load_add_8>;
- defm ATOMIC_LOAD_ADD_I16 : Atomic2Ops32<atomic_load_add_16>;
- defm ATOMIC_LOAD_ADD_I32 : Atomic2Ops32<atomic_load_add_32>;
- defm ATOMIC_LOAD_SUB_I8 : Atomic2Ops32<atomic_load_sub_8>;
- defm ATOMIC_LOAD_SUB_I16 : Atomic2Ops32<atomic_load_sub_16>;
- defm ATOMIC_LOAD_SUB_I32 : Atomic2Ops32<atomic_load_sub_32>;
- defm ATOMIC_LOAD_AND_I8 : Atomic2Ops32<atomic_load_and_8>;
- defm ATOMIC_LOAD_AND_I16 : Atomic2Ops32<atomic_load_and_16>;
- defm ATOMIC_LOAD_AND_I32 : Atomic2Ops32<atomic_load_and_32>;
- defm ATOMIC_LOAD_OR_I8 : Atomic2Ops32<atomic_load_or_8>;
- defm ATOMIC_LOAD_OR_I16 : Atomic2Ops32<atomic_load_or_16>;
- defm ATOMIC_LOAD_OR_I32 : Atomic2Ops32<atomic_load_or_32>;
- defm ATOMIC_LOAD_XOR_I8 : Atomic2Ops32<atomic_load_xor_8>;
- defm ATOMIC_LOAD_XOR_I16 : Atomic2Ops32<atomic_load_xor_16>;
- defm ATOMIC_LOAD_XOR_I32 : Atomic2Ops32<atomic_load_xor_32>;
- defm ATOMIC_LOAD_NAND_I8 : Atomic2Ops32<atomic_load_nand_8>;
- defm ATOMIC_LOAD_NAND_I16 : Atomic2Ops32<atomic_load_nand_16>;
- defm ATOMIC_LOAD_NAND_I32 : Atomic2Ops32<atomic_load_nand_32>;
-
- defm ATOMIC_SWAP_I8 : Atomic2Ops32<atomic_swap_8>;
- defm ATOMIC_SWAP_I16 : Atomic2Ops32<atomic_swap_16>;
- defm ATOMIC_SWAP_I32 : Atomic2Ops32<atomic_swap_32>;
-
- defm ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap32<atomic_cmp_swap_8>;
- defm ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap32<atomic_cmp_swap_16>;
- defm ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap32<atomic_cmp_swap_32>;
+ def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_8, GPR32>;
+ def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_16, GPR32>;
+ def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_32, GPR32>;
+ def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_8, GPR32>;
+ def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_16, GPR32>;
+ def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_32, GPR32>;
+ def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_8, GPR32>;
+ def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_16, GPR32>;
+ def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_32, GPR32>;
+ def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_8, GPR32>;
+ def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_16, GPR32>;
+ def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_32, GPR32>;
+ def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_8, GPR32>;
+ def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_16, GPR32>;
+ def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_32, GPR32>;
+ def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_8, GPR32>;
+ def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
+ def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
+
+ def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_8, GPR32>;
+ def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_16, GPR32>;
+ def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_32, GPR32>;
+
+ def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
+ def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
+ def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
}
/// Pseudo instructions for loading and storing accumulator registers.
let isPseudo = 1, isCodeGenOnly = 1 in {
- defm LOAD_AC64 : LoadM<"", ACRegs>;
- defm STORE_AC64 : StoreM<"", ACRegs>;
+ def LOAD_ACC64 : Load<"", ACC64>;
+ def STORE_ACC64 : Store<"", ACC64>;
}
//===----------------------------------------------------------------------===//
@@ -911,6 +908,7 @@ def ADDu : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, IIArith, add>,
ADD_FM<0, 0x21>;
def SUBu : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, IIArith, sub>,
ADD_FM<0, 0x23>;
+let Defs = [HI0, LO0] in
def MUL : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, IIImul, mul>,
ADD_FM<0x1c, 2>;
def ADD : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
@@ -926,11 +924,11 @@ def XOR : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IILogic, xor>,
def NOR : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
/// Shift Instructions
-def SLL : MMRel, shift_rotate_imm<"sll", shamt, GPR32Opnd, shl, immZExt5>,
+def SLL : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, shl, immZExt5>,
SRA_FM<0, 0>;
-def SRL : MMRel, shift_rotate_imm<"srl", shamt, GPR32Opnd, srl, immZExt5>,
+def SRL : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, srl, immZExt5>,
SRA_FM<2, 0>;
-def SRA : MMRel, shift_rotate_imm<"sra", shamt, GPR32Opnd, sra, immZExt5>,
+def SRA : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, sra, immZExt5>,
SRA_FM<3, 0>;
def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, shl>, SRLV_FM<4, 0>;
def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, srl>, SRLV_FM<6, 0>;
@@ -938,7 +936,7 @@ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, sra>, SRLV_FM<7, 0>;
// Rotate Instructions
let Predicates = [HasMips32r2, HasStdEnc] in {
- def ROTR : MMRel, shift_rotate_imm<"rotr", shamt, GPR32Opnd, rotr,
+ def ROTR : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, rotr,
immZExt5>,
SRA_FM<2, 1>;
def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, rotr>,
@@ -947,65 +945,85 @@ let Predicates = [HasMips32r2, HasStdEnc] in {
/// Load and Store Instructions
/// aligned
-defm LB : LoadM<"lb", GPR32Opnd, sextloadi8, IILoad>, MMRel, LW_FM<0x20>;
-defm LBu : LoadM<"lbu", GPR32Opnd, zextloadi8, IILoad, addrDefault>, MMRel,
- LW_FM<0x24>;
-defm LH : LoadM<"lh", GPR32Opnd, sextloadi16, IILoad, addrDefault>, MMRel,
- LW_FM<0x21>;
-defm LHu : LoadM<"lhu", GPR32Opnd, zextloadi16, IILoad>, MMRel, LW_FM<0x25>;
-defm LW : LoadM<"lw", GPR32Opnd, load, IILoad, addrDefault>, MMRel, LW_FM<0x23>;
-defm SB : StoreM<"sb", GPR32Opnd, truncstorei8, IIStore>, MMRel, LW_FM<0x28>;
-defm SH : StoreM<"sh", GPR32Opnd, truncstorei16, IIStore>, MMRel, LW_FM<0x29>;
-defm SW : StoreM<"sw", GPR32Opnd, store, IIStore>, MMRel, LW_FM<0x2b>;
+def LB : Load<"lb", GPR32Opnd, sextloadi8, IILoad>, MMRel, LW_FM<0x20>;
+def LBu : Load<"lbu", GPR32Opnd, zextloadi8, IILoad, addrDefault>, MMRel,
+ LW_FM<0x24>;
+def LH : Load<"lh", GPR32Opnd, sextloadi16, IILoad, addrDefault>, MMRel,
+ LW_FM<0x21>;
+def LHu : Load<"lhu", GPR32Opnd, zextloadi16, IILoad>, MMRel, LW_FM<0x25>;
+def LW : Load<"lw", GPR32Opnd, load, IILoad, addrDefault>, MMRel,
+ LW_FM<0x23>;
+def SB : Store<"sb", GPR32Opnd, truncstorei8, IIStore>, MMRel, LW_FM<0x28>;
+def SH : Store<"sh", GPR32Opnd, truncstorei16, IIStore>, MMRel, LW_FM<0x29>;
+def SW : Store<"sw", GPR32Opnd, store, IIStore>, MMRel, LW_FM<0x2b>;
/// load/store left/right
-defm LWL : LoadLeftRightM<"lwl", MipsLWL, GPR32Opnd>, LW_FM<0x22>;
-defm LWR : LoadLeftRightM<"lwr", MipsLWR, GPR32Opnd>, LW_FM<0x26>;
-defm SWL : StoreLeftRightM<"swl", MipsSWL, GPR32Opnd>, LW_FM<0x2a>;
-defm SWR : StoreLeftRightM<"swr", MipsSWR, GPR32Opnd>, LW_FM<0x2e>;
+let Predicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, IILoad>, LW_FM<0x22>;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, IILoad>, LW_FM<0x26>;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, IIStore>, LW_FM<0x2a>;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, IIStore>, LW_FM<0x2e>;
+}
def SYNC : SYNC_FT, SYNC_FM;
-def TEQ : TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
+def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
+def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
+def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
+def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>;
+def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>;
+def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>;
+
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>;
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>;
def BREAK : BRK_FT<"break">, BRK_FM<0xd>;
def SYSCALL : SYS_FT<"syscall">, SYS_FM<0xc>;
+def TRAP : TrapBase<BREAK>;
def ERET : ER_FT<"eret">, ER_FM<0x18>;
def DERET : ER_FT<"deret">, ER_FM<0x1f>;
-/// Load-linked, Store-conditional
-let Predicates = [NotN64, HasStdEnc] in {
- def LL : LLBase<"ll", GPR32Opnd, mem>, LW_FM<0x30>;
- def SC : SCBase<"sc", GPR32Opnd, mem>, LW_FM<0x38>;
-}
+def EI : DEI_FT<"ei", GPR32Opnd>, EI_FM<1>;
+def DI : DEI_FT<"di", GPR32Opnd>, EI_FM<0>;
-let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
- def LL_P8 : LLBase<"ll", GPR32Opnd, mem64>, LW_FM<0x30>;
- def SC_P8 : SCBase<"sc", GPR32Opnd, mem64>, LW_FM<0x38>;
-}
+def WAIT : WAIT_FT<"wait">;
+
+/// Load-linked, Store-conditional
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>;
/// Jump and Branch Instructions
-def J : JumpFJ<jmptarget, "j", br, bb>, FJ<2>,
+def J : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
Requires<[RelocStatic, HasStdEnc]>, IsBranch;
-def JR : IndirectBranch<GPR32Opnd>, MTLO_FM<8>;
-def B : UncondBranch<"b">, B_FM;
-def BEQ : CBranch<"beq", seteq, GPR32Opnd>, BEQ_FM<4>;
-def BNE : CBranch<"bne", setne, GPR32Opnd>, BEQ_FM<5>;
-def BGEZ : CBranchZero<"bgez", setge, GPR32Opnd>, BGEZ_FM<1, 1>;
-def BGTZ : CBranchZero<"bgtz", setgt, GPR32Opnd>, BGEZ_FM<7, 0>;
-def BLEZ : CBranchZero<"blez", setle, GPR32Opnd>, BGEZ_FM<6, 0>;
-def BLTZ : CBranchZero<"bltz", setlt, GPR32Opnd>, BGEZ_FM<1, 0>;
-
-def JAL : JumpLink<"jal">, FJ<3>;
-def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+def JR : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
+def BEQ : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BNE : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BGEZ : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
+ BGEZ_FM<1, 1>;
+def BGTZ : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
+ BGEZ_FM<7, 0>;
+def BLEZ : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
+ BGEZ_FM<6, 0>;
+def BLTZ : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
+ BGEZ_FM<1, 0>;
+def B : UncondBranch<BEQ>;
+
+def JAL : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
+def JALR : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
-def BGEZAL : BGEZAL_FT<"bgezal", GPR32Opnd>, BGEZAL_FM<0x11>;
-def BLTZAL : BGEZAL_FT<"bltzal", GPR32Opnd>, BGEZAL_FM<0x10>;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>;
def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
-def TAILCALL : JumpFJ<calltarget, "j", MipsTailCall, imm>, FJ<2>, IsTailCall;
-def TAILCALL_R : JumpFR<GPR32Opnd, MipsTailCall>, MTLO_FM<8>, IsTailCall;
+def TAILCALL : MMRel, JumpFJ<calltarget, "j", MipsTailCall, imm, "tcall">,
+ FJ<2>, IsTailCall;
+def TAILCALL_R : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>, MTLO_FM<8>,
+ IsTailCall;
-def RET : RetBase<GPR32Opnd>, MTLO_FM<8>;
+def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>;
// Exception handling related node and instructions.
// The conversion sequence is:
@@ -1029,34 +1047,30 @@ let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
}
/// Multiply and Divide Instructions.
-def MULT : MMRel, Mult<"mult", IIImult, GPR32Opnd, [HI, LO]>,
+def MULT : MMRel, Mult<"mult", IIImult, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x18>;
-def MULTu : MMRel, Mult<"multu", IIImult, GPR32Opnd, [HI, LO]>,
+def MULTu : MMRel, Mult<"multu", IIImult, GPR32Opnd, [HI0, LO0]>,
MULT_FM<0, 0x19>;
-def PseudoMULT : MultDivPseudo<MULT, ACRegs, GPR32Opnd, MipsMult, IIImult>;
-def PseudoMULTu : MultDivPseudo<MULTu, ACRegs, GPR32Opnd, MipsMultu, IIImult>;
-def SDIV : Div<"div", IIIdiv, GPR32Opnd, [HI, LO]>, MULT_FM<0, 0x1a>;
-def UDIV : Div<"divu", IIIdiv, GPR32Opnd, [HI, LO]>, MULT_FM<0, 0x1b>;
-def PseudoSDIV : MultDivPseudo<SDIV, ACRegs, GPR32Opnd, MipsDivRem, IIIdiv,
- 0, 1, 1>;
-def PseudoUDIV : MultDivPseudo<UDIV, ACRegs, GPR32Opnd, MipsDivRemU, IIIdiv,
- 0, 1, 1>;
+def SDIV : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x1a>;
+def UDIV : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+ MULT_FM<0, 0x1b>;
-def MTHI : MoveToLOHI<"mthi", GPR32Opnd, [HI]>, MTLO_FM<0x11>;
-def MTLO : MoveToLOHI<"mtlo", GPR32Opnd, [LO]>, MTLO_FM<0x13>;
-def MFHI : MoveFromLOHI<"mfhi", GPR32Opnd, [HI]>, MFLO_FM<0x10>;
-def MFLO : MoveFromLOHI<"mflo", GPR32Opnd, [LO]>, MFLO_FM<0x12>;
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>;
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>;
/// Sign Ext In Register Instructions.
-def SEB : SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM<0x10, 0x20>;
-def SEH : SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM<0x18, 0x20>;
+def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM<0x10, 0x20>;
+def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM<0x18, 0x20>;
/// Count Leading
-def CLZ : CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
-def CLO : CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>;
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>;
/// Word Swap Bytes Within Halfwords
-def WSBH : SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>;
+def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>;
/// No operation.
def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
@@ -1065,39 +1079,41 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
// instructions. The same not happens for stack address copies, so an
// add op with mem ComplexPattern is used and the stack address copy
// can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu", GPR32Opnd, mem_ea>, LW_FM<9>;
+def LEA_ADDiu : EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
// MADD*/MSUB*
-def MADD : MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
-def MADDU : MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
-def MSUB : MArithR<"msub">, MULT_FM<0x1c, 4>;
-def MSUBU : MArithR<"msubu">, MULT_FM<0x1c, 5>;
+def MADD : MMRel, MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
+def MADDU : MMRel, MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
+def MSUB : MMRel, MArithR<"msub">, MULT_FM<0x1c, 4>;
+def MSUBU : MMRel, MArithR<"msubu">, MULT_FM<0x1c, 5>;
+
+let Predicates = [HasStdEnc, NotDSP] in {
+def PseudoMULT : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, IIImult>;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, IIImult>;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>;
def PseudoMADD : MAddSubPseudo<MADD, MipsMAdd>;
def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu>;
def PseudoMSUB : MAddSubPseudo<MSUB, MipsMSub>;
def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu>;
+}
+
+def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, IIIdiv,
+ 0, 1, 1>;
+def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, IIIdiv,
+ 0, 1, 1>;
def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
-def EXT : ExtBase<"ext", GPR32Opnd>, EXT_FM<0>;
-def INS : InsBase<"ins", GPR32Opnd>, EXT_FM<4>;
+def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
+def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
/// Move Control Registers From/To CPU Registers
-def MFC0_3OP : MFC3OP<(outs GPR32Opnd:$rt),
- (ins GPR32Opnd:$rd, uimm16:$sel),
- "mfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 0>;
-
-def MTC0_3OP : MFC3OP<(outs GPR32Opnd:$rd, uimm16:$sel),
- (ins GPR32Opnd:$rt),
- "mtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 4>;
-
-def MFC2_3OP : MFC3OP<(outs GPR32Opnd:$rt),
- (ins GPR32Opnd:$rd, uimm16:$sel),
- "mfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 0>;
-
-def MTC2_3OP : MFC3OP<(outs GPR32Opnd:$rd, uimm16:$sel),
- (ins GPR32Opnd:$rt),
- "mtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 4>;
+def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>;
+def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>;
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
//===----------------------------------------------------------------------===//
// Instruction aliases
@@ -1129,14 +1145,11 @@ def : InstAlias<"xor $rs, $rt, $imm",
def : InstAlias<"or $rs, $rt, $imm",
(ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
-def : InstAlias<"mfc0 $rt, $rd",
- (MFC0_3OP GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mtc0 $rt, $rd",
- (MTC0_3OP GPR32Opnd:$rd, 0, GPR32Opnd:$rt), 0>;
-def : InstAlias<"mfc2 $rt, $rd",
- (MFC2_3OP GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mtc2 $rt, $rd",
- (MTC2_3OP GPR32Opnd:$rd, 0, GPR32Opnd:$rt), 0>;
+def : InstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
def : InstAlias<"bnez $rs,$offset",
(BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
def : InstAlias<"beqz $rs,$offset",
@@ -1145,6 +1158,20 @@ def : InstAlias<"syscall", (SYSCALL 0), 1>;
def : InstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
def : InstAlias<"break", (BREAK 0, 0), 1>;
+def : InstAlias<"ei", (EI ZERO), 1>;
+def : InstAlias<"di", (DI ZERO), 1>;
+
+def : InstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"sub, $rd, $rs, $imm",
+ (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
+def : InstAlias<"subu, $rd, $rs, $imm",
+ (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
+
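The sub/subu aliases above route an immediate form through the new InvertedImOperand, whose parser method is parseInvNum. The parser itself is outside this diff, so the exact behaviour is an assumption here, but the natural reading is that the immediate is negated while parsing, so subtracting a constant is emitted as adding its negation. A hedged C++ sketch of that idea (the helper name is hypothetical, not an LLVM API):

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Illustrative only: `subu $rd, $rs, imm` rendered as `addiu $rd, $rs, -imm`.
    static std::string lowerSubuImm(const std::string &Rd, const std::string &Rs,
                                    int32_t Imm) {
      return "addiu " + Rd + ", " + Rs + ", " + std::to_string(-Imm);
    }

    int main() {
      assert(lowerSubuImm("$4", "$5", 8) == "addiu $4, $5, -8");
      return 0;
    }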
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -1152,7 +1179,7 @@ def : InstAlias<"break", (BREAK 0, 0), 1>;
class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
!strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadImm32Reg : LoadImm32<"li", shamt,GPR32Opnd>;
+def LoadImm32Reg : LoadImm32<"li", uimm5, GPR32Opnd>;
class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
@@ -1162,9 +1189,7 @@ def LoadAddr32Reg : LoadAddress<"la", mem, GPR32Opnd>;
class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> :
MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
!strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadAddr32Imm : LoadAddressImm<"la", shamt,GPR32Opnd>;
-
-
+def LoadAddr32Imm : LoadAddressImm<"la", uimm5, GPR32Opnd>;
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
@@ -1261,24 +1286,15 @@ def : MipsPat<(not GPR32:$in),
(NOR GPR32Opnd:$in, ZERO)>;
// extended loads
-let Predicates = [NotN64, HasStdEnc] in {
+let Predicates = [HasStdEnc] in {
def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
}
-let Predicates = [IsN64, HasStdEnc] in {
- def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>;
- def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>;
- def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_P8 addr:$src)>;
-}
// peepholes
-let Predicates = [NotN64, HasStdEnc] in {
- def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
-}
-let Predicates = [IsN64, HasStdEnc] in {
- def : MipsPat<(store (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
-}
+let Predicates = [HasStdEnc] in
+def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
// brcond patterns
multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
@@ -1369,22 +1385,13 @@ defm : SetgeImmPats<GPR32, SLTi, SLTiu>;
// bswap pattern
def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
-// mflo/hi patterns.
-def : MipsPat<(i32 (ExtractLOHI ACRegs:$ac, imm:$lohi_idx)),
- (EXTRACT_SUBREG ACRegs:$ac, imm:$lohi_idx)>;
-
// Load halfword/word patterns.
let AddedComplexity = 40 in {
- let Predicates = [NotN64, HasStdEnc] in {
+ let Predicates = [HasStdEnc] in {
def : LoadRegImmPat<LBu, i32, zextloadi8>;
def : LoadRegImmPat<LH, i32, sextloadi16>;
def : LoadRegImmPat<LW, i32, load>;
}
- let Predicates = [IsN64, HasStdEnc] in {
- def : LoadRegImmPat<LBu_P8, i32, zextloadi8>;
- def : LoadRegImmPat<LH_P8, i32, sextloadi16>;
- def : LoadRegImmPat<LW_P8, i32, load>;
- }
}
//===----------------------------------------------------------------------===//
@@ -1405,6 +1412,10 @@ include "Mips16InstrInfo.td"
include "MipsDSPInstrFormats.td"
include "MipsDSPInstrInfo.td"
+// MSA
+include "MipsMSAInstrFormats.td"
+include "MipsMSAInstrInfo.td"
+
// Micromips
include "MicroMipsInstrFormats.td"
include "MicroMipsInstrInfo.td"
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 971176e..2efe578 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -237,6 +237,11 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
MIB.addMBB(MBBOpnd);
+ // Bundle the instruction in the delay slot to the newly created branch
+ // and erase the original branch.
+ assert(Br->isBundledWithSucc());
+ MachineBasicBlock::instr_iterator II(Br);
+ MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
Br->eraseFromParent();
}
@@ -432,8 +437,10 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
if (!I->Br || I->HasLongBranch)
continue;
+ int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+
// Check if offset fits into 16-bit immediate field of branches.
- if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+ if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / ShVal))
continue;
I->HasLongBranch = true;
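The change above replaces the fixed divisor of 4 with ShVal, which is 2 when the subtarget is in microMIPS mode. The branch immediate counts instruction-sized units rather than bytes, so the byte offset returned by computeOffset is scaled by the unit size before the signed 16-bit range test; with 2-byte microMIPS units, only half the byte range fits. A self-contained C++ sketch of that check (isInt<16> reimplemented locally; only the logic shown in the hunk is assumed):

    #include <cassert>
    #include <cstdint>

    static bool fitsInSImm16(int64_t V) { return V >= -32768 && V <= 32767; }

    // Needs expansion to a long branch when the scaled offset overflows 16 bits.
    static bool needsLongBranch(int64_t ByteOffset, bool InMicroMips) {
      int ShVal = InMicroMips ? 2 : 4;
      return !fitsInSImm16(ByteOffset / ShVal);
    }

    int main() {
      assert(!needsLongBranch(32767 * 4, false)); // largest reachable standard offset
      assert(needsLongBranch(32768 * 4, false));  // one unit further must be expanded
      assert(needsLongBranch(32767 * 4, true));   // same byte offset overflows with 2-byte units
      return 0;
    }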
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index d836975..b6dfadc 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -28,8 +28,7 @@ using namespace llvm;
MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
: AsmPrinter(asmprinter) {}
-void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) {
- Mang = M;
+void MipsMCInstLower::Initialize(MCContext *C) {
Ctx = C;
}
@@ -74,7 +73,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
break;
case MachineOperand::MO_GlobalAddress:
- Symbol = Mang->getSymbol(MO.getGlobal());
+ Symbol = AsmPrinter.getSymbol(MO.getGlobal());
Offset += MO.getOffset();
break;
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index c4a6016..4570bd9 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -19,7 +19,6 @@ namespace llvm {
class MCOperand;
class MachineInstr;
class MachineFunction;
- class Mangler;
class MipsAsmPrinter;
/// MipsMCInstLower - This class is used to lower an MachineInstr into an
@@ -27,11 +26,10 @@ namespace llvm {
class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
typedef MachineOperand::MachineOperandType MachineOperandType;
MCContext *Ctx;
- Mangler *Mang;
MipsAsmPrinter &AsmPrinter;
public:
MipsMCInstLower(MipsAsmPrinter &asmprinter);
- void Initialize(Mangler *mang, MCContext *C);
+ void Initialize(MCContext *C);
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
new file mode 100644
index 0000000..875dc0b
--- /dev/null
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -0,0 +1,406 @@
+//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasMSA : Predicate<"Subtarget.hasMSA()">,
+ AssemblerPredicate<"FeatureMSA">;
+
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let Predicates = [HasMSA];
+ let Inst{31-26} = 0b011110;
+}
+
+class MSACBranch : MSAInst {
+ let Inst{31-26} = 0b010001;
+}
+
+class MSASpecial : MSAInst {
+ let Inst{31-26} = 0b000000;
+}
+
+class PseudoMSA<dag outs, dag ins, list<dag> pattern,
+ InstrItinClass itin = IIPseudo>:
+ MipsPseudo<outs, ins, pattern, itin> {
+ let Predicates = [HasMSA];
+}
+
+class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+ bits<3> m;
+
+ let Inst{25-23} = major;
+ let Inst{22-19} = 0b1110;
+ let Inst{18-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_BIT_H_FMT<bits<3> major, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+ bits<4> m;
+
+ let Inst{25-23} = major;
+ let Inst{22-20} = 0b110;
+ let Inst{19-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_BIT_W_FMT<bits<3> major, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+ bits<5> m;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = 0b10;
+ let Inst{20-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_BIT_D_FMT<bits<3> major, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+ bits<6> m;
+
+ let Inst{25-23} = major;
+ let Inst{22} = 0b0;
+ let Inst{21-16} = m;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-18} = major;
+ let Inst{17-16} = df;
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-18} = major;
+ let Inst{17-16} = df;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_2RF_FMT<bits<9> major, bits<1> df, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-17} = major;
+ let Inst{16} = df;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_3R_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+ bits<5> wt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df;
+ let Inst{20-16} = wt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_3RF_FMT<bits<4> major, bits<1> df, bits<6> minor>: MSAInst {
+ bits<5> wt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21} = df;
+ let Inst{20-16} = wt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_3R_INDEX_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+ bits<5> rt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_FMT<bits<10> major, bits<6> minor>: MSAInst {
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-16} = major;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CFCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+ bits<5> rd;
+ bits<5> cs;
+
+ let Inst{25-16} = major;
+ let Inst{15-11} = cs;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CTCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+ bits<5> rs;
+ bits<5> cd;
+
+ let Inst{25-16} = major;
+ let Inst{15-11} = rs;
+ let Inst{10-6} = cd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = n{3-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = n{2-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-18} = 0b1100;
+ let Inst{17-16} = n{1-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-17} = 0b11100;
+ let Inst{16} = n{0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> rd;
+
+ let Inst{25-22} = major;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = n{3-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> rd;
+
+ let Inst{25-22} = major;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = n{2-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<4> n;
+ bits<5> ws;
+ bits<5> rd;
+
+ let Inst{25-22} = major;
+ let Inst{21-18} = 0b1100;
+ let Inst{17-16} = n{1-0};
+ let Inst{15-11} = ws;
+ let Inst{10-6} = rd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<6> n;
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-20} = 0b00;
+ let Inst{19-16} = n{3-0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<6> n;
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-19} = 0b100;
+ let Inst{18-16} = n{2-0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+ bits<6> n;
+ bits<5> rs;
+ bits<5> wd;
+
+ let Inst{25-22} = major;
+ let Inst{21-18} = 0b1100;
+ let Inst{17-16} = n{1-0};
+ let Inst{15-11} = rs;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+ bits<5> imm;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df;
+ let Inst{20-16} = imm;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_I8_FMT<bits<2> major, bits<6> minor>: MSAInst {
+ bits<8> u8;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-24} = major;
+ let Inst{23-16} = u8;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_I10_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+ bits<10> s10;
+ bits<5> wd;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df;
+ let Inst{20-11} = s10;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_MI10_FMT<bits<2> df, bits<4> minor>: MSAInst {
+ bits<21> addr;
+ bits<5> wd;
+
+ let Inst{25-16} = addr{9-0};
+ let Inst{15-11} = addr{20-16};
+ let Inst{10-6} = wd;
+ let Inst{5-2} = minor;
+ let Inst{1-0} = df;
+}
+
+class MSA_VEC_FMT<bits<5> major, bits<6> minor>: MSAInst {
+ bits<5> wt;
+ bits<5> ws;
+ bits<5> wd;
+
+ let Inst{25-21} = major;
+ let Inst{20-16} = wt;
+ let Inst{15-11} = ws;
+ let Inst{10-6} = wd;
+ let Inst{5-0} = minor;
+}
+
+class MSA_CBRANCH_FMT<bits<3> major, bits<2> df>: MSACBranch {
+ bits<16> offset;
+ bits<5> wt;
+
+ let Inst{25-23} = major;
+ let Inst{22-21} = df;
+ let Inst{20-16} = wt;
+ let Inst{15-0} = offset;
+}
+
+class MSA_CBRANCH_V_FMT<bits<5> major>: MSACBranch {
+ bits<16> offset;
+ bits<5> wt;
+
+ let Inst{25-21} = major;
+ let Inst{20-16} = wt;
+ let Inst{15-0} = offset;
+}
+
+class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+ bits<2> sa;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-8} = 0b000;
+ let Inst{7-6} = sa;
+ let Inst{5-0} = minor;
+}
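Most of the formats in this new file place each operand in one contiguous field, but MSA_MI10_FMT splits its 21-bit addr operand: addr{9-0} goes to Inst{25-16} and addr{20-16} to Inst{15-11}. Following the convention of the existing mem encodings earlier in this patch, where addr{20-16} carries the base register and the low bits carry the offset, a short C++ sketch (illustrative only, not LLVM's encoder) packs such an operand:

    #include <cassert>
    #include <cstdint>

    // Pack an MSA_MI10_FMT word from its fields; the register/offset roles are an
    // assumption based on the other mem operand formats in this file set.
    static uint32_t encodeMI10(unsigned BaseReg, unsigned Offset10, unsigned Wd,
                               unsigned Minor, unsigned Df) {
      uint32_t Addr = ((BaseReg & 0x1F) << 16) | (Offset10 & 0x3FF); // bits<21> addr
      uint32_t Inst = 0x1Eu << 26;                // MSAInst: Inst{31-26} = 0b011110
      Inst |= (Addr & 0x3FF) << 16;               // Inst{25-16} = addr{9-0}
      Inst |= ((Addr >> 16) & 0x1F) << 11;        // Inst{15-11} = addr{20-16}
      Inst |= (Wd & 0x1F) << 6;                   // Inst{10-6}  = wd
      Inst |= (Minor & 0xF) << 2;                 // Inst{5-2}   = minor
      Inst |= Df & 0x3;                           // Inst{1-0}   = df
      return Inst;
    }

    int main() {
      uint32_t I = encodeMI10(/*BaseReg=*/5, /*Offset10=*/0, /*Wd=*/3,
                              /*Minor=*/0x8, /*Df=*/0);
      assert(((I >> 26) & 0x3F) == 0x1E); // MSA major opcode preserved
      assert(((I >> 11) & 0x1F) == 5);    // base register landed in Inst{15-11}
      return 0;
    }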
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
new file mode 100644
index 0000000..82c51a6
--- /dev/null
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -0,0 +1,3694 @@
+//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips MSA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDT_VSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+ SDTCisInt<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, OtherVT>]>;
+def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+ SDTCisFP<1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisVT<3, OtherVT>]>;
+def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisInt<1>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
+def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+
+def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
+def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
+def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
+def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
+def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
+def MipsILVEV : SDNode<"MipsISD::ILVEV", SDT_ILV>;
+def MipsILVOD : SDNode<"MipsISD::ILVOD", SDT_ILV>;
+def MipsILVL : SDNode<"MipsISD::ILVL", SDT_ILV>;
+def MipsILVR : SDNode<"MipsISD::ILVR", SDT_ILV>;
+def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
+def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+
+def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
+def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
+
+def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT",
+ SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
+ SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+
+// Operands
+
+def uimm2 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+}
+
+// The immediate of an LSA instruction needs special handling
+// as the encoded value should be subtracted by one.
+def uimm2LSAAsmOperand : AsmOperandClass {
+ let Name = "LSAImm";
+ let ParserMethod = "parseLSAImm";
+ let RenderMethod = "addImmOperands";
+}
+
+def LSAImm : Operand<i32> {
+ let PrintMethod = "printUnsignedImm";
+ let EncoderMethod = "getLSAImmEncoding";
+ let DecoderMethod = "DecodeLSAImm";
+ let ParserMatchClass = uimm2LSAAsmOperand;
+}
+
+def uimm3 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm4 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm8 : Operand<i32> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def simm5 : Operand<i32>;
+
+def simm10 : Operand<i32>;
+
+def vsplat_uimm1 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm2 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm3 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm4 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm5 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm6 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm8 : Operand<vAny> {
+ let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_simm5 : Operand<vAny>;
+
+def vsplat_simm10 : Operand<vAny>;
+
+def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
+
+// Pattern fragments
+def vextract_sext_i8 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i8)>;
+def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i16)>;
+def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractSExt node:$vec, node:$idx, i32)>;
+
+def vextract_zext_i8 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i8)>;
+def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i16)>;
+def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
+ (MipsVExtractZExt node:$vec, node:$idx, i32)>;
+
+def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+ (v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+
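+// Matches a floating-point vector setcc with a specific condition code; the
+// result is an integer vector with the same number of elements.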
+class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> :
+ PatFrag<(ops node:$lhs, node:$rhs),
+ (ResTy (vfsetcc (OpTy node:$lhs), (OpTy node:$rhs), CC))>;
+
+// ISD::SETFALSE cannot occur
+def vfsetoeq_v4f32 : vfsetcc_type<v4i32, v4f32, SETOEQ>;
+def vfsetoeq_v2f64 : vfsetcc_type<v2i64, v2f64, SETOEQ>;
+def vfsetoge_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGE>;
+def vfsetoge_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGE>;
+def vfsetogt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGT>;
+def vfsetogt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGT>;
+def vfsetole_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLE>;
+def vfsetole_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLE>;
+def vfsetolt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLT>;
+def vfsetolt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLT>;
+def vfsetone_v4f32 : vfsetcc_type<v4i32, v4f32, SETONE>;
+def vfsetone_v2f64 : vfsetcc_type<v2i64, v2f64, SETONE>;
+def vfsetord_v4f32 : vfsetcc_type<v4i32, v4f32, SETO>;
+def vfsetord_v2f64 : vfsetcc_type<v2i64, v2f64, SETO>;
+def vfsetun_v4f32 : vfsetcc_type<v4i32, v4f32, SETUO>;
+def vfsetun_v2f64 : vfsetcc_type<v2i64, v2f64, SETUO>;
+def vfsetueq_v4f32 : vfsetcc_type<v4i32, v4f32, SETUEQ>;
+def vfsetueq_v2f64 : vfsetcc_type<v2i64, v2f64, SETUEQ>;
+def vfsetuge_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGE>;
+def vfsetuge_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGE>;
+def vfsetugt_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGT>;
+def vfsetugt_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGT>;
+def vfsetule_v4f32 : vfsetcc_type<v4i32, v4f32, SETULE>;
+def vfsetule_v2f64 : vfsetcc_type<v2i64, v2f64, SETULE>;
+def vfsetult_v4f32 : vfsetcc_type<v4i32, v4f32, SETULT>;
+def vfsetult_v2f64 : vfsetcc_type<v2i64, v2f64, SETULT>;
+def vfsetune_v4f32 : vfsetcc_type<v4i32, v4f32, SETUNE>;
+def vfsetune_v2f64 : vfsetcc_type<v2i64, v2f64, SETUNE>;
+// ISD::SETTRUE cannot occur
+// ISD::SETFALSE2 cannot occur
+// ISD::SETTRUE2 cannot occur
+
+class vsetcc_type<ValueType ResTy, CondCode CC> :
+ PatFrag<(ops node:$lhs, node:$rhs),
+ (ResTy (vsetcc node:$lhs, node:$rhs, CC))>;
+
+def vseteq_v16i8 : vsetcc_type<v16i8, SETEQ>;
+def vseteq_v8i16 : vsetcc_type<v8i16, SETEQ>;
+def vseteq_v4i32 : vsetcc_type<v4i32, SETEQ>;
+def vseteq_v2i64 : vsetcc_type<v2i64, SETEQ>;
+def vsetle_v16i8 : vsetcc_type<v16i8, SETLE>;
+def vsetle_v8i16 : vsetcc_type<v8i16, SETLE>;
+def vsetle_v4i32 : vsetcc_type<v4i32, SETLE>;
+def vsetle_v2i64 : vsetcc_type<v2i64, SETLE>;
+def vsetlt_v16i8 : vsetcc_type<v16i8, SETLT>;
+def vsetlt_v8i16 : vsetcc_type<v8i16, SETLT>;
+def vsetlt_v4i32 : vsetcc_type<v4i32, SETLT>;
+def vsetlt_v2i64 : vsetcc_type<v2i64, SETLT>;
+def vsetule_v16i8 : vsetcc_type<v16i8, SETULE>;
+def vsetule_v8i16 : vsetcc_type<v8i16, SETULE>;
+def vsetule_v4i32 : vsetcc_type<v4i32, SETULE>;
+def vsetule_v2i64 : vsetcc_type<v2i64, SETULE>;
+def vsetult_v16i8 : vsetcc_type<v16i8, SETULT>;
+def vsetult_v8i16 : vsetcc_type<v8i16, SETULT>;
+def vsetult_v4i32 : vsetcc_type<v4i32, SETULT>;
+def vsetult_v2i64 : vsetcc_type<v2i64, SETULT>;
+
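+// Fragments that match a build_vector whose elements are all copies of the
+// same operand, i.e. a splat.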
+def vsplati8 : PatFrag<(ops node:$e0),
+ (v16i8 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplati16 : PatFrag<(ops node:$e0),
+ (v8i16 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplati32 : PatFrag<(ops node:$e0),
+ (v4i32 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplati64 : PatFrag<(ops node:$e0),
+                          (v2i64 (build_vector node:$e0, node:$e0))>;
+def vsplatf32 : PatFrag<(ops node:$e0),
+ (v4f32 (build_vector node:$e0, node:$e0,
+ node:$e0, node:$e0))>;
+def vsplatf64 : PatFrag<(ops node:$e0),
+ (v2f64 (build_vector node:$e0, node:$e0))>;
+
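+// Fragments that splat element $i of vector $v into every lane by shuffling
+// $v with a control vector whose lanes are all $i (VSHF).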
+def vsplati8_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati8 node:$i), node:$v, node:$v)>;
+def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati16 node:$i), node:$v, node:$v)>;
+def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
+def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
+ (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+
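+// Helper classes that pair a splat pattern with the Operand class used to
+// print and encode its immediate; instruction descriptions refer to it as
+// SplatImm.OpClass.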
+class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
+ SDNodeXForm xform = NOOP_SDNodeXForm>
+ : PatLeaf<frag, pred, xform> {
+ Operand OpClass = opclass;
+}
+
+class SplatComplexPattern<Operand opclass, ValueType ty, int numops, string fn,
+ list<SDNode> roots = [],
+ list<SDNodeProperty> props = []> :
+ ComplexPattern<ty, numops, fn, roots, props> {
+ Operand OpClass = opclass;
+}
+
+def vsplati8_uimm3 : SplatComplexPattern<vsplat_uimm3, v16i8, 1,
+ "selectVSplatUimm3",
+ [build_vector, bitconvert]>;
+
+def vsplati8_uimm4 : SplatComplexPattern<vsplat_uimm4, v16i8, 1,
+ "selectVSplatUimm4",
+ [build_vector, bitconvert]>;
+
+def vsplati8_uimm5 : SplatComplexPattern<vsplat_uimm5, v16i8, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati8_uimm8 : SplatComplexPattern<vsplat_uimm8, v16i8, 1,
+ "selectVSplatUimm8",
+ [build_vector, bitconvert]>;
+
+def vsplati8_simm5 : SplatComplexPattern<vsplat_simm5, v16i8, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati16_uimm3 : SplatComplexPattern<vsplat_uimm3, v8i16, 1,
+ "selectVSplatUimm3",
+ [build_vector, bitconvert]>;
+
+def vsplati16_uimm4 : SplatComplexPattern<vsplat_uimm4, v8i16, 1,
+ "selectVSplatUimm4",
+ [build_vector, bitconvert]>;
+
+def vsplati16_uimm5 : SplatComplexPattern<vsplat_uimm5, v8i16, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati16_simm5 : SplatComplexPattern<vsplat_simm5, v8i16, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati32_uimm2 : SplatComplexPattern<vsplat_uimm2, v4i32, 1,
+ "selectVSplatUimm2",
+ [build_vector, bitconvert]>;
+
+def vsplati32_uimm5 : SplatComplexPattern<vsplat_uimm5, v4i32, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati32_simm5 : SplatComplexPattern<vsplat_simm5, v4i32, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati64_uimm1 : SplatComplexPattern<vsplat_uimm1, v2i64, 1,
+ "selectVSplatUimm1",
+ [build_vector, bitconvert]>;
+
+def vsplati64_uimm5 : SplatComplexPattern<vsplat_uimm5, v2i64, 1,
+ "selectVSplatUimm5",
+ [build_vector, bitconvert]>;
+
+def vsplati64_uimm6 : SplatComplexPattern<vsplat_uimm6, v2i64, 1,
+ "selectVSplatUimm6",
+ [build_vector, bitconvert]>;
+
+def vsplati64_simm5 : SplatComplexPattern<vsplat_simm5, v2i64, 1,
+ "selectVSplatSimm5",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is an exact
+// power of 2
+def vsplat_uimm_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmPow2",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is the bitwise
+// inverse of an exact power of 2
+def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of left-most bits set.
+def vsplat_maskl_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+ "selectVSplatMaskL",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of right-most bits set.
+def vsplat_maskr_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+ "selectVSplatMaskR",
+ [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that equals 1
+// FIXME: These should be ComplexPatterns, but we can't use them because the
+// ISel generator requires their uses to have a name, and providing a name
+// causes other errors ("used in pattern but not operand list").
+def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
+ APInt Imm;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat(N, Imm) &&
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+ APInt Imm;
+ SDNode *BV = N->getOperand(0).getNode();
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat(BV, Imm) &&
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
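+// Clear bit $wt of each element of $ws: (and $ws, (not (shl 1, $wt))).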
+def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
+ (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+ immAllOnesV))>;
+def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
+ (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+ immAllOnesV))>;
+def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
+ (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+ immAllOnesV))>;
+def vbclr_d : PatFrag<(ops node:$ws, node:$wt),
+ (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt),
+ (bitconvert (v4i32 immAllOnesV))))>;
+
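+// Toggle bit $wt of each element of $ws: (xor $ws, (shl 1, $wt)).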
+def vbneg_b : PatFrag<(ops node:$ws, node:$wt),
+ (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_h : PatFrag<(ops node:$ws, node:$wt),
+ (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_w : PatFrag<(ops node:$ws, node:$wt),
+ (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_d : PatFrag<(ops node:$ws, node:$wt),
+ (xor node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt))>;
+
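+// Set bit $wt of each element of $ws: (or $ws, (shl 1, $wt)).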
+def vbset_b : PatFrag<(ops node:$ws, node:$wt),
+ (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_h : PatFrag<(ops node:$ws, node:$wt),
+ (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_w : PatFrag<(ops node:$ws, node:$wt),
+ (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_d : PatFrag<(ops node:$ws, node:$wt),
+ (or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+ node:$wt))>;
+
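+// Multiply-accumulate fragments: fms is $wd - $ws * $wt, muladd and mulsub
+// are $wd +/- $ws * $wt, and mul_fexp2 is $ws * 2^$wt.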
+def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+ (fsub node:$wd, (fmul node:$ws, node:$wt))>;
+
+def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+ (add node:$wd, (mul node:$ws, node:$wt))>;
+
+def mulsub : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+ (sub node:$wd, (mul node:$ws, node:$wt))>;
+
+def mul_fexp2 : PatFrag<(ops node:$ws, node:$wt),
+ (fmul node:$ws, (fexp2 node:$wt))>;
+
+// Immediates
+def immSExt5 : ImmLeaf<i32, [{return isInt<5>(Imm);}]>;
+def immSExt10: ImmLeaf<i32, [{return isInt<10>(Imm);}]>;
+
+// Instruction encoding.
+class ADD_A_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010000>;
+class ADD_A_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010000>;
+class ADD_A_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010000>;
+class ADD_A_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010000>;
+
+class ADDS_A_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010000>;
+class ADDS_A_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010000>;
+class ADDS_A_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010000>;
+class ADDS_A_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010000>;
+
+class ADDS_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010000>;
+class ADDS_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010000>;
+class ADDS_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010000>;
+class ADDS_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010000>;
+
+class ADDS_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010000>;
+class ADDS_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010000>;
+class ADDS_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010000>;
+class ADDS_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010000>;
+
+class ADDV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001110>;
+class ADDV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001110>;
+class ADDV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001110>;
+class ADDV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001110>;
+
+class ADDVI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000110>;
+class ADDVI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000110>;
+class ADDVI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000110>;
+class ADDVI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000110>;
+
+class AND_V_ENC : MSA_VEC_FMT<0b00000, 0b011110>;
+
+class ANDI_B_ENC : MSA_I8_FMT<0b00, 0b000000>;
+
+class ASUB_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010001>;
+class ASUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010001>;
+class ASUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010001>;
+class ASUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010001>;
+
+class ASUB_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010001>;
+class ASUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010001>;
+class ASUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010001>;
+class ASUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010001>;
+
+class AVE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010000>;
+class AVE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010000>;
+class AVE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010000>;
+class AVE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010000>;
+
+class AVE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010000>;
+class AVE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010000>;
+class AVE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010000>;
+class AVE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010000>;
+
+class AVER_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010000>;
+class AVER_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010000>;
+class AVER_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010000>;
+class AVER_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010000>;
+
+class AVER_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010000>;
+class AVER_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010000>;
+class AVER_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010000>;
+class AVER_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010000>;
+
+class BCLR_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001101>;
+class BCLR_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001101>;
+class BCLR_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001101>;
+class BCLR_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001101>;
+
+class BCLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001001>;
+class BCLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001001>;
+class BCLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001001>;
+class BCLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001001>;
+
+class BINSL_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001101>;
+class BINSL_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001101>;
+class BINSL_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001101>;
+class BINSL_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001101>;
+
+class BINSLI_B_ENC : MSA_BIT_B_FMT<0b110, 0b001001>;
+class BINSLI_H_ENC : MSA_BIT_H_FMT<0b110, 0b001001>;
+class BINSLI_W_ENC : MSA_BIT_W_FMT<0b110, 0b001001>;
+class BINSLI_D_ENC : MSA_BIT_D_FMT<0b110, 0b001001>;
+
+class BINSR_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001101>;
+class BINSR_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001101>;
+class BINSR_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001101>;
+class BINSR_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001101>;
+
+class BINSRI_B_ENC : MSA_BIT_B_FMT<0b111, 0b001001>;
+class BINSRI_H_ENC : MSA_BIT_H_FMT<0b111, 0b001001>;
+class BINSRI_W_ENC : MSA_BIT_W_FMT<0b111, 0b001001>;
+class BINSRI_D_ENC : MSA_BIT_D_FMT<0b111, 0b001001>;
+
+class BMNZ_V_ENC : MSA_VEC_FMT<0b00100, 0b011110>;
+
+class BMNZI_B_ENC : MSA_I8_FMT<0b00, 0b000001>;
+
+class BMZ_V_ENC : MSA_VEC_FMT<0b00101, 0b011110>;
+
+class BMZI_B_ENC : MSA_I8_FMT<0b01, 0b000001>;
+
+class BNEG_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001101>;
+class BNEG_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001101>;
+class BNEG_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001101>;
+class BNEG_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001101>;
+
+class BNEGI_B_ENC : MSA_BIT_B_FMT<0b101, 0b001001>;
+class BNEGI_H_ENC : MSA_BIT_H_FMT<0b101, 0b001001>;
+class BNEGI_W_ENC : MSA_BIT_W_FMT<0b101, 0b001001>;
+class BNEGI_D_ENC : MSA_BIT_D_FMT<0b101, 0b001001>;
+
+class BNZ_B_ENC : MSA_CBRANCH_FMT<0b111, 0b00>;
+class BNZ_H_ENC : MSA_CBRANCH_FMT<0b111, 0b01>;
+class BNZ_W_ENC : MSA_CBRANCH_FMT<0b111, 0b10>;
+class BNZ_D_ENC : MSA_CBRANCH_FMT<0b111, 0b11>;
+
+class BNZ_V_ENC : MSA_CBRANCH_V_FMT<0b01111>;
+
+class BSEL_V_ENC : MSA_VEC_FMT<0b00110, 0b011110>;
+
+class BSELI_B_ENC : MSA_I8_FMT<0b10, 0b000001>;
+
+class BSET_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001101>;
+class BSET_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001101>;
+class BSET_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001101>;
+class BSET_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001101>;
+
+class BSETI_B_ENC : MSA_BIT_B_FMT<0b100, 0b001001>;
+class BSETI_H_ENC : MSA_BIT_H_FMT<0b100, 0b001001>;
+class BSETI_W_ENC : MSA_BIT_W_FMT<0b100, 0b001001>;
+class BSETI_D_ENC : MSA_BIT_D_FMT<0b100, 0b001001>;
+
+class BZ_B_ENC : MSA_CBRANCH_FMT<0b110, 0b00>;
+class BZ_H_ENC : MSA_CBRANCH_FMT<0b110, 0b01>;
+class BZ_W_ENC : MSA_CBRANCH_FMT<0b110, 0b10>;
+class BZ_D_ENC : MSA_CBRANCH_FMT<0b110, 0b11>;
+
+class BZ_V_ENC : MSA_CBRANCH_V_FMT<0b01011>;
+
+class CEQ_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001111>;
+class CEQ_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001111>;
+class CEQ_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001111>;
+class CEQ_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001111>;
+
+class CEQI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000111>;
+class CEQI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000111>;
+class CEQI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000111>;
+class CEQI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000111>;
+
+class CFCMSA_ENC : MSA_ELM_CFCMSA_FMT<0b0001111110, 0b011001>;
+
+class CLE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001111>;
+class CLE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001111>;
+class CLE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001111>;
+class CLE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001111>;
+
+class CLE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001111>;
+class CLE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001111>;
+class CLE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001111>;
+class CLE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001111>;
+
+class CLEI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000111>;
+class CLEI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000111>;
+class CLEI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000111>;
+class CLEI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000111>;
+
+class CLEI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000111>;
+class CLEI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000111>;
+class CLEI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000111>;
+class CLEI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000111>;
+
+class CLT_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001111>;
+class CLT_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001111>;
+class CLT_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001111>;
+class CLT_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001111>;
+
+class CLT_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001111>;
+class CLT_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001111>;
+class CLT_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001111>;
+class CLT_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001111>;
+
+class CLTI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000111>;
+class CLTI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000111>;
+class CLTI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000111>;
+class CLTI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000111>;
+
+class CLTI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000111>;
+class CLTI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000111>;
+class CLTI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000111>;
+class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
+
+class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>;
+class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
+class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+
+class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
+class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
+class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+
+class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
+
+class DIV_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010010>;
+class DIV_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010010>;
+class DIV_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010010>;
+class DIV_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010010>;
+
+class DIV_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010010>;
+class DIV_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010010>;
+class DIV_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010010>;
+class DIV_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010010>;
+
+class DOTP_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010011>;
+class DOTP_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010011>;
+class DOTP_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010011>;
+
+class DOTP_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010011>;
+class DOTP_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010011>;
+class DOTP_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010011>;
+
+class DPADD_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010011>;
+class DPADD_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010011>;
+class DPADD_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010011>;
+
+class DPADD_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010011>;
+class DPADD_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010011>;
+class DPADD_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010011>;
+
+class DPSUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010011>;
+class DPSUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010011>;
+class DPSUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010011>;
+
+class DPSUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010011>;
+class DPSUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010011>;
+class DPSUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010011>;
+
+class FADD_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011011>;
+class FADD_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011011>;
+
+class FCAF_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011010>;
+class FCAF_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011010>;
+
+class FCEQ_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011010>;
+class FCEQ_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011010>;
+
+class FCLASS_W_ENC : MSA_2RF_FMT<0b110010000, 0b0, 0b011110>;
+class FCLASS_D_ENC : MSA_2RF_FMT<0b110010000, 0b1, 0b011110>;
+
+class FCLE_W_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011010>;
+class FCLE_D_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011010>;
+
+class FCLT_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011010>;
+class FCLT_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011010>;
+
+class FCNE_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011100>;
+class FCNE_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011100>;
+
+class FCOR_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011100>;
+class FCOR_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011100>;
+
+class FCUEQ_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011010>;
+class FCUEQ_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011010>;
+
+class FCULE_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011010>;
+class FCULE_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011010>;
+
+class FCULT_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011010>;
+class FCULT_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011010>;
+
+class FCUN_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011010>;
+class FCUN_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011010>;
+
+class FCUNE_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011100>;
+class FCUNE_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011100>;
+
+class FDIV_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011011>;
+class FDIV_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011011>;
+
+class FEXDO_H_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011011>;
+class FEXDO_W_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011011>;
+
+class FEXP2_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011011>;
+class FEXP2_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011011>;
+
+class FEXUPL_W_ENC : MSA_2RF_FMT<0b110011000, 0b0, 0b011110>;
+class FEXUPL_D_ENC : MSA_2RF_FMT<0b110011000, 0b1, 0b011110>;
+
+class FEXUPR_W_ENC : MSA_2RF_FMT<0b110011001, 0b0, 0b011110>;
+class FEXUPR_D_ENC : MSA_2RF_FMT<0b110011001, 0b1, 0b011110>;
+
+class FFINT_S_W_ENC : MSA_2RF_FMT<0b110011110, 0b0, 0b011110>;
+class FFINT_S_D_ENC : MSA_2RF_FMT<0b110011110, 0b1, 0b011110>;
+
+class FFINT_U_W_ENC : MSA_2RF_FMT<0b110011111, 0b0, 0b011110>;
+class FFINT_U_D_ENC : MSA_2RF_FMT<0b110011111, 0b1, 0b011110>;
+
+class FFQL_W_ENC : MSA_2RF_FMT<0b110011010, 0b0, 0b011110>;
+class FFQL_D_ENC : MSA_2RF_FMT<0b110011010, 0b1, 0b011110>;
+
+class FFQR_W_ENC : MSA_2RF_FMT<0b110011011, 0b0, 0b011110>;
+class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
+
+class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
+class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
+class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+
+class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
+class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
+
+class FMADD_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011011>;
+class FMADD_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011011>;
+
+class FMAX_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011011>;
+class FMAX_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011011>;
+
+class FMAX_A_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011011>;
+class FMAX_A_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011011>;
+
+class FMIN_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011011>;
+class FMIN_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011011>;
+
+class FMIN_A_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011011>;
+class FMIN_A_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011011>;
+
+class FMSUB_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011011>;
+class FMSUB_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011011>;
+
+class FMUL_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011011>;
+class FMUL_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011011>;
+
+class FRINT_W_ENC : MSA_2RF_FMT<0b110010110, 0b0, 0b011110>;
+class FRINT_D_ENC : MSA_2RF_FMT<0b110010110, 0b1, 0b011110>;
+
+class FRCP_W_ENC : MSA_2RF_FMT<0b110010101, 0b0, 0b011110>;
+class FRCP_D_ENC : MSA_2RF_FMT<0b110010101, 0b1, 0b011110>;
+
+class FRSQRT_W_ENC : MSA_2RF_FMT<0b110010100, 0b0, 0b011110>;
+class FRSQRT_D_ENC : MSA_2RF_FMT<0b110010100, 0b1, 0b011110>;
+
+class FSAF_W_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011010>;
+class FSAF_D_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011010>;
+
+class FSEQ_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011010>;
+class FSEQ_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011010>;
+
+class FSLE_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011010>;
+class FSLE_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011010>;
+
+class FSLT_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011010>;
+class FSLT_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011010>;
+
+class FSNE_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011100>;
+class FSNE_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011100>;
+
+class FSOR_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011100>;
+class FSOR_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011100>;
+
+class FSQRT_W_ENC : MSA_2RF_FMT<0b110010011, 0b0, 0b011110>;
+class FSQRT_D_ENC : MSA_2RF_FMT<0b110010011, 0b1, 0b011110>;
+
+class FSUB_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011011>;
+class FSUB_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011011>;
+
+class FSUEQ_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011010>;
+class FSUEQ_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011010>;
+
+class FSULE_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011010>;
+class FSULE_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011010>;
+
+class FSULT_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011010>;
+class FSULT_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011010>;
+
+class FSUN_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011010>;
+class FSUN_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011010>;
+
+class FSUNE_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011100>;
+class FSUNE_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011100>;
+
+class FTINT_S_W_ENC : MSA_2RF_FMT<0b110011100, 0b0, 0b011110>;
+class FTINT_S_D_ENC : MSA_2RF_FMT<0b110011100, 0b1, 0b011110>;
+
+class FTINT_U_W_ENC : MSA_2RF_FMT<0b110011101, 0b0, 0b011110>;
+class FTINT_U_D_ENC : MSA_2RF_FMT<0b110011101, 0b1, 0b011110>;
+
+class FTQ_H_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011011>;
+class FTQ_W_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011011>;
+
+class FTRUNC_S_W_ENC : MSA_2RF_FMT<0b110010001, 0b0, 0b011110>;
+class FTRUNC_S_D_ENC : MSA_2RF_FMT<0b110010001, 0b1, 0b011110>;
+
+class FTRUNC_U_W_ENC : MSA_2RF_FMT<0b110010010, 0b0, 0b011110>;
+class FTRUNC_U_D_ENC : MSA_2RF_FMT<0b110010010, 0b1, 0b011110>;
+
+class HADD_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010101>;
+class HADD_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010101>;
+class HADD_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010101>;
+
+class HADD_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010101>;
+class HADD_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010101>;
+class HADD_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010101>;
+
+class HSUB_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010101>;
+class HSUB_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010101>;
+class HSUB_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010101>;
+
+class HSUB_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010101>;
+class HSUB_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010101>;
+class HSUB_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010101>;
+
+class ILVEV_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010100>;
+class ILVEV_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010100>;
+class ILVEV_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010100>;
+class ILVEV_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010100>;
+
+class ILVL_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010100>;
+class ILVL_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010100>;
+class ILVL_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010100>;
+class ILVL_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010100>;
+
+class ILVOD_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010100>;
+class ILVOD_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010100>;
+class ILVOD_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010100>;
+class ILVOD_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010100>;
+
+class ILVR_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010100>;
+class ILVR_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010100>;
+class ILVR_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010100>;
+class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
+
+class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
+class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
+class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+
+class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
+class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
+class INSVE_W_ENC : MSA_ELM_W_FMT<0b0101, 0b011001>;
+class INSVE_D_ENC : MSA_ELM_D_FMT<0b0101, 0b011001>;
+
+class LD_B_ENC : MSA_MI10_FMT<0b00, 0b1000>;
+class LD_H_ENC : MSA_MI10_FMT<0b01, 0b1000>;
+class LD_W_ENC : MSA_MI10_FMT<0b10, 0b1000>;
+class LD_D_ENC : MSA_MI10_FMT<0b11, 0b1000>;
+
+class LDI_B_ENC : MSA_I10_FMT<0b110, 0b00, 0b000111>;
+class LDI_H_ENC : MSA_I10_FMT<0b110, 0b01, 0b000111>;
+class LDI_W_ENC : MSA_I10_FMT<0b110, 0b10, 0b000111>;
+class LDI_D_ENC : MSA_I10_FMT<0b110, 0b11, 0b000111>;
+
+class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+
+class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
+class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
+
+class MADDR_Q_H_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011100>;
+class MADDR_Q_W_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011100>;
+
+class MADDV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010010>;
+class MADDV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010010>;
+class MADDV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010010>;
+class MADDV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010010>;
+
+class MAX_A_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001110>;
+class MAX_A_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001110>;
+class MAX_A_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001110>;
+class MAX_A_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001110>;
+
+class MAX_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001110>;
+class MAX_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001110>;
+class MAX_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001110>;
+class MAX_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001110>;
+
+class MAX_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001110>;
+class MAX_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001110>;
+class MAX_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001110>;
+class MAX_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001110>;
+
+class MAXI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000110>;
+class MAXI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000110>;
+class MAXI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000110>;
+class MAXI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000110>;
+
+class MAXI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000110>;
+class MAXI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000110>;
+class MAXI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000110>;
+class MAXI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000110>;
+
+class MIN_A_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001110>;
+class MIN_A_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001110>;
+class MIN_A_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001110>;
+class MIN_A_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001110>;
+
+class MIN_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001110>;
+class MIN_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001110>;
+class MIN_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001110>;
+class MIN_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001110>;
+
+class MIN_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001110>;
+class MIN_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001110>;
+class MIN_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001110>;
+class MIN_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001110>;
+
+class MINI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000110>;
+class MINI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000110>;
+class MINI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000110>;
+class MINI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000110>;
+
+class MINI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000110>;
+class MINI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000110>;
+class MINI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000110>;
+class MINI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000110>;
+
+class MOD_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010010>;
+class MOD_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010010>;
+class MOD_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010010>;
+class MOD_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010010>;
+
+class MOD_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010010>;
+class MOD_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010010>;
+class MOD_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010010>;
+class MOD_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010010>;
+
+class MOVE_V_ENC : MSA_ELM_FMT<0b0010111110, 0b011001>;
+
+class MSUB_Q_H_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011100>;
+class MSUB_Q_W_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011100>;
+
+class MSUBR_Q_H_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011100>;
+class MSUBR_Q_W_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011100>;
+
+class MSUBV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010010>;
+class MSUBV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010010>;
+class MSUBV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010010>;
+class MSUBV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010010>;
+
+class MUL_Q_H_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011100>;
+class MUL_Q_W_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011100>;
+
+class MULR_Q_H_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011100>;
+class MULR_Q_W_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011100>;
+
+class MULV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010010>;
+class MULV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010010>;
+class MULV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010010>;
+class MULV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010010>;
+
+class NLOC_B_ENC : MSA_2R_FMT<0b11000010, 0b00, 0b011110>;
+class NLOC_H_ENC : MSA_2R_FMT<0b11000010, 0b01, 0b011110>;
+class NLOC_W_ENC : MSA_2R_FMT<0b11000010, 0b10, 0b011110>;
+class NLOC_D_ENC : MSA_2R_FMT<0b11000010, 0b11, 0b011110>;
+
+class NLZC_B_ENC : MSA_2R_FMT<0b11000011, 0b00, 0b011110>;
+class NLZC_H_ENC : MSA_2R_FMT<0b11000011, 0b01, 0b011110>;
+class NLZC_W_ENC : MSA_2R_FMT<0b11000011, 0b10, 0b011110>;
+class NLZC_D_ENC : MSA_2R_FMT<0b11000011, 0b11, 0b011110>;
+
+class NOR_V_ENC : MSA_VEC_FMT<0b00010, 0b011110>;
+
+class NORI_B_ENC : MSA_I8_FMT<0b10, 0b000000>;
+
+class OR_V_ENC : MSA_VEC_FMT<0b00001, 0b011110>;
+
+class ORI_B_ENC : MSA_I8_FMT<0b01, 0b000000>;
+
+class PCKEV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010100>;
+class PCKEV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010100>;
+class PCKEV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010100>;
+class PCKEV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010100>;
+
+class PCKOD_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010100>;
+class PCKOD_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010100>;
+class PCKOD_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010100>;
+class PCKOD_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010100>;
+
+class PCNT_B_ENC : MSA_2R_FMT<0b11000001, 0b00, 0b011110>;
+class PCNT_H_ENC : MSA_2R_FMT<0b11000001, 0b01, 0b011110>;
+class PCNT_W_ENC : MSA_2R_FMT<0b11000001, 0b10, 0b011110>;
+class PCNT_D_ENC : MSA_2R_FMT<0b11000001, 0b11, 0b011110>;
+
+class SAT_S_B_ENC : MSA_BIT_B_FMT<0b000, 0b001010>;
+class SAT_S_H_ENC : MSA_BIT_H_FMT<0b000, 0b001010>;
+class SAT_S_W_ENC : MSA_BIT_W_FMT<0b000, 0b001010>;
+class SAT_S_D_ENC : MSA_BIT_D_FMT<0b000, 0b001010>;
+
+class SAT_U_B_ENC : MSA_BIT_B_FMT<0b001, 0b001010>;
+class SAT_U_H_ENC : MSA_BIT_H_FMT<0b001, 0b001010>;
+class SAT_U_W_ENC : MSA_BIT_W_FMT<0b001, 0b001010>;
+class SAT_U_D_ENC : MSA_BIT_D_FMT<0b001, 0b001010>;
+
+class SHF_B_ENC : MSA_I8_FMT<0b00, 0b000010>;
+class SHF_H_ENC : MSA_I8_FMT<0b01, 0b000010>;
+class SHF_W_ENC : MSA_I8_FMT<0b10, 0b000010>;
+
+class SLD_B_ENC : MSA_3R_INDEX_FMT<0b000, 0b00, 0b010100>;
+class SLD_H_ENC : MSA_3R_INDEX_FMT<0b000, 0b01, 0b010100>;
+class SLD_W_ENC : MSA_3R_INDEX_FMT<0b000, 0b10, 0b010100>;
+class SLD_D_ENC : MSA_3R_INDEX_FMT<0b000, 0b11, 0b010100>;
+
+class SLDI_B_ENC : MSA_ELM_B_FMT<0b0000, 0b011001>;
+class SLDI_H_ENC : MSA_ELM_H_FMT<0b0000, 0b011001>;
+class SLDI_W_ENC : MSA_ELM_W_FMT<0b0000, 0b011001>;
+class SLDI_D_ENC : MSA_ELM_D_FMT<0b0000, 0b011001>;
+
+class SLL_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001101>;
+class SLL_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001101>;
+class SLL_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001101>;
+class SLL_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001101>;
+
+class SLLI_B_ENC : MSA_BIT_B_FMT<0b000, 0b001001>;
+class SLLI_H_ENC : MSA_BIT_H_FMT<0b000, 0b001001>;
+class SLLI_W_ENC : MSA_BIT_W_FMT<0b000, 0b001001>;
+class SLLI_D_ENC : MSA_BIT_D_FMT<0b000, 0b001001>;
+
+class SPLAT_B_ENC : MSA_3R_INDEX_FMT<0b001, 0b00, 0b010100>;
+class SPLAT_H_ENC : MSA_3R_INDEX_FMT<0b001, 0b01, 0b010100>;
+class SPLAT_W_ENC : MSA_3R_INDEX_FMT<0b001, 0b10, 0b010100>;
+class SPLAT_D_ENC : MSA_3R_INDEX_FMT<0b001, 0b11, 0b010100>;
+
+class SPLATI_B_ENC : MSA_ELM_B_FMT<0b0001, 0b011001>;
+class SPLATI_H_ENC : MSA_ELM_H_FMT<0b0001, 0b011001>;
+class SPLATI_W_ENC : MSA_ELM_W_FMT<0b0001, 0b011001>;
+class SPLATI_D_ENC : MSA_ELM_D_FMT<0b0001, 0b011001>;
+
+class SRA_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001101>;
+class SRA_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001101>;
+class SRA_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001101>;
+class SRA_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001101>;
+
+class SRAI_B_ENC : MSA_BIT_B_FMT<0b001, 0b001001>;
+class SRAI_H_ENC : MSA_BIT_H_FMT<0b001, 0b001001>;
+class SRAI_W_ENC : MSA_BIT_W_FMT<0b001, 0b001001>;
+class SRAI_D_ENC : MSA_BIT_D_FMT<0b001, 0b001001>;
+
+class SRAR_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010101>;
+class SRAR_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010101>;
+class SRAR_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010101>;
+class SRAR_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010101>;
+
+class SRARI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001010>;
+class SRARI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001010>;
+class SRARI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001010>;
+class SRARI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001010>;
+
+class SRL_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001101>;
+class SRL_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001101>;
+class SRL_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001101>;
+class SRL_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001101>;
+
+class SRLI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001001>;
+class SRLI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001001>;
+class SRLI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001001>;
+class SRLI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001001>;
+
+class SRLR_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010101>;
+class SRLR_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010101>;
+class SRLR_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010101>;
+class SRLR_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010101>;
+
+class SRLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001010>;
+class SRLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001010>;
+class SRLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001010>;
+class SRLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001010>;
+
+class ST_B_ENC : MSA_MI10_FMT<0b00, 0b1001>;
+class ST_H_ENC : MSA_MI10_FMT<0b01, 0b1001>;
+class ST_W_ENC : MSA_MI10_FMT<0b10, 0b1001>;
+class ST_D_ENC : MSA_MI10_FMT<0b11, 0b1001>;
+
+class SUBS_S_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010001>;
+class SUBS_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010001>;
+class SUBS_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010001>;
+class SUBS_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010001>;
+
+class SUBS_U_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010001>;
+class SUBS_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010001>;
+class SUBS_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010001>;
+class SUBS_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010001>;
+
+class SUBSUS_U_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010001>;
+class SUBSUS_U_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010001>;
+class SUBSUS_U_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010001>;
+class SUBSUS_U_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010001>;
+
+class SUBSUU_S_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010001>;
+class SUBSUU_S_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010001>;
+class SUBSUU_S_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010001>;
+class SUBSUU_S_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010001>;
+
+class SUBV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001110>;
+class SUBV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001110>;
+class SUBV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001110>;
+class SUBV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001110>;
+
+class SUBVI_B_ENC : MSA_I5_FMT<0b001, 0b00, 0b000110>;
+class SUBVI_H_ENC : MSA_I5_FMT<0b001, 0b01, 0b000110>;
+class SUBVI_W_ENC : MSA_I5_FMT<0b001, 0b10, 0b000110>;
+class SUBVI_D_ENC : MSA_I5_FMT<0b001, 0b11, 0b000110>;
+
+class VSHF_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010101>;
+class VSHF_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010101>;
+class VSHF_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010101>;
+class VSHF_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010101>;
+
+class XOR_V_ENC : MSA_VEC_FMT<0b00011, 0b011110>;
+
+class XORI_B_ENC : MSA_I8_FMT<0b11, 0b000000>;
+
+// Instruction desc.
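+// Each *_DESC_BASE class supplies the operand lists, assembly string,
+// selection pattern and itinerary shared by a family of instructions; the
+// *_ENC classes above supply only the encoding bits.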
+class MSA_BIT_B_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm3:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_H_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm4:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm5:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ComplexPattern Imm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, vsplat_uimm6:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm3:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm4:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm5:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm6:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
+ ComplexPattern Mask, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, vsplat_uimm8:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$wd_in),
+ ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
+ RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskl_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
+ RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskr_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SplatComplexPattern SplatImm,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$m);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$m))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ValueType VecTy, RegisterOperand ROD,
+ RegisterOperand ROWS,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROD:$rd);
+ dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
+ list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), immZExt4:$n))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$n))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+ RegisterClass RCD, RegisterClass RCWS> :
+ MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
+ [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
+ bit usesCustomInserter = 1;
+}
+
+class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SplatComplexPattern SplatImm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $imm");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$imm))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SplatComplexPattern SplatImm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$u8);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$u8))];
+ InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed in the next few patches
+class MSA_I8_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt8:$u8))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+ list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins vsplat_simm10:$s10);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $s10");
+ // LDI is matched using custom matching code in MipsSEISelDAGToDAG.cpp
+ list<dag> Pattern = [];
+ bit hasSideEffects = 0;
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
+ SDPatternOperator OpNode, RegisterOperand ROWD,
+ RegisterOperand ROS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROS:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $rs");
+ list<dag> Pattern = [(set ROWD:$wd, (VT (OpNode ROS:$rs)))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
+ RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+ MipsPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+ [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+ let usesCustomInserter = 1;
+}
+
+class MSA_2RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_BINSX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+ ROWT:$wt))];
+ string Constraints = "$wd = $wd_in";
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_VSHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF ROWD:$wd_in, ROWS:$ws,
+ ROWT:$wt))];
+ string Constraints = "$wd = $wd_in";
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd,
+ (OpNode ROWD:$wd_in, ROWS:$ws, ROWT:$wt))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_3R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_3RF_4RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MSA_3R_4R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
+ string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = IIBranch;
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 1;
+ list<Register> Defs = [AT];
+}
+
+class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROS,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, uimm6:$n);
+ string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+ ROS:$rs,
+ immZExt6:$n))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+ RegisterOperand ROWD, RegisterOperand ROFS> :
+ MipsPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6:$n, ROFS:$fs),
+ [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+ immZExt6:$n))]> {
+ bit usesCustomInserter = 1;
+ string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws);
+ string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[0]");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+ immZExt6:$n,
+ ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+ string Constraints = "$wd = $wd_in";
+}
+
+class MSA_VEC_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+ list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SPLAT_DESC_BASE<string instr_asm, SplatComplexPattern SplatImm,
+ RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$n);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+ list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF SplatImm:$n, ROWS:$ws,
+ ROWS:$ws))];
+ InstrItinClass Itinerary = itin;
+}
+
+class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode, RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ RegisterOperand ROWT = ROWD> :
+ MipsPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
+ [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+
+class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
+ IsCommutable;
+class ADD_A_H_DESC : MSA_3R_DESC_BASE<"add_a.h", int_mips_add_a_h, MSA128HOpnd>,
+ IsCommutable;
+class ADD_A_W_DESC : MSA_3R_DESC_BASE<"add_a.w", int_mips_add_a_w, MSA128WOpnd>,
+ IsCommutable;
+class ADD_A_D_DESC : MSA_3R_DESC_BASE<"add_a.d", int_mips_add_a_d, MSA128DOpnd>,
+ IsCommutable;
+
+class ADDS_A_B_DESC : MSA_3R_DESC_BASE<"adds_a.b", int_mips_adds_a_b,
+ MSA128BOpnd>, IsCommutable;
+class ADDS_A_H_DESC : MSA_3R_DESC_BASE<"adds_a.h", int_mips_adds_a_h,
+ MSA128HOpnd>, IsCommutable;
+class ADDS_A_W_DESC : MSA_3R_DESC_BASE<"adds_a.w", int_mips_adds_a_w,
+ MSA128WOpnd>, IsCommutable;
+class ADDS_A_D_DESC : MSA_3R_DESC_BASE<"adds_a.d", int_mips_adds_a_d,
+ MSA128DOpnd>, IsCommutable;
+
+class ADDS_S_B_DESC : MSA_3R_DESC_BASE<"adds_s.b", int_mips_adds_s_b,
+ MSA128BOpnd>, IsCommutable;
+class ADDS_S_H_DESC : MSA_3R_DESC_BASE<"adds_s.h", int_mips_adds_s_h,
+ MSA128HOpnd>, IsCommutable;
+class ADDS_S_W_DESC : MSA_3R_DESC_BASE<"adds_s.w", int_mips_adds_s_w,
+ MSA128WOpnd>, IsCommutable;
+class ADDS_S_D_DESC : MSA_3R_DESC_BASE<"adds_s.d", int_mips_adds_s_d,
+ MSA128DOpnd>, IsCommutable;
+
+class ADDS_U_B_DESC : MSA_3R_DESC_BASE<"adds_u.b", int_mips_adds_u_b,
+ MSA128BOpnd>, IsCommutable;
+class ADDS_U_H_DESC : MSA_3R_DESC_BASE<"adds_u.h", int_mips_adds_u_h,
+ MSA128HOpnd>, IsCommutable;
+class ADDS_U_W_DESC : MSA_3R_DESC_BASE<"adds_u.w", int_mips_adds_u_w,
+ MSA128WOpnd>, IsCommutable;
+class ADDS_U_D_DESC : MSA_3R_DESC_BASE<"adds_u.d", int_mips_adds_u_d,
+ MSA128DOpnd>, IsCommutable;
+
+class ADDV_B_DESC : MSA_3R_DESC_BASE<"addv.b", add, MSA128BOpnd>, IsCommutable;
+class ADDV_H_DESC : MSA_3R_DESC_BASE<"addv.h", add, MSA128HOpnd>, IsCommutable;
+class ADDV_W_DESC : MSA_3R_DESC_BASE<"addv.w", add, MSA128WOpnd>, IsCommutable;
+class ADDV_D_DESC : MSA_3R_DESC_BASE<"addv.d", add, MSA128DOpnd>, IsCommutable;
+
+class ADDVI_B_DESC : MSA_I5_DESC_BASE<"addvi.b", add, vsplati8_uimm5,
+ MSA128BOpnd>;
+class ADDVI_H_DESC : MSA_I5_DESC_BASE<"addvi.h", add, vsplati16_uimm5,
+ MSA128HOpnd>;
+class ADDVI_W_DESC : MSA_I5_DESC_BASE<"addvi.w", add, vsplati32_uimm5,
+ MSA128WOpnd>;
+class ADDVI_D_DESC : MSA_I5_DESC_BASE<"addvi.d", add, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+class AND_V_DESC : MSA_VEC_DESC_BASE<"and.v", and, MSA128BOpnd>;
+class AND_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128HOpnd>;
+class AND_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128WOpnd>;
+class AND_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128DOpnd>;
+
+class ANDI_B_DESC : MSA_I8_DESC_BASE<"andi.b", and, vsplati8_uimm8,
+ MSA128BOpnd>;
+
+class ASUB_S_B_DESC : MSA_3R_DESC_BASE<"asub_s.b", int_mips_asub_s_b,
+ MSA128BOpnd>;
+class ASUB_S_H_DESC : MSA_3R_DESC_BASE<"asub_s.h", int_mips_asub_s_h,
+ MSA128HOpnd>;
+class ASUB_S_W_DESC : MSA_3R_DESC_BASE<"asub_s.w", int_mips_asub_s_w,
+ MSA128WOpnd>;
+class ASUB_S_D_DESC : MSA_3R_DESC_BASE<"asub_s.d", int_mips_asub_s_d,
+ MSA128DOpnd>;
+
+class ASUB_U_B_DESC : MSA_3R_DESC_BASE<"asub_u.b", int_mips_asub_u_b,
+ MSA128BOpnd>;
+class ASUB_U_H_DESC : MSA_3R_DESC_BASE<"asub_u.h", int_mips_asub_u_h,
+ MSA128HOpnd>;
+class ASUB_U_W_DESC : MSA_3R_DESC_BASE<"asub_u.w", int_mips_asub_u_w,
+ MSA128WOpnd>;
+class ASUB_U_D_DESC : MSA_3R_DESC_BASE<"asub_u.d", int_mips_asub_u_d,
+ MSA128DOpnd>;
+
+class AVE_S_B_DESC : MSA_3R_DESC_BASE<"ave_s.b", int_mips_ave_s_b, MSA128BOpnd>,
+ IsCommutable;
+class AVE_S_H_DESC : MSA_3R_DESC_BASE<"ave_s.h", int_mips_ave_s_h, MSA128HOpnd>,
+ IsCommutable;
+class AVE_S_W_DESC : MSA_3R_DESC_BASE<"ave_s.w", int_mips_ave_s_w, MSA128WOpnd>,
+ IsCommutable;
+class AVE_S_D_DESC : MSA_3R_DESC_BASE<"ave_s.d", int_mips_ave_s_d, MSA128DOpnd>,
+ IsCommutable;
+
+class AVE_U_B_DESC : MSA_3R_DESC_BASE<"ave_u.b", int_mips_ave_u_b, MSA128BOpnd>,
+ IsCommutable;
+class AVE_U_H_DESC : MSA_3R_DESC_BASE<"ave_u.h", int_mips_ave_u_h, MSA128HOpnd>,
+ IsCommutable;
+class AVE_U_W_DESC : MSA_3R_DESC_BASE<"ave_u.w", int_mips_ave_u_w, MSA128WOpnd>,
+ IsCommutable;
+class AVE_U_D_DESC : MSA_3R_DESC_BASE<"ave_u.d", int_mips_ave_u_d, MSA128DOpnd>,
+ IsCommutable;
+
+class AVER_S_B_DESC : MSA_3R_DESC_BASE<"aver_s.b", int_mips_aver_s_b,
+ MSA128BOpnd>, IsCommutable;
+class AVER_S_H_DESC : MSA_3R_DESC_BASE<"aver_s.h", int_mips_aver_s_h,
+ MSA128HOpnd>, IsCommutable;
+class AVER_S_W_DESC : MSA_3R_DESC_BASE<"aver_s.w", int_mips_aver_s_w,
+ MSA128WOpnd>, IsCommutable;
+class AVER_S_D_DESC : MSA_3R_DESC_BASE<"aver_s.d", int_mips_aver_s_d,
+ MSA128DOpnd>, IsCommutable;
+
+class AVER_U_B_DESC : MSA_3R_DESC_BASE<"aver_u.b", int_mips_aver_u_b,
+ MSA128BOpnd>, IsCommutable;
+class AVER_U_H_DESC : MSA_3R_DESC_BASE<"aver_u.h", int_mips_aver_u_h,
+ MSA128HOpnd>, IsCommutable;
+class AVER_U_W_DESC : MSA_3R_DESC_BASE<"aver_u.w", int_mips_aver_u_w,
+ MSA128WOpnd>, IsCommutable;
+class AVER_U_D_DESC : MSA_3R_DESC_BASE<"aver_u.d", int_mips_aver_u_d,
+ MSA128DOpnd>, IsCommutable;
+
+class BCLR_B_DESC : MSA_3R_DESC_BASE<"bclr.b", vbclr_b, MSA128BOpnd>;
+class BCLR_H_DESC : MSA_3R_DESC_BASE<"bclr.h", vbclr_h, MSA128HOpnd>;
+class BCLR_W_DESC : MSA_3R_DESC_BASE<"bclr.w", vbclr_w, MSA128WOpnd>;
+class BCLR_D_DESC : MSA_3R_DESC_BASE<"bclr.d", vbclr_d, MSA128DOpnd>;
+
+class BCLRI_B_DESC : MSA_BIT_B_DESC_BASE<"bclri.b", and, vsplat_uimm_inv_pow2,
+ MSA128BOpnd>;
+class BCLRI_H_DESC : MSA_BIT_H_DESC_BASE<"bclri.h", and, vsplat_uimm_inv_pow2,
+ MSA128HOpnd>;
+class BCLRI_W_DESC : MSA_BIT_W_DESC_BASE<"bclri.w", and, vsplat_uimm_inv_pow2,
+ MSA128WOpnd>;
+class BCLRI_D_DESC : MSA_BIT_D_DESC_BASE<"bclri.d", and, vsplat_uimm_inv_pow2,
+ MSA128DOpnd>;
+
+class BINSL_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.b", int_mips_binsl_b,
+ MSA128BOpnd>;
+class BINSL_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.h", int_mips_binsl_h,
+ MSA128HOpnd>;
+class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
+ MSA128WOpnd>;
+class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
+ MSA128DOpnd>;
+
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, MSA128DOpnd>;
+
+class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
+ MSA128BOpnd>;
+class BINSR_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.h", int_mips_binsr_h,
+ MSA128HOpnd>;
+class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
+ MSA128WOpnd>;
+class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
+ MSA128DOpnd>;
+
+class BINSRI_B_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, MSA128BOpnd>;
+class BINSRI_H_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, MSA128HOpnd>;
+class BINSRI_W_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, MSA128WOpnd>;
+class BINSRI_D_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, MSA128DOpnd>;
+
+class BMNZ_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt);
+ string AsmString = "bmnz.v\t$wd, $ws, $wt";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wd_in))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$wd = $wd_in";
+}
+
+class BMNZI_B_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ vsplat_uimm8:$u8);
+ string AsmString = "bmnzi.b\t$wd, $ws, $u8";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wd_in))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$wd = $wd_in";
+}
+
+class BMZ_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt);
+ string AsmString = "bmz.v\t$wd, $ws, $wt";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+ MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$wd = $wd_in";
+}
+
+class BMZI_B_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ vsplat_uimm8:$u8);
+ string AsmString = "bmzi.b\t$wd, $ws, $u8";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+ MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$wd = $wd_in";
+}
+
+class BNEG_B_DESC : MSA_3R_DESC_BASE<"bneg.b", vbneg_b, MSA128BOpnd>;
+class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
+class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
+class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
+
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2,
+ MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2,
+ MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2,
+ MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2,
+ MSA128DOpnd>;
+
+class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
+class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
+class BNZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bnz.w", MSA128WOpnd>;
+class BNZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bnz.d", MSA128DOpnd>;
+
+class BNZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bnz.v", MSA128BOpnd>;
+
+class BSEL_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt);
+ string AsmString = "bsel.v\t$wd, $ws, $wt";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd,
+ (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$wd = $wd_in";
+}
+
+class BSELI_B_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+ vsplat_uimm8:$u8);
+ string AsmString = "bseli.b\t$wd, $ws, $u8";
+ list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws,
+ vsplati8_uimm8:$u8))];
+ InstrItinClass Itinerary = NoItinerary;
+ string Constraints = "$wd = $wd_in";
+}
+
+class BSET_B_DESC : MSA_3R_DESC_BASE<"bset.b", vbset_b, MSA128BOpnd>;
+class BSET_H_DESC : MSA_3R_DESC_BASE<"bset.h", vbset_h, MSA128HOpnd>;
+class BSET_W_DESC : MSA_3R_DESC_BASE<"bset.w", vbset_w, MSA128WOpnd>;
+class BSET_D_DESC : MSA_3R_DESC_BASE<"bset.d", vbset_d, MSA128DOpnd>;
+
+class BSETI_B_DESC : MSA_BIT_B_DESC_BASE<"bseti.b", or, vsplat_uimm_pow2,
+ MSA128BOpnd>;
+class BSETI_H_DESC : MSA_BIT_H_DESC_BASE<"bseti.h", or, vsplat_uimm_pow2,
+ MSA128HOpnd>;
+class BSETI_W_DESC : MSA_BIT_W_DESC_BASE<"bseti.w", or, vsplat_uimm_pow2,
+ MSA128WOpnd>;
+class BSETI_D_DESC : MSA_BIT_D_DESC_BASE<"bseti.d", or, vsplat_uimm_pow2,
+ MSA128DOpnd>;
+
+class BZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bz.b", MSA128BOpnd>;
+class BZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bz.h", MSA128HOpnd>;
+class BZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bz.w", MSA128WOpnd>;
+class BZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bz.d", MSA128DOpnd>;
+
+class BZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bz.v", MSA128BOpnd>;
+
+class CEQ_B_DESC : MSA_3R_DESC_BASE<"ceq.b", vseteq_v16i8, MSA128BOpnd>,
+ IsCommutable;
+class CEQ_H_DESC : MSA_3R_DESC_BASE<"ceq.h", vseteq_v8i16, MSA128HOpnd>,
+ IsCommutable;
+class CEQ_W_DESC : MSA_3R_DESC_BASE<"ceq.w", vseteq_v4i32, MSA128WOpnd>,
+ IsCommutable;
+class CEQ_D_DESC : MSA_3R_DESC_BASE<"ceq.d", vseteq_v2i64, MSA128DOpnd>,
+ IsCommutable;
+
+class CEQI_B_DESC : MSA_I5_DESC_BASE<"ceqi.b", vseteq_v16i8, vsplati8_simm5,
+ MSA128BOpnd>;
+class CEQI_H_DESC : MSA_I5_DESC_BASE<"ceqi.h", vseteq_v8i16, vsplati16_simm5,
+ MSA128HOpnd>;
+class CEQI_W_DESC : MSA_I5_DESC_BASE<"ceqi.w", vseteq_v4i32, vsplati32_simm5,
+ MSA128WOpnd>;
+class CEQI_D_DESC : MSA_I5_DESC_BASE<"ceqi.d", vseteq_v2i64, vsplati64_simm5,
+ MSA128DOpnd>;
+
+class CFCMSA_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins MSA128CROpnd:$cs);
+ string AsmString = "cfcmsa\t$rd, $cs";
+ InstrItinClass Itinerary = NoItinerary;
+ bit hasSideEffects = 1;
+}
+
+class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
+class CLE_S_H_DESC : MSA_3R_DESC_BASE<"cle_s.h", vsetle_v8i16, MSA128HOpnd>;
+class CLE_S_W_DESC : MSA_3R_DESC_BASE<"cle_s.w", vsetle_v4i32, MSA128WOpnd>;
+class CLE_S_D_DESC : MSA_3R_DESC_BASE<"cle_s.d", vsetle_v2i64, MSA128DOpnd>;
+
+class CLE_U_B_DESC : MSA_3R_DESC_BASE<"cle_u.b", vsetule_v16i8, MSA128BOpnd>;
+class CLE_U_H_DESC : MSA_3R_DESC_BASE<"cle_u.h", vsetule_v8i16, MSA128HOpnd>;
+class CLE_U_W_DESC : MSA_3R_DESC_BASE<"cle_u.w", vsetule_v4i32, MSA128WOpnd>;
+class CLE_U_D_DESC : MSA_3R_DESC_BASE<"cle_u.d", vsetule_v2i64, MSA128DOpnd>;
+
+class CLEI_S_B_DESC : MSA_I5_DESC_BASE<"clei_s.b", vsetle_v16i8,
+ vsplati8_simm5, MSA128BOpnd>;
+class CLEI_S_H_DESC : MSA_I5_DESC_BASE<"clei_s.h", vsetle_v8i16,
+ vsplati16_simm5, MSA128HOpnd>;
+class CLEI_S_W_DESC : MSA_I5_DESC_BASE<"clei_s.w", vsetle_v4i32,
+ vsplati32_simm5, MSA128WOpnd>;
+class CLEI_S_D_DESC : MSA_I5_DESC_BASE<"clei_s.d", vsetle_v2i64,
+ vsplati64_simm5, MSA128DOpnd>;
+
+class CLEI_U_B_DESC : MSA_I5_DESC_BASE<"clei_u.b", vsetule_v16i8,
+ vsplati8_uimm5, MSA128BOpnd>;
+class CLEI_U_H_DESC : MSA_I5_DESC_BASE<"clei_u.h", vsetule_v8i16,
+ vsplati16_uimm5, MSA128HOpnd>;
+class CLEI_U_W_DESC : MSA_I5_DESC_BASE<"clei_u.w", vsetule_v4i32,
+ vsplati32_uimm5, MSA128WOpnd>;
+class CLEI_U_D_DESC : MSA_I5_DESC_BASE<"clei_u.d", vsetule_v2i64,
+ vsplati64_uimm5, MSA128DOpnd>;
+
+class CLT_S_B_DESC : MSA_3R_DESC_BASE<"clt_s.b", vsetlt_v16i8, MSA128BOpnd>;
+class CLT_S_H_DESC : MSA_3R_DESC_BASE<"clt_s.h", vsetlt_v8i16, MSA128HOpnd>;
+class CLT_S_W_DESC : MSA_3R_DESC_BASE<"clt_s.w", vsetlt_v4i32, MSA128WOpnd>;
+class CLT_S_D_DESC : MSA_3R_DESC_BASE<"clt_s.d", vsetlt_v2i64, MSA128DOpnd>;
+
+class CLT_U_B_DESC : MSA_3R_DESC_BASE<"clt_u.b", vsetult_v16i8, MSA128BOpnd>;
+class CLT_U_H_DESC : MSA_3R_DESC_BASE<"clt_u.h", vsetult_v8i16, MSA128HOpnd>;
+class CLT_U_W_DESC : MSA_3R_DESC_BASE<"clt_u.w", vsetult_v4i32, MSA128WOpnd>;
+class CLT_U_D_DESC : MSA_3R_DESC_BASE<"clt_u.d", vsetult_v2i64, MSA128DOpnd>;
+
+class CLTI_S_B_DESC : MSA_I5_DESC_BASE<"clti_s.b", vsetlt_v16i8,
+ vsplati8_simm5, MSA128BOpnd>;
+class CLTI_S_H_DESC : MSA_I5_DESC_BASE<"clti_s.h", vsetlt_v8i16,
+ vsplati16_simm5, MSA128HOpnd>;
+class CLTI_S_W_DESC : MSA_I5_DESC_BASE<"clti_s.w", vsetlt_v4i32,
+ vsplati32_simm5, MSA128WOpnd>;
+class CLTI_S_D_DESC : MSA_I5_DESC_BASE<"clti_s.d", vsetlt_v2i64,
+ vsplati64_simm5, MSA128DOpnd>;
+
+class CLTI_U_B_DESC : MSA_I5_DESC_BASE<"clti_u.b", vsetult_v16i8,
+ vsplati8_uimm5, MSA128BOpnd>;
+class CLTI_U_H_DESC : MSA_I5_DESC_BASE<"clti_u.h", vsetult_v8i16,
+ vsplati16_uimm5, MSA128HOpnd>;
+class CLTI_U_W_DESC : MSA_I5_DESC_BASE<"clti_u.w", vsetult_v4i32,
+ vsplati32_uimm5, MSA128WOpnd>;
+class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
+ vsplati64_uimm5, MSA128DOpnd>;
+
+class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8, v16i8,
+ GPR32Opnd, MSA128BOpnd>;
+class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
+ GPR32Opnd, MSA128HOpnd>;
+class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
+ GPR32Opnd, MSA128WOpnd>;
+
+class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8, v16i8,
+ GPR32Opnd, MSA128BOpnd>;
+class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
+ GPR32Opnd, MSA128HOpnd>;
+class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
+ GPR32Opnd, MSA128WOpnd>;
+
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
+ MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
+ MSA128D>;
+
+class CTCMSA_DESC {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins MSA128CROpnd:$cd, GPR32Opnd:$rs);
+ string AsmString = "ctcmsa\t$cd, $rs";
+ InstrItinClass Itinerary = NoItinerary;
+ bit hasSideEffects = 1;
+}
+
+class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
+class DIV_S_H_DESC : MSA_3R_DESC_BASE<"div_s.h", sdiv, MSA128HOpnd>;
+class DIV_S_W_DESC : MSA_3R_DESC_BASE<"div_s.w", sdiv, MSA128WOpnd>;
+class DIV_S_D_DESC : MSA_3R_DESC_BASE<"div_s.d", sdiv, MSA128DOpnd>;
+
+class DIV_U_B_DESC : MSA_3R_DESC_BASE<"div_u.b", udiv, MSA128BOpnd>;
+class DIV_U_H_DESC : MSA_3R_DESC_BASE<"div_u.h", udiv, MSA128HOpnd>;
+class DIV_U_W_DESC : MSA_3R_DESC_BASE<"div_u.w", udiv, MSA128WOpnd>;
+class DIV_U_D_DESC : MSA_3R_DESC_BASE<"div_u.d", udiv, MSA128DOpnd>;
+
+class DOTP_S_H_DESC : MSA_3R_DESC_BASE<"dotp_s.h", int_mips_dotp_s_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+ IsCommutable;
+class DOTP_S_W_DESC : MSA_3R_DESC_BASE<"dotp_s.w", int_mips_dotp_s_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+ IsCommutable;
+class DOTP_S_D_DESC : MSA_3R_DESC_BASE<"dotp_s.d", int_mips_dotp_s_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+ IsCommutable;
+
+class DOTP_U_H_DESC : MSA_3R_DESC_BASE<"dotp_u.h", int_mips_dotp_u_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+ IsCommutable;
+class DOTP_U_W_DESC : MSA_3R_DESC_BASE<"dotp_u.w", int_mips_dotp_u_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+ IsCommutable;
+class DOTP_U_D_DESC : MSA_3R_DESC_BASE<"dotp_u.d", int_mips_dotp_u_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+ IsCommutable;
+
+class DPADD_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.h", int_mips_dpadd_s_h,
+ MSA128HOpnd, MSA128BOpnd,
+ MSA128BOpnd>, IsCommutable;
+class DPADD_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.w", int_mips_dpadd_s_w,
+ MSA128WOpnd, MSA128HOpnd,
+ MSA128HOpnd>, IsCommutable;
+class DPADD_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.d", int_mips_dpadd_s_d,
+ MSA128DOpnd, MSA128WOpnd,
+ MSA128WOpnd>, IsCommutable;
+
+class DPADD_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.h", int_mips_dpadd_u_h,
+ MSA128HOpnd, MSA128BOpnd,
+ MSA128BOpnd>, IsCommutable;
+class DPADD_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.w", int_mips_dpadd_u_w,
+ MSA128WOpnd, MSA128HOpnd,
+ MSA128HOpnd>, IsCommutable;
+class DPADD_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.d", int_mips_dpadd_u_d,
+ MSA128DOpnd, MSA128WOpnd,
+ MSA128WOpnd>, IsCommutable;
+
+class DPSUB_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.h", int_mips_dpsub_s_h,
+ MSA128HOpnd, MSA128BOpnd,
+ MSA128BOpnd>;
+class DPSUB_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.w", int_mips_dpsub_s_w,
+ MSA128WOpnd, MSA128HOpnd,
+ MSA128HOpnd>;
+class DPSUB_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.d", int_mips_dpsub_s_d,
+ MSA128DOpnd, MSA128WOpnd,
+ MSA128WOpnd>;
+
+class DPSUB_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.h", int_mips_dpsub_u_h,
+ MSA128HOpnd, MSA128BOpnd,
+ MSA128BOpnd>;
+class DPSUB_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.w", int_mips_dpsub_u_w,
+ MSA128WOpnd, MSA128HOpnd,
+ MSA128HOpnd>;
+class DPSUB_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.d", int_mips_dpsub_u_d,
+ MSA128DOpnd, MSA128WOpnd,
+ MSA128WOpnd>;
+
+class FADD_W_DESC : MSA_3RF_DESC_BASE<"fadd.w", fadd, MSA128WOpnd>,
+ IsCommutable;
+class FADD_D_DESC : MSA_3RF_DESC_BASE<"fadd.d", fadd, MSA128DOpnd>,
+ IsCommutable;
+
+class FCAF_W_DESC : MSA_3RF_DESC_BASE<"fcaf.w", int_mips_fcaf_w, MSA128WOpnd>,
+ IsCommutable;
+class FCAF_D_DESC : MSA_3RF_DESC_BASE<"fcaf.d", int_mips_fcaf_d, MSA128DOpnd>,
+ IsCommutable;
+
+class FCEQ_W_DESC : MSA_3RF_DESC_BASE<"fceq.w", vfsetoeq_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCEQ_D_DESC : MSA_3RF_DESC_BASE<"fceq.d", vfsetoeq_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCLASS_W_DESC : MSA_2RF_DESC_BASE<"fclass.w", int_mips_fclass_w,
+ MSA128WOpnd>;
+class FCLASS_D_DESC : MSA_2RF_DESC_BASE<"fclass.d", int_mips_fclass_d,
+ MSA128DOpnd>;
+
+class FCLE_W_DESC : MSA_3RF_DESC_BASE<"fcle.w", vfsetole_v4f32, MSA128WOpnd>;
+class FCLE_D_DESC : MSA_3RF_DESC_BASE<"fcle.d", vfsetole_v2f64, MSA128DOpnd>;
+
+class FCLT_W_DESC : MSA_3RF_DESC_BASE<"fclt.w", vfsetolt_v4f32, MSA128WOpnd>;
+class FCLT_D_DESC : MSA_3RF_DESC_BASE<"fclt.d", vfsetolt_v2f64, MSA128DOpnd>;
+
+class FCNE_W_DESC : MSA_3RF_DESC_BASE<"fcne.w", vfsetone_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCNE_D_DESC : MSA_3RF_DESC_BASE<"fcne.d", vfsetone_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCOR_W_DESC : MSA_3RF_DESC_BASE<"fcor.w", vfsetord_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCOR_D_DESC : MSA_3RF_DESC_BASE<"fcor.d", vfsetord_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCUEQ_W_DESC : MSA_3RF_DESC_BASE<"fcueq.w", vfsetueq_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCUEQ_D_DESC : MSA_3RF_DESC_BASE<"fcueq.d", vfsetueq_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCULE_W_DESC : MSA_3RF_DESC_BASE<"fcule.w", vfsetule_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCULE_D_DESC : MSA_3RF_DESC_BASE<"fcule.d", vfsetule_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCULT_W_DESC : MSA_3RF_DESC_BASE<"fcult.w", vfsetult_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCULT_D_DESC : MSA_3RF_DESC_BASE<"fcult.d", vfsetult_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCUN_W_DESC : MSA_3RF_DESC_BASE<"fcun.w", vfsetun_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCUN_D_DESC : MSA_3RF_DESC_BASE<"fcun.d", vfsetun_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FCUNE_W_DESC : MSA_3RF_DESC_BASE<"fcune.w", vfsetune_v4f32, MSA128WOpnd>,
+ IsCommutable;
+class FCUNE_D_DESC : MSA_3RF_DESC_BASE<"fcune.d", vfsetune_v2f64, MSA128DOpnd>,
+ IsCommutable;
+
+class FDIV_W_DESC : MSA_3RF_DESC_BASE<"fdiv.w", fdiv, MSA128WOpnd>;
+class FDIV_D_DESC : MSA_3RF_DESC_BASE<"fdiv.d", fdiv, MSA128DOpnd>;
+
+class FEXDO_H_DESC : MSA_3RF_DESC_BASE<"fexdo.h", int_mips_fexdo_h,
+ MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
+ MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// The fexp2.df instruction multiplies its first operand by 2 raised to the
+// power of its second operand. We therefore need a pseudo-instruction to
+// supply the implicit 1.0 first operand when all we have to match is
+// ISD::FEXP2.
+class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
+class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
+let usesCustomInserter = 1 in {
+ class FEXP2_W_1_PSEUDO_DESC :
+ MipsPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+ [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+ class FEXP2_D_1_PSEUDO_DESC :
+ MipsPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+ [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+}
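+// A sketch of the expansion the custom inserter is assumed to perform for
+// these pseudos (the real expansion lives in the C++ lowering code, not in
+// this file; register names below are purely illustrative):
+//   ldi.w     $w1, 1          # splat the integer 1 into every element
+//   ffint_u.w $w1, $w1        # convert the splat to floating-point 1.0
+//   fexp2.w   $wd, $w1, $ws   # 1.0 * 2^$ws yields the required 2^$ws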
+
+class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FEXUPL_D_DESC : MSA_2RF_DESC_BASE<"fexupl.d", int_mips_fexupl_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+class FEXUPR_W_DESC : MSA_2RF_DESC_BASE<"fexupr.w", int_mips_fexupr_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FEXUPR_D_DESC : MSA_2RF_DESC_BASE<"fexupr.d", int_mips_fexupr_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+class FFINT_S_W_DESC : MSA_2RF_DESC_BASE<"ffint_s.w", sint_to_fp, MSA128WOpnd>;
+class FFINT_S_D_DESC : MSA_2RF_DESC_BASE<"ffint_s.d", sint_to_fp, MSA128DOpnd>;
+
+class FFINT_U_W_DESC : MSA_2RF_DESC_BASE<"ffint_u.w", uint_to_fp, MSA128WOpnd>;
+class FFINT_U_D_DESC : MSA_2RF_DESC_BASE<"ffint_u.d", uint_to_fp, MSA128DOpnd>;
+
+class FFQL_W_DESC : MSA_2RF_DESC_BASE<"ffql.w", int_mips_ffql_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FFQL_D_DESC : MSA_2RF_DESC_BASE<"ffql.d", int_mips_ffql_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+class FFQR_W_DESC : MSA_2RF_DESC_BASE<"ffqr.w", int_mips_ffqr_w,
+ MSA128WOpnd, MSA128HOpnd>;
+class FFQR_D_DESC : MSA_2RF_DESC_BASE<"ffqr.d", int_mips_ffqr_d,
+ MSA128DOpnd, MSA128WOpnd>;
+
+class FILL_B_DESC : MSA_2R_FILL_DESC_BASE<"fill.b", v16i8, vsplati8,
+ MSA128BOpnd, GPR32Opnd>;
+class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
+ MSA128HOpnd, GPR32Opnd>;
+class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
+ MSA128WOpnd, GPR32Opnd>;
+
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
+ FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
+ FGR64>;
+
+class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
+class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
+
+class FMADD_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.w", fma, MSA128WOpnd>;
+class FMADD_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.d", fma, MSA128DOpnd>;
+
+class FMAX_W_DESC : MSA_3RF_DESC_BASE<"fmax.w", int_mips_fmax_w, MSA128WOpnd>;
+class FMAX_D_DESC : MSA_3RF_DESC_BASE<"fmax.d", int_mips_fmax_d, MSA128DOpnd>;
+
+class FMAX_A_W_DESC : MSA_3RF_DESC_BASE<"fmax_a.w", int_mips_fmax_a_w,
+ MSA128WOpnd>;
+class FMAX_A_D_DESC : MSA_3RF_DESC_BASE<"fmax_a.d", int_mips_fmax_a_d,
+ MSA128DOpnd>;
+
+class FMIN_W_DESC : MSA_3RF_DESC_BASE<"fmin.w", int_mips_fmin_w, MSA128WOpnd>;
+class FMIN_D_DESC : MSA_3RF_DESC_BASE<"fmin.d", int_mips_fmin_d, MSA128DOpnd>;
+
+class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
+ MSA128WOpnd>;
+class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
+ MSA128DOpnd>;
+
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+
+class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
+class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
+
+class FRINT_W_DESC : MSA_2RF_DESC_BASE<"frint.w", frint, MSA128WOpnd>;
+class FRINT_D_DESC : MSA_2RF_DESC_BASE<"frint.d", frint, MSA128DOpnd>;
+
+class FRCP_W_DESC : MSA_2RF_DESC_BASE<"frcp.w", int_mips_frcp_w, MSA128WOpnd>;
+class FRCP_D_DESC : MSA_2RF_DESC_BASE<"frcp.d", int_mips_frcp_d, MSA128DOpnd>;
+
+class FRSQRT_W_DESC : MSA_2RF_DESC_BASE<"frsqrt.w", int_mips_frsqrt_w,
+ MSA128WOpnd>;
+class FRSQRT_D_DESC : MSA_2RF_DESC_BASE<"frsqrt.d", int_mips_frsqrt_d,
+ MSA128DOpnd>;
+
+class FSAF_W_DESC : MSA_3RF_DESC_BASE<"fsaf.w", int_mips_fsaf_w, MSA128WOpnd>;
+class FSAF_D_DESC : MSA_3RF_DESC_BASE<"fsaf.d", int_mips_fsaf_d, MSA128DOpnd>;
+
+class FSEQ_W_DESC : MSA_3RF_DESC_BASE<"fseq.w", int_mips_fseq_w, MSA128WOpnd>;
+class FSEQ_D_DESC : MSA_3RF_DESC_BASE<"fseq.d", int_mips_fseq_d, MSA128DOpnd>;
+
+class FSLE_W_DESC : MSA_3RF_DESC_BASE<"fsle.w", int_mips_fsle_w, MSA128WOpnd>;
+class FSLE_D_DESC : MSA_3RF_DESC_BASE<"fsle.d", int_mips_fsle_d, MSA128DOpnd>;
+
+class FSLT_W_DESC : MSA_3RF_DESC_BASE<"fslt.w", int_mips_fslt_w, MSA128WOpnd>;
+class FSLT_D_DESC : MSA_3RF_DESC_BASE<"fslt.d", int_mips_fslt_d, MSA128DOpnd>;
+
+class FSNE_W_DESC : MSA_3RF_DESC_BASE<"fsne.w", int_mips_fsne_w, MSA128WOpnd>;
+class FSNE_D_DESC : MSA_3RF_DESC_BASE<"fsne.d", int_mips_fsne_d, MSA128DOpnd>;
+
+class FSOR_W_DESC : MSA_3RF_DESC_BASE<"fsor.w", int_mips_fsor_w, MSA128WOpnd>;
+class FSOR_D_DESC : MSA_3RF_DESC_BASE<"fsor.d", int_mips_fsor_d, MSA128DOpnd>;
+
+class FSQRT_W_DESC : MSA_2RF_DESC_BASE<"fsqrt.w", fsqrt, MSA128WOpnd>;
+class FSQRT_D_DESC : MSA_2RF_DESC_BASE<"fsqrt.d", fsqrt, MSA128DOpnd>;
+
+class FSUB_W_DESC : MSA_3RF_DESC_BASE<"fsub.w", fsub, MSA128WOpnd>;
+class FSUB_D_DESC : MSA_3RF_DESC_BASE<"fsub.d", fsub, MSA128DOpnd>;
+
+class FSUEQ_W_DESC : MSA_3RF_DESC_BASE<"fsueq.w", int_mips_fsueq_w,
+ MSA128WOpnd>;
+class FSUEQ_D_DESC : MSA_3RF_DESC_BASE<"fsueq.d", int_mips_fsueq_d,
+ MSA128DOpnd>;
+
+class FSULE_W_DESC : MSA_3RF_DESC_BASE<"fsule.w", int_mips_fsule_w,
+ MSA128WOpnd>;
+class FSULE_D_DESC : MSA_3RF_DESC_BASE<"fsule.d", int_mips_fsule_d,
+ MSA128DOpnd>;
+
+class FSULT_W_DESC : MSA_3RF_DESC_BASE<"fsult.w", int_mips_fsult_w,
+ MSA128WOpnd>;
+class FSULT_D_DESC : MSA_3RF_DESC_BASE<"fsult.d", int_mips_fsult_d,
+ MSA128DOpnd>;
+
+class FSUN_W_DESC : MSA_3RF_DESC_BASE<"fsun.w", int_mips_fsun_w,
+ MSA128WOpnd>;
+class FSUN_D_DESC : MSA_3RF_DESC_BASE<"fsun.d", int_mips_fsun_d,
+ MSA128DOpnd>;
+
+class FSUNE_W_DESC : MSA_3RF_DESC_BASE<"fsune.w", int_mips_fsune_w,
+ MSA128WOpnd>;
+class FSUNE_D_DESC : MSA_3RF_DESC_BASE<"fsune.d", int_mips_fsune_d,
+ MSA128DOpnd>;
+
+class FTINT_S_W_DESC : MSA_2RF_DESC_BASE<"ftint_s.w", int_mips_ftint_s_w,
+ MSA128WOpnd>;
+class FTINT_S_D_DESC : MSA_2RF_DESC_BASE<"ftint_s.d", int_mips_ftint_s_d,
+ MSA128DOpnd>;
+
+class FTINT_U_W_DESC : MSA_2RF_DESC_BASE<"ftint_u.w", int_mips_ftint_u_w,
+ MSA128WOpnd>;
+class FTINT_U_D_DESC : MSA_2RF_DESC_BASE<"ftint_u.d", int_mips_ftint_u_d,
+ MSA128DOpnd>;
+
+class FTQ_H_DESC : MSA_3RF_DESC_BASE<"ftq.h", int_mips_ftq_h,
+ MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FTQ_W_DESC : MSA_3RF_DESC_BASE<"ftq.w", int_mips_ftq_w,
+ MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+class FTRUNC_S_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.w", fp_to_sint,
+ MSA128WOpnd>;
+class FTRUNC_S_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.d", fp_to_sint,
+ MSA128DOpnd>;
+
+class FTRUNC_U_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.w", fp_to_uint,
+ MSA128WOpnd>;
+class FTRUNC_U_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.d", fp_to_uint,
+ MSA128DOpnd>;
+
+class HADD_S_H_DESC : MSA_3R_DESC_BASE<"hadd_s.h", int_mips_hadd_s_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_S_W_DESC : MSA_3R_DESC_BASE<"hadd_s.w", int_mips_hadd_s_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_S_D_DESC : MSA_3R_DESC_BASE<"hadd_s.d", int_mips_hadd_s_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HADD_U_H_DESC : MSA_3R_DESC_BASE<"hadd_u.h", int_mips_hadd_u_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_U_W_DESC : MSA_3R_DESC_BASE<"hadd_u.w", int_mips_hadd_u_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_U_D_DESC : MSA_3R_DESC_BASE<"hadd_u.d", int_mips_hadd_u_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_S_H_DESC : MSA_3R_DESC_BASE<"hsub_s.h", int_mips_hsub_s_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_S_W_DESC : MSA_3R_DESC_BASE<"hsub_s.w", int_mips_hsub_s_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_S_D_DESC : MSA_3R_DESC_BASE<"hsub_s.d", int_mips_hsub_s_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_U_H_DESC : MSA_3R_DESC_BASE<"hsub_u.h", int_mips_hsub_u_h,
+ MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_U_W_DESC : MSA_3R_DESC_BASE<"hsub_u.w", int_mips_hsub_u_w,
+ MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_U_D_DESC : MSA_3R_DESC_BASE<"hsub_u.d", int_mips_hsub_u_d,
+ MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class ILVEV_B_DESC : MSA_3R_DESC_BASE<"ilvev.b", MipsILVEV, MSA128BOpnd>;
+class ILVEV_H_DESC : MSA_3R_DESC_BASE<"ilvev.h", MipsILVEV, MSA128HOpnd>;
+class ILVEV_W_DESC : MSA_3R_DESC_BASE<"ilvev.w", MipsILVEV, MSA128WOpnd>;
+class ILVEV_D_DESC : MSA_3R_DESC_BASE<"ilvev.d", MipsILVEV, MSA128DOpnd>;
+
+class ILVL_B_DESC : MSA_3R_DESC_BASE<"ilvl.b", MipsILVL, MSA128BOpnd>;
+class ILVL_H_DESC : MSA_3R_DESC_BASE<"ilvl.h", MipsILVL, MSA128HOpnd>;
+class ILVL_W_DESC : MSA_3R_DESC_BASE<"ilvl.w", MipsILVL, MSA128WOpnd>;
+class ILVL_D_DESC : MSA_3R_DESC_BASE<"ilvl.d", MipsILVL, MSA128DOpnd>;
+
+class ILVOD_B_DESC : MSA_3R_DESC_BASE<"ilvod.b", MipsILVOD, MSA128BOpnd>;
+class ILVOD_H_DESC : MSA_3R_DESC_BASE<"ilvod.h", MipsILVOD, MSA128HOpnd>;
+class ILVOD_W_DESC : MSA_3R_DESC_BASE<"ilvod.w", MipsILVOD, MSA128WOpnd>;
+class ILVOD_D_DESC : MSA_3R_DESC_BASE<"ilvod.d", MipsILVOD, MSA128DOpnd>;
+
+class ILVR_B_DESC : MSA_3R_DESC_BASE<"ilvr.b", MipsILVR, MSA128BOpnd>;
+class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
+class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
+class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
+
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8,
+ MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16,
+ MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
+ MSA128WOpnd, GPR32Opnd>;
+
+class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+ MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+ MSA128DOpnd, FGR64Opnd>;
+
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", int_mips_insve_b,
+ MSA128BOpnd>;
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", int_mips_insve_h,
+ MSA128HOpnd>;
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", int_mips_insve_w,
+ MSA128WOpnd>;
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", int_mips_insve_d,
+ MSA128DOpnd>;
+
+class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ValueType TyNode, RegisterOperand ROWD,
+ Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs ROWD:$wd);
+ dag InOperandList = (ins MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+ list<dag> Pattern = [(set ROWD:$wd, (TyNode (OpNode Addr:$addr)))];
+ InstrItinClass Itinerary = itin;
+ string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd>;
+
+class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
+class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
+class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
+class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+
+class LSA_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, LSAImm:$sa);
+ string AsmString = "lsa\t$rd, $rs, $rt, $sa";
+ list<dag> Pattern = [(set GPR32Opnd:$rd, (add GPR32Opnd:$rs,
+ (shl GPR32Opnd:$rt,
+ immZExt2Lsa:$sa)))];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
+ MSA128HOpnd>;
+class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
+ MSA128WOpnd>;
+
+class MADDR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.h", int_mips_maddr_q_h,
+ MSA128HOpnd>;
+class MADDR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.w", int_mips_maddr_q_w,
+ MSA128WOpnd>;
+
+class MADDV_B_DESC : MSA_3R_4R_DESC_BASE<"maddv.b", muladd, MSA128BOpnd>;
+class MADDV_H_DESC : MSA_3R_4R_DESC_BASE<"maddv.h", muladd, MSA128HOpnd>;
+class MADDV_W_DESC : MSA_3R_4R_DESC_BASE<"maddv.w", muladd, MSA128WOpnd>;
+class MADDV_D_DESC : MSA_3R_4R_DESC_BASE<"maddv.d", muladd, MSA128DOpnd>;
+
+class MAX_A_B_DESC : MSA_3R_DESC_BASE<"max_a.b", int_mips_max_a_b, MSA128BOpnd>;
+class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
+class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
+class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
+
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+ MSA128BOpnd>;
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+ MSA128HOpnd>;
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+ MSA128WOpnd>;
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+ MSA128DOpnd>;
+
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+ MSA128BOpnd>;
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+ MSA128HOpnd>;
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+ MSA128WOpnd>;
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
+class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
+class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
+class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
+
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+ MSA128BOpnd>;
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+ MSA128HOpnd>;
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+ MSA128WOpnd>;
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+ MSA128DOpnd>;
+
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+ MSA128BOpnd>;
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+ MSA128HOpnd>;
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+ MSA128WOpnd>;
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
+class MOD_S_H_DESC : MSA_3R_DESC_BASE<"mod_s.h", srem, MSA128HOpnd>;
+class MOD_S_W_DESC : MSA_3R_DESC_BASE<"mod_s.w", srem, MSA128WOpnd>;
+class MOD_S_D_DESC : MSA_3R_DESC_BASE<"mod_s.d", srem, MSA128DOpnd>;
+
+class MOD_U_B_DESC : MSA_3R_DESC_BASE<"mod_u.b", urem, MSA128BOpnd>;
+class MOD_U_H_DESC : MSA_3R_DESC_BASE<"mod_u.h", urem, MSA128HOpnd>;
+class MOD_U_W_DESC : MSA_3R_DESC_BASE<"mod_u.w", urem, MSA128WOpnd>;
+class MOD_U_D_DESC : MSA_3R_DESC_BASE<"mod_u.d", urem, MSA128DOpnd>;
+
+class MOVE_V_DESC {
+ dag OutOperandList = (outs MSA128BOpnd:$wd);
+ dag InOperandList = (ins MSA128BOpnd:$ws);
+ string AsmString = "move.v\t$wd, $ws";
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = NoItinerary;
+}
+
+class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
+ MSA128HOpnd>;
+class MSUB_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.w", int_mips_msub_q_w,
+ MSA128WOpnd>;
+
+class MSUBR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.h", int_mips_msubr_q_h,
+ MSA128HOpnd>;
+class MSUBR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.w", int_mips_msubr_q_w,
+ MSA128WOpnd>;
+
+class MSUBV_B_DESC : MSA_3R_4R_DESC_BASE<"msubv.b", mulsub, MSA128BOpnd>;
+class MSUBV_H_DESC : MSA_3R_4R_DESC_BASE<"msubv.h", mulsub, MSA128HOpnd>;
+class MSUBV_W_DESC : MSA_3R_4R_DESC_BASE<"msubv.w", mulsub, MSA128WOpnd>;
+class MSUBV_D_DESC : MSA_3R_4R_DESC_BASE<"msubv.d", mulsub, MSA128DOpnd>;
+
+class MUL_Q_H_DESC : MSA_3RF_DESC_BASE<"mul_q.h", int_mips_mul_q_h,
+ MSA128HOpnd>;
+class MUL_Q_W_DESC : MSA_3RF_DESC_BASE<"mul_q.w", int_mips_mul_q_w,
+ MSA128WOpnd>;
+
+class MULR_Q_H_DESC : MSA_3RF_DESC_BASE<"mulr_q.h", int_mips_mulr_q_h,
+ MSA128HOpnd>;
+class MULR_Q_W_DESC : MSA_3RF_DESC_BASE<"mulr_q.w", int_mips_mulr_q_w,
+ MSA128WOpnd>;
+
+class MULV_B_DESC : MSA_3R_DESC_BASE<"mulv.b", mul, MSA128BOpnd>;
+class MULV_H_DESC : MSA_3R_DESC_BASE<"mulv.h", mul, MSA128HOpnd>;
+class MULV_W_DESC : MSA_3R_DESC_BASE<"mulv.w", mul, MSA128WOpnd>;
+class MULV_D_DESC : MSA_3R_DESC_BASE<"mulv.d", mul, MSA128DOpnd>;
+
+class NLOC_B_DESC : MSA_2R_DESC_BASE<"nloc.b", int_mips_nloc_b, MSA128BOpnd>;
+class NLOC_H_DESC : MSA_2R_DESC_BASE<"nloc.h", int_mips_nloc_h, MSA128HOpnd>;
+class NLOC_W_DESC : MSA_2R_DESC_BASE<"nloc.w", int_mips_nloc_w, MSA128WOpnd>;
+class NLOC_D_DESC : MSA_2R_DESC_BASE<"nloc.d", int_mips_nloc_d, MSA128DOpnd>;
+
+class NLZC_B_DESC : MSA_2R_DESC_BASE<"nlzc.b", ctlz, MSA128BOpnd>;
+class NLZC_H_DESC : MSA_2R_DESC_BASE<"nlzc.h", ctlz, MSA128HOpnd>;
+class NLZC_W_DESC : MSA_2R_DESC_BASE<"nlzc.w", ctlz, MSA128WOpnd>;
+class NLZC_D_DESC : MSA_2R_DESC_BASE<"nlzc.d", ctlz, MSA128DOpnd>;
+
+class NOR_V_DESC : MSA_VEC_DESC_BASE<"nor.v", MipsVNOR, MSA128BOpnd>;
+class NOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128HOpnd>;
+class NOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128WOpnd>;
+class NOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128DOpnd>;
+
+class NORI_B_DESC : MSA_I8_DESC_BASE<"nori.b", MipsVNOR, vsplati8_uimm8,
+ MSA128BOpnd>;
+
+class OR_V_DESC : MSA_VEC_DESC_BASE<"or.v", or, MSA128BOpnd>;
+class OR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128HOpnd>;
+class OR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128WOpnd>;
+class OR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128DOpnd>;
+
+class ORI_B_DESC : MSA_I8_DESC_BASE<"ori.b", or, vsplati8_uimm8, MSA128BOpnd>;
+
+class PCKEV_B_DESC : MSA_3R_DESC_BASE<"pckev.b", MipsPCKEV, MSA128BOpnd>;
+class PCKEV_H_DESC : MSA_3R_DESC_BASE<"pckev.h", MipsPCKEV, MSA128HOpnd>;
+class PCKEV_W_DESC : MSA_3R_DESC_BASE<"pckev.w", MipsPCKEV, MSA128WOpnd>;
+class PCKEV_D_DESC : MSA_3R_DESC_BASE<"pckev.d", MipsPCKEV, MSA128DOpnd>;
+
+class PCKOD_B_DESC : MSA_3R_DESC_BASE<"pckod.b", MipsPCKOD, MSA128BOpnd>;
+class PCKOD_H_DESC : MSA_3R_DESC_BASE<"pckod.h", MipsPCKOD, MSA128HOpnd>;
+class PCKOD_W_DESC : MSA_3R_DESC_BASE<"pckod.w", MipsPCKOD, MSA128WOpnd>;
+class PCKOD_D_DESC : MSA_3R_DESC_BASE<"pckod.d", MipsPCKOD, MSA128DOpnd>;
+
+class PCNT_B_DESC : MSA_2R_DESC_BASE<"pcnt.b", ctpop, MSA128BOpnd>;
+class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
+class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
+class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
+
+class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b,
+ MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h,
+ MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w,
+ MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d,
+ MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b,
+ MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h,
+ MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w,
+ MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d,
+ MSA128DOpnd>;
+
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128WOpnd>;
+
+class SLD_B_DESC : MSA_3R_SLD_DESC_BASE<"sld.b", int_mips_sld_b, MSA128BOpnd>;
+class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
+class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
+class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
+
+class SLDI_B_DESC : MSA_ELM_DESC_BASE<"sldi.b", int_mips_sldi_b, MSA128BOpnd>;
+class SLDI_H_DESC : MSA_ELM_DESC_BASE<"sldi.h", int_mips_sldi_h, MSA128HOpnd>;
+class SLDI_W_DESC : MSA_ELM_DESC_BASE<"sldi.w", int_mips_sldi_w, MSA128WOpnd>;
+class SLDI_D_DESC : MSA_ELM_DESC_BASE<"sldi.d", int_mips_sldi_d, MSA128DOpnd>;
+
+class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
+class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
+class SLL_W_DESC : MSA_3R_DESC_BASE<"sll.w", shl, MSA128WOpnd>;
+class SLL_D_DESC : MSA_3R_DESC_BASE<"sll.d", shl, MSA128DOpnd>;
+
+class SLLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.b", shl, vsplati8_uimm3,
+ MSA128BOpnd>;
+class SLLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.h", shl, vsplati16_uimm4,
+ MSA128HOpnd>;
+class SLLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.w", shl, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SLLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.d", shl, vsplati64_uimm6,
+ MSA128DOpnd>;
+
+class SPLAT_B_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.b", vsplati8_elt,
+ MSA128BOpnd>;
+class SPLAT_H_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.h", vsplati16_elt,
+ MSA128HOpnd>;
+class SPLAT_W_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.w", vsplati32_elt,
+ MSA128WOpnd>;
+class SPLAT_D_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.d", vsplati64_elt,
+ MSA128DOpnd>;
+
+class SPLATI_B_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.b", vsplati8_uimm4,
+ MSA128BOpnd>;
+class SPLATI_H_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.h", vsplati16_uimm3,
+ MSA128HOpnd>;
+class SPLATI_W_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.w", vsplati32_uimm2,
+ MSA128WOpnd>;
+class SPLATI_D_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.d", vsplati64_uimm1,
+ MSA128DOpnd>;
+
+class SRA_B_DESC : MSA_3R_DESC_BASE<"sra.b", sra, MSA128BOpnd>;
+class SRA_H_DESC : MSA_3R_DESC_BASE<"sra.h", sra, MSA128HOpnd>;
+class SRA_W_DESC : MSA_3R_DESC_BASE<"sra.w", sra, MSA128WOpnd>;
+class SRA_D_DESC : MSA_3R_DESC_BASE<"sra.d", sra, MSA128DOpnd>;
+
+class SRAI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.b", sra, vsplati8_uimm3,
+ MSA128BOpnd>;
+class SRAI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.h", sra, vsplati16_uimm4,
+ MSA128HOpnd>;
+class SRAI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.w", sra, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SRAI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.d", sra, vsplati64_uimm6,
+ MSA128DOpnd>;
+
+class SRAR_B_DESC : MSA_3R_DESC_BASE<"srar.b", int_mips_srar_b, MSA128BOpnd>;
+class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
+class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
+class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
+
+class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b,
+ MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h,
+ MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w,
+ MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d,
+ MSA128DOpnd>;
+
+class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
+class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
+class SRL_W_DESC : MSA_3R_DESC_BASE<"srl.w", srl, MSA128WOpnd>;
+class SRL_D_DESC : MSA_3R_DESC_BASE<"srl.d", srl, MSA128DOpnd>;
+
+class SRLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.b", srl, vsplati8_uimm3,
+ MSA128BOpnd>;
+class SRLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.h", srl, vsplati16_uimm4,
+ MSA128HOpnd>;
+class SRLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.w", srl, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SRLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.d", srl, vsplati64_uimm6,
+ MSA128DOpnd>;
+
+class SRLR_B_DESC : MSA_3R_DESC_BASE<"srlr.b", int_mips_srlr_b, MSA128BOpnd>;
+class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
+class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
+class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
+
+class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b,
+ MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h,
+ MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w,
+ MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d,
+ MSA128DOpnd>;
+
+class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ValueType TyNode, RegisterOperand ROWD,
+ Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+ InstrItinClass itin = NoItinerary> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
+ string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+ list<dag> Pattern = [(OpNode (TyNode ROWD:$wd), Addr:$addr)];
+ InstrItinClass Itinerary = itin;
+ string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd>;
+
+class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
+ MSA128BOpnd>;
+class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
+ MSA128HOpnd>;
+class SUBS_S_W_DESC : MSA_3R_DESC_BASE<"subs_s.w", int_mips_subs_s_w,
+ MSA128WOpnd>;
+class SUBS_S_D_DESC : MSA_3R_DESC_BASE<"subs_s.d", int_mips_subs_s_d,
+ MSA128DOpnd>;
+
+class SUBS_U_B_DESC : MSA_3R_DESC_BASE<"subs_u.b", int_mips_subs_u_b,
+ MSA128BOpnd>;
+class SUBS_U_H_DESC : MSA_3R_DESC_BASE<"subs_u.h", int_mips_subs_u_h,
+ MSA128HOpnd>;
+class SUBS_U_W_DESC : MSA_3R_DESC_BASE<"subs_u.w", int_mips_subs_u_w,
+ MSA128WOpnd>;
+class SUBS_U_D_DESC : MSA_3R_DESC_BASE<"subs_u.d", int_mips_subs_u_d,
+ MSA128DOpnd>;
+
+class SUBSUS_U_B_DESC : MSA_3R_DESC_BASE<"subsus_u.b", int_mips_subsus_u_b,
+ MSA128BOpnd>;
+class SUBSUS_U_H_DESC : MSA_3R_DESC_BASE<"subsus_u.h", int_mips_subsus_u_h,
+ MSA128HOpnd>;
+class SUBSUS_U_W_DESC : MSA_3R_DESC_BASE<"subsus_u.w", int_mips_subsus_u_w,
+ MSA128WOpnd>;
+class SUBSUS_U_D_DESC : MSA_3R_DESC_BASE<"subsus_u.d", int_mips_subsus_u_d,
+ MSA128DOpnd>;
+
+class SUBSUU_S_B_DESC : MSA_3R_DESC_BASE<"subsuu_s.b", int_mips_subsuu_s_b,
+ MSA128BOpnd>;
+class SUBSUU_S_H_DESC : MSA_3R_DESC_BASE<"subsuu_s.h", int_mips_subsuu_s_h,
+ MSA128HOpnd>;
+class SUBSUU_S_W_DESC : MSA_3R_DESC_BASE<"subsuu_s.w", int_mips_subsuu_s_w,
+ MSA128WOpnd>;
+class SUBSUU_S_D_DESC : MSA_3R_DESC_BASE<"subsuu_s.d", int_mips_subsuu_s_d,
+ MSA128DOpnd>;
+
+class SUBV_B_DESC : MSA_3R_DESC_BASE<"subv.b", sub, MSA128BOpnd>;
+class SUBV_H_DESC : MSA_3R_DESC_BASE<"subv.h", sub, MSA128HOpnd>;
+class SUBV_W_DESC : MSA_3R_DESC_BASE<"subv.w", sub, MSA128WOpnd>;
+class SUBV_D_DESC : MSA_3R_DESC_BASE<"subv.d", sub, MSA128DOpnd>;
+
+class SUBVI_B_DESC : MSA_I5_DESC_BASE<"subvi.b", sub, vsplati8_uimm5,
+ MSA128BOpnd>;
+class SUBVI_H_DESC : MSA_I5_DESC_BASE<"subvi.h", sub, vsplati16_uimm5,
+ MSA128HOpnd>;
+class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5,
+ MSA128WOpnd>;
+class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5,
+ MSA128DOpnd>;
+
+class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128BOpnd>;
+class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128HOpnd>;
+class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128WOpnd>;
+class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128DOpnd>;
+
+class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128BOpnd>;
+class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128HOpnd>;
+class XOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128WOpnd>;
+class XOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128DOpnd>;
+
+class XORI_B_DESC : MSA_I8_DESC_BASE<"xori.b", xor, vsplati8_uimm8,
+ MSA128BOpnd>;
+
+// Instruction defs.
+def ADD_A_B : ADD_A_B_ENC, ADD_A_B_DESC;
+def ADD_A_H : ADD_A_H_ENC, ADD_A_H_DESC;
+def ADD_A_W : ADD_A_W_ENC, ADD_A_W_DESC;
+def ADD_A_D : ADD_A_D_ENC, ADD_A_D_DESC;
+
+def ADDS_A_B : ADDS_A_B_ENC, ADDS_A_B_DESC;
+def ADDS_A_H : ADDS_A_H_ENC, ADDS_A_H_DESC;
+def ADDS_A_W : ADDS_A_W_ENC, ADDS_A_W_DESC;
+def ADDS_A_D : ADDS_A_D_ENC, ADDS_A_D_DESC;
+
+def ADDS_S_B : ADDS_S_B_ENC, ADDS_S_B_DESC;
+def ADDS_S_H : ADDS_S_H_ENC, ADDS_S_H_DESC;
+def ADDS_S_W : ADDS_S_W_ENC, ADDS_S_W_DESC;
+def ADDS_S_D : ADDS_S_D_ENC, ADDS_S_D_DESC;
+
+def ADDS_U_B : ADDS_U_B_ENC, ADDS_U_B_DESC;
+def ADDS_U_H : ADDS_U_H_ENC, ADDS_U_H_DESC;
+def ADDS_U_W : ADDS_U_W_ENC, ADDS_U_W_DESC;
+def ADDS_U_D : ADDS_U_D_ENC, ADDS_U_D_DESC;
+
+def ADDV_B : ADDV_B_ENC, ADDV_B_DESC;
+def ADDV_H : ADDV_H_ENC, ADDV_H_DESC;
+def ADDV_W : ADDV_W_ENC, ADDV_W_DESC;
+def ADDV_D : ADDV_D_ENC, ADDV_D_DESC;
+
+def ADDVI_B : ADDVI_B_ENC, ADDVI_B_DESC;
+def ADDVI_H : ADDVI_H_ENC, ADDVI_H_DESC;
+def ADDVI_W : ADDVI_W_ENC, ADDVI_W_DESC;
+def ADDVI_D : ADDVI_D_ENC, ADDVI_D_DESC;
+
+def AND_V : AND_V_ENC, AND_V_DESC;
+def AND_V_H_PSEUDO : AND_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def AND_V_W_PSEUDO : AND_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def AND_V_D_PSEUDO : AND_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def ANDI_B : ANDI_B_ENC, ANDI_B_DESC;
+
+def ASUB_S_B : ASUB_S_B_ENC, ASUB_S_B_DESC;
+def ASUB_S_H : ASUB_S_H_ENC, ASUB_S_H_DESC;
+def ASUB_S_W : ASUB_S_W_ENC, ASUB_S_W_DESC;
+def ASUB_S_D : ASUB_S_D_ENC, ASUB_S_D_DESC;
+
+def ASUB_U_B : ASUB_U_B_ENC, ASUB_U_B_DESC;
+def ASUB_U_H : ASUB_U_H_ENC, ASUB_U_H_DESC;
+def ASUB_U_W : ASUB_U_W_ENC, ASUB_U_W_DESC;
+def ASUB_U_D : ASUB_U_D_ENC, ASUB_U_D_DESC;
+
+def AVE_S_B : AVE_S_B_ENC, AVE_S_B_DESC;
+def AVE_S_H : AVE_S_H_ENC, AVE_S_H_DESC;
+def AVE_S_W : AVE_S_W_ENC, AVE_S_W_DESC;
+def AVE_S_D : AVE_S_D_ENC, AVE_S_D_DESC;
+
+def AVE_U_B : AVE_U_B_ENC, AVE_U_B_DESC;
+def AVE_U_H : AVE_U_H_ENC, AVE_U_H_DESC;
+def AVE_U_W : AVE_U_W_ENC, AVE_U_W_DESC;
+def AVE_U_D : AVE_U_D_ENC, AVE_U_D_DESC;
+
+def AVER_S_B : AVER_S_B_ENC, AVER_S_B_DESC;
+def AVER_S_H : AVER_S_H_ENC, AVER_S_H_DESC;
+def AVER_S_W : AVER_S_W_ENC, AVER_S_W_DESC;
+def AVER_S_D : AVER_S_D_ENC, AVER_S_D_DESC;
+
+def AVER_U_B : AVER_U_B_ENC, AVER_U_B_DESC;
+def AVER_U_H : AVER_U_H_ENC, AVER_U_H_DESC;
+def AVER_U_W : AVER_U_W_ENC, AVER_U_W_DESC;
+def AVER_U_D : AVER_U_D_ENC, AVER_U_D_DESC;
+
+def BCLR_B : BCLR_B_ENC, BCLR_B_DESC;
+def BCLR_H : BCLR_H_ENC, BCLR_H_DESC;
+def BCLR_W : BCLR_W_ENC, BCLR_W_DESC;
+def BCLR_D : BCLR_D_ENC, BCLR_D_DESC;
+
+def BCLRI_B : BCLRI_B_ENC, BCLRI_B_DESC;
+def BCLRI_H : BCLRI_H_ENC, BCLRI_H_DESC;
+def BCLRI_W : BCLRI_W_ENC, BCLRI_W_DESC;
+def BCLRI_D : BCLRI_D_ENC, BCLRI_D_DESC;
+
+def BINSL_B : BINSL_B_ENC, BINSL_B_DESC;
+def BINSL_H : BINSL_H_ENC, BINSL_H_DESC;
+def BINSL_W : BINSL_W_ENC, BINSL_W_DESC;
+def BINSL_D : BINSL_D_ENC, BINSL_D_DESC;
+
+def BINSLI_B : BINSLI_B_ENC, BINSLI_B_DESC;
+def BINSLI_H : BINSLI_H_ENC, BINSLI_H_DESC;
+def BINSLI_W : BINSLI_W_ENC, BINSLI_W_DESC;
+def BINSLI_D : BINSLI_D_ENC, BINSLI_D_DESC;
+
+def BINSR_B : BINSR_B_ENC, BINSR_B_DESC;
+def BINSR_H : BINSR_H_ENC, BINSR_H_DESC;
+def BINSR_W : BINSR_W_ENC, BINSR_W_DESC;
+def BINSR_D : BINSR_D_ENC, BINSR_D_DESC;
+
+def BINSRI_B : BINSRI_B_ENC, BINSRI_B_DESC;
+def BINSRI_H : BINSRI_H_ENC, BINSRI_H_DESC;
+def BINSRI_W : BINSRI_W_ENC, BINSRI_W_DESC;
+def BINSRI_D : BINSRI_D_ENC, BINSRI_D_DESC;
+
+def BMNZ_V : BMNZ_V_ENC, BMNZ_V_DESC;
+
+def BMNZI_B : BMNZI_B_ENC, BMNZI_B_DESC;
+
+def BMZ_V : BMZ_V_ENC, BMZ_V_DESC;
+
+def BMZI_B : BMZI_B_ENC, BMZI_B_DESC;
+
+def BNEG_B : BNEG_B_ENC, BNEG_B_DESC;
+def BNEG_H : BNEG_H_ENC, BNEG_H_DESC;
+def BNEG_W : BNEG_W_ENC, BNEG_W_DESC;
+def BNEG_D : BNEG_D_ENC, BNEG_D_DESC;
+
+def BNEGI_B : BNEGI_B_ENC, BNEGI_B_DESC;
+def BNEGI_H : BNEGI_H_ENC, BNEGI_H_DESC;
+def BNEGI_W : BNEGI_W_ENC, BNEGI_W_DESC;
+def BNEGI_D : BNEGI_D_ENC, BNEGI_D_DESC;
+
+def BNZ_B : BNZ_B_ENC, BNZ_B_DESC;
+def BNZ_H : BNZ_H_ENC, BNZ_H_DESC;
+def BNZ_W : BNZ_W_ENC, BNZ_W_DESC;
+def BNZ_D : BNZ_D_ENC, BNZ_D_DESC;
+
+def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
+
+def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
+
+class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
+ MipsPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+ [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$ws, RO:$wt)))]>,
+ PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
+ MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
+ let Constraints = "$wd_in = $wd";
+}
+
+def BSEL_H_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128HOpnd, v8i16>;
+def BSEL_W_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4i32>;
+def BSEL_D_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2i64>;
+def BSEL_FW_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4f32>;
+def BSEL_FD_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2f64>;
+
+def BSELI_B : BSELI_B_ENC, BSELI_B_DESC;
+
+def BSET_B : BSET_B_ENC, BSET_B_DESC;
+def BSET_H : BSET_H_ENC, BSET_H_DESC;
+def BSET_W : BSET_W_ENC, BSET_W_DESC;
+def BSET_D : BSET_D_ENC, BSET_D_DESC;
+
+def BSETI_B : BSETI_B_ENC, BSETI_B_DESC;
+def BSETI_H : BSETI_H_ENC, BSETI_H_DESC;
+def BSETI_W : BSETI_W_ENC, BSETI_W_DESC;
+def BSETI_D : BSETI_D_ENC, BSETI_D_DESC;
+
+def BZ_B : BZ_B_ENC, BZ_B_DESC;
+def BZ_H : BZ_H_ENC, BZ_H_DESC;
+def BZ_W : BZ_W_ENC, BZ_W_DESC;
+def BZ_D : BZ_D_ENC, BZ_D_DESC;
+
+def BZ_V : BZ_V_ENC, BZ_V_DESC;
+
+def CEQ_B : CEQ_B_ENC, CEQ_B_DESC;
+def CEQ_H : CEQ_H_ENC, CEQ_H_DESC;
+def CEQ_W : CEQ_W_ENC, CEQ_W_DESC;
+def CEQ_D : CEQ_D_ENC, CEQ_D_DESC;
+
+def CEQI_B : CEQI_B_ENC, CEQI_B_DESC;
+def CEQI_H : CEQI_H_ENC, CEQI_H_DESC;
+def CEQI_W : CEQI_W_ENC, CEQI_W_DESC;
+def CEQI_D : CEQI_D_ENC, CEQI_D_DESC;
+
+def CFCMSA : CFCMSA_ENC, CFCMSA_DESC;
+
+def CLE_S_B : CLE_S_B_ENC, CLE_S_B_DESC;
+def CLE_S_H : CLE_S_H_ENC, CLE_S_H_DESC;
+def CLE_S_W : CLE_S_W_ENC, CLE_S_W_DESC;
+def CLE_S_D : CLE_S_D_ENC, CLE_S_D_DESC;
+
+def CLE_U_B : CLE_U_B_ENC, CLE_U_B_DESC;
+def CLE_U_H : CLE_U_H_ENC, CLE_U_H_DESC;
+def CLE_U_W : CLE_U_W_ENC, CLE_U_W_DESC;
+def CLE_U_D : CLE_U_D_ENC, CLE_U_D_DESC;
+
+def CLEI_S_B : CLEI_S_B_ENC, CLEI_S_B_DESC;
+def CLEI_S_H : CLEI_S_H_ENC, CLEI_S_H_DESC;
+def CLEI_S_W : CLEI_S_W_ENC, CLEI_S_W_DESC;
+def CLEI_S_D : CLEI_S_D_ENC, CLEI_S_D_DESC;
+
+def CLEI_U_B : CLEI_U_B_ENC, CLEI_U_B_DESC;
+def CLEI_U_H : CLEI_U_H_ENC, CLEI_U_H_DESC;
+def CLEI_U_W : CLEI_U_W_ENC, CLEI_U_W_DESC;
+def CLEI_U_D : CLEI_U_D_ENC, CLEI_U_D_DESC;
+
+def CLT_S_B : CLT_S_B_ENC, CLT_S_B_DESC;
+def CLT_S_H : CLT_S_H_ENC, CLT_S_H_DESC;
+def CLT_S_W : CLT_S_W_ENC, CLT_S_W_DESC;
+def CLT_S_D : CLT_S_D_ENC, CLT_S_D_DESC;
+
+def CLT_U_B : CLT_U_B_ENC, CLT_U_B_DESC;
+def CLT_U_H : CLT_U_H_ENC, CLT_U_H_DESC;
+def CLT_U_W : CLT_U_W_ENC, CLT_U_W_DESC;
+def CLT_U_D : CLT_U_D_ENC, CLT_U_D_DESC;
+
+def CLTI_S_B : CLTI_S_B_ENC, CLTI_S_B_DESC;
+def CLTI_S_H : CLTI_S_H_ENC, CLTI_S_H_DESC;
+def CLTI_S_W : CLTI_S_W_ENC, CLTI_S_W_DESC;
+def CLTI_S_D : CLTI_S_D_ENC, CLTI_S_D_DESC;
+
+def CLTI_U_B : CLTI_U_B_ENC, CLTI_U_B_DESC;
+def CLTI_U_H : CLTI_U_H_ENC, CLTI_U_H_DESC;
+def CLTI_U_W : CLTI_U_W_ENC, CLTI_U_W_DESC;
+def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
+
+def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
+def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
+def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+
+def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
+def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
+
+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
+
+def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
+
+def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
+def DIV_S_H : DIV_S_H_ENC, DIV_S_H_DESC;
+def DIV_S_W : DIV_S_W_ENC, DIV_S_W_DESC;
+def DIV_S_D : DIV_S_D_ENC, DIV_S_D_DESC;
+
+def DIV_U_B : DIV_U_B_ENC, DIV_U_B_DESC;
+def DIV_U_H : DIV_U_H_ENC, DIV_U_H_DESC;
+def DIV_U_W : DIV_U_W_ENC, DIV_U_W_DESC;
+def DIV_U_D : DIV_U_D_ENC, DIV_U_D_DESC;
+
+def DOTP_S_H : DOTP_S_H_ENC, DOTP_S_H_DESC;
+def DOTP_S_W : DOTP_S_W_ENC, DOTP_S_W_DESC;
+def DOTP_S_D : DOTP_S_D_ENC, DOTP_S_D_DESC;
+
+def DOTP_U_H : DOTP_U_H_ENC, DOTP_U_H_DESC;
+def DOTP_U_W : DOTP_U_W_ENC, DOTP_U_W_DESC;
+def DOTP_U_D : DOTP_U_D_ENC, DOTP_U_D_DESC;
+
+def DPADD_S_H : DPADD_S_H_ENC, DPADD_S_H_DESC;
+def DPADD_S_W : DPADD_S_W_ENC, DPADD_S_W_DESC;
+def DPADD_S_D : DPADD_S_D_ENC, DPADD_S_D_DESC;
+
+def DPADD_U_H : DPADD_U_H_ENC, DPADD_U_H_DESC;
+def DPADD_U_W : DPADD_U_W_ENC, DPADD_U_W_DESC;
+def DPADD_U_D : DPADD_U_D_ENC, DPADD_U_D_DESC;
+
+def DPSUB_S_H : DPSUB_S_H_ENC, DPSUB_S_H_DESC;
+def DPSUB_S_W : DPSUB_S_W_ENC, DPSUB_S_W_DESC;
+def DPSUB_S_D : DPSUB_S_D_ENC, DPSUB_S_D_DESC;
+
+def DPSUB_U_H : DPSUB_U_H_ENC, DPSUB_U_H_DESC;
+def DPSUB_U_W : DPSUB_U_W_ENC, DPSUB_U_W_DESC;
+def DPSUB_U_D : DPSUB_U_D_ENC, DPSUB_U_D_DESC;
+
+def FADD_W : FADD_W_ENC, FADD_W_DESC;
+def FADD_D : FADD_D_ENC, FADD_D_DESC;
+
+def FCAF_W : FCAF_W_ENC, FCAF_W_DESC;
+def FCAF_D : FCAF_D_ENC, FCAF_D_DESC;
+
+def FCEQ_W : FCEQ_W_ENC, FCEQ_W_DESC;
+def FCEQ_D : FCEQ_D_ENC, FCEQ_D_DESC;
+
+def FCLE_W : FCLE_W_ENC, FCLE_W_DESC;
+def FCLE_D : FCLE_D_ENC, FCLE_D_DESC;
+
+def FCLT_W : FCLT_W_ENC, FCLT_W_DESC;
+def FCLT_D : FCLT_D_ENC, FCLT_D_DESC;
+
+def FCLASS_W : FCLASS_W_ENC, FCLASS_W_DESC;
+def FCLASS_D : FCLASS_D_ENC, FCLASS_D_DESC;
+
+def FCNE_W : FCNE_W_ENC, FCNE_W_DESC;
+def FCNE_D : FCNE_D_ENC, FCNE_D_DESC;
+
+def FCOR_W : FCOR_W_ENC, FCOR_W_DESC;
+def FCOR_D : FCOR_D_ENC, FCOR_D_DESC;
+
+def FCUEQ_W : FCUEQ_W_ENC, FCUEQ_W_DESC;
+def FCUEQ_D : FCUEQ_D_ENC, FCUEQ_D_DESC;
+
+def FCULE_W : FCULE_W_ENC, FCULE_W_DESC;
+def FCULE_D : FCULE_D_ENC, FCULE_D_DESC;
+
+def FCULT_W : FCULT_W_ENC, FCULT_W_DESC;
+def FCULT_D : FCULT_D_ENC, FCULT_D_DESC;
+
+def FCUN_W : FCUN_W_ENC, FCUN_W_DESC;
+def FCUN_D : FCUN_D_ENC, FCUN_D_DESC;
+
+def FCUNE_W : FCUNE_W_ENC, FCUNE_W_DESC;
+def FCUNE_D : FCUNE_D_ENC, FCUNE_D_DESC;
+
+def FDIV_W : FDIV_W_ENC, FDIV_W_DESC;
+def FDIV_D : FDIV_D_ENC, FDIV_D_DESC;
+
+def FEXDO_H : FEXDO_H_ENC, FEXDO_H_DESC;
+def FEXDO_W : FEXDO_W_ENC, FEXDO_W_DESC;
+
+def FEXP2_W : FEXP2_W_ENC, FEXP2_W_DESC;
+def FEXP2_D : FEXP2_D_ENC, FEXP2_D_DESC;
+def FEXP2_W_1_PSEUDO : FEXP2_W_1_PSEUDO_DESC;
+def FEXP2_D_1_PSEUDO : FEXP2_D_1_PSEUDO_DESC;
+
+def FEXUPL_W : FEXUPL_W_ENC, FEXUPL_W_DESC;
+def FEXUPL_D : FEXUPL_D_ENC, FEXUPL_D_DESC;
+
+def FEXUPR_W : FEXUPR_W_ENC, FEXUPR_W_DESC;
+def FEXUPR_D : FEXUPR_D_ENC, FEXUPR_D_DESC;
+
+def FFINT_S_W : FFINT_S_W_ENC, FFINT_S_W_DESC;
+def FFINT_S_D : FFINT_S_D_ENC, FFINT_S_D_DESC;
+
+def FFINT_U_W : FFINT_U_W_ENC, FFINT_U_W_DESC;
+def FFINT_U_D : FFINT_U_D_ENC, FFINT_U_D_DESC;
+
+def FFQL_W : FFQL_W_ENC, FFQL_W_DESC;
+def FFQL_D : FFQL_D_ENC, FFQL_D_DESC;
+
+def FFQR_W : FFQR_W_ENC, FFQR_W_DESC;
+def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
+
+def FILL_B : FILL_B_ENC, FILL_B_DESC;
+def FILL_H : FILL_H_ENC, FILL_H_DESC;
+def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
+def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
+
+def FLOG2_W : FLOG2_W_ENC, FLOG2_W_DESC;
+def FLOG2_D : FLOG2_D_ENC, FLOG2_D_DESC;
+
+def FMADD_W : FMADD_W_ENC, FMADD_W_DESC;
+def FMADD_D : FMADD_D_ENC, FMADD_D_DESC;
+
+def FMAX_W : FMAX_W_ENC, FMAX_W_DESC;
+def FMAX_D : FMAX_D_ENC, FMAX_D_DESC;
+
+def FMAX_A_W : FMAX_A_W_ENC, FMAX_A_W_DESC;
+def FMAX_A_D : FMAX_A_D_ENC, FMAX_A_D_DESC;
+
+def FMIN_W : FMIN_W_ENC, FMIN_W_DESC;
+def FMIN_D : FMIN_D_ENC, FMIN_D_DESC;
+
+def FMIN_A_W : FMIN_A_W_ENC, FMIN_A_W_DESC;
+def FMIN_A_D : FMIN_A_D_ENC, FMIN_A_D_DESC;
+
+def FMSUB_W : FMSUB_W_ENC, FMSUB_W_DESC;
+def FMSUB_D : FMSUB_D_ENC, FMSUB_D_DESC;
+
+def FMUL_W : FMUL_W_ENC, FMUL_W_DESC;
+def FMUL_D : FMUL_D_ENC, FMUL_D_DESC;
+
+def FRINT_W : FRINT_W_ENC, FRINT_W_DESC;
+def FRINT_D : FRINT_D_ENC, FRINT_D_DESC;
+
+def FRCP_W : FRCP_W_ENC, FRCP_W_DESC;
+def FRCP_D : FRCP_D_ENC, FRCP_D_DESC;
+
+def FRSQRT_W : FRSQRT_W_ENC, FRSQRT_W_DESC;
+def FRSQRT_D : FRSQRT_D_ENC, FRSQRT_D_DESC;
+
+def FSAF_W : FSAF_W_ENC, FSAF_W_DESC;
+def FSAF_D : FSAF_D_ENC, FSAF_D_DESC;
+
+def FSEQ_W : FSEQ_W_ENC, FSEQ_W_DESC;
+def FSEQ_D : FSEQ_D_ENC, FSEQ_D_DESC;
+
+def FSLE_W : FSLE_W_ENC, FSLE_W_DESC;
+def FSLE_D : FSLE_D_ENC, FSLE_D_DESC;
+
+def FSLT_W : FSLT_W_ENC, FSLT_W_DESC;
+def FSLT_D : FSLT_D_ENC, FSLT_D_DESC;
+
+def FSNE_W : FSNE_W_ENC, FSNE_W_DESC;
+def FSNE_D : FSNE_D_ENC, FSNE_D_DESC;
+
+def FSOR_W : FSOR_W_ENC, FSOR_W_DESC;
+def FSOR_D : FSOR_D_ENC, FSOR_D_DESC;
+
+def FSQRT_W : FSQRT_W_ENC, FSQRT_W_DESC;
+def FSQRT_D : FSQRT_D_ENC, FSQRT_D_DESC;
+
+def FSUB_W : FSUB_W_ENC, FSUB_W_DESC;
+def FSUB_D : FSUB_D_ENC, FSUB_D_DESC;
+
+def FSUEQ_W : FSUEQ_W_ENC, FSUEQ_W_DESC;
+def FSUEQ_D : FSUEQ_D_ENC, FSUEQ_D_DESC;
+
+def FSULE_W : FSULE_W_ENC, FSULE_W_DESC;
+def FSULE_D : FSULE_D_ENC, FSULE_D_DESC;
+
+def FSULT_W : FSULT_W_ENC, FSULT_W_DESC;
+def FSULT_D : FSULT_D_ENC, FSULT_D_DESC;
+
+def FSUN_W : FSUN_W_ENC, FSUN_W_DESC;
+def FSUN_D : FSUN_D_ENC, FSUN_D_DESC;
+
+def FSUNE_W : FSUNE_W_ENC, FSUNE_W_DESC;
+def FSUNE_D : FSUNE_D_ENC, FSUNE_D_DESC;
+
+def FTINT_S_W : FTINT_S_W_ENC, FTINT_S_W_DESC;
+def FTINT_S_D : FTINT_S_D_ENC, FTINT_S_D_DESC;
+
+def FTINT_U_W : FTINT_U_W_ENC, FTINT_U_W_DESC;
+def FTINT_U_D : FTINT_U_D_ENC, FTINT_U_D_DESC;
+
+def FTQ_H : FTQ_H_ENC, FTQ_H_DESC;
+def FTQ_W : FTQ_W_ENC, FTQ_W_DESC;
+
+def FTRUNC_S_W : FTRUNC_S_W_ENC, FTRUNC_S_W_DESC;
+def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
+
+def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
+def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+
+def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
+def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
+def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
+
+def HADD_U_H : HADD_U_H_ENC, HADD_U_H_DESC;
+def HADD_U_W : HADD_U_W_ENC, HADD_U_W_DESC;
+def HADD_U_D : HADD_U_D_ENC, HADD_U_D_DESC;
+
+def HSUB_S_H : HSUB_S_H_ENC, HSUB_S_H_DESC;
+def HSUB_S_W : HSUB_S_W_ENC, HSUB_S_W_DESC;
+def HSUB_S_D : HSUB_S_D_ENC, HSUB_S_D_DESC;
+
+def HSUB_U_H : HSUB_U_H_ENC, HSUB_U_H_DESC;
+def HSUB_U_W : HSUB_U_W_ENC, HSUB_U_W_DESC;
+def HSUB_U_D : HSUB_U_D_ENC, HSUB_U_D_DESC;
+
+def ILVEV_B : ILVEV_B_ENC, ILVEV_B_DESC;
+def ILVEV_H : ILVEV_H_ENC, ILVEV_H_DESC;
+def ILVEV_W : ILVEV_W_ENC, ILVEV_W_DESC;
+def ILVEV_D : ILVEV_D_ENC, ILVEV_D_DESC;
+
+def ILVL_B : ILVL_B_ENC, ILVL_B_DESC;
+def ILVL_H : ILVL_H_ENC, ILVL_H_DESC;
+def ILVL_W : ILVL_W_ENC, ILVL_W_DESC;
+def ILVL_D : ILVL_D_ENC, ILVL_D_DESC;
+
+def ILVOD_B : ILVOD_B_ENC, ILVOD_B_DESC;
+def ILVOD_H : ILVOD_H_ENC, ILVOD_H_DESC;
+def ILVOD_W : ILVOD_W_ENC, ILVOD_W_DESC;
+def ILVOD_D : ILVOD_D_ENC, ILVOD_D_DESC;
+
+def ILVR_B : ILVR_B_ENC, ILVR_B_DESC;
+def ILVR_H : ILVR_H_ENC, ILVR_H_DESC;
+def ILVR_W : ILVR_W_ENC, ILVR_W_DESC;
+def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
+
+def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
+def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
+def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+
+// INSERT_FW_PSEUDO defined after INSVE_W
+// INSERT_FD_PSEUDO defined after INSVE_D
+
+def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+
+def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
+def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
+
+def LD_B : LD_B_ENC, LD_B_DESC;
+def LD_H : LD_H_ENC, LD_H_DESC;
+def LD_W : LD_W_ENC, LD_W_DESC;
+def LD_D : LD_D_ENC, LD_D_DESC;
+
+def LDI_B : LDI_B_ENC, LDI_B_DESC;
+def LDI_H : LDI_H_ENC, LDI_H_DESC;
+def LDI_W : LDI_W_ENC, LDI_W_DESC;
+def LDI_D : LDI_D_ENC, LDI_D_DESC;
+
+def LSA : LSA_ENC, LSA_DESC;
+
+def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
+def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
+
+def MADDR_Q_H : MADDR_Q_H_ENC, MADDR_Q_H_DESC;
+def MADDR_Q_W : MADDR_Q_W_ENC, MADDR_Q_W_DESC;
+
+def MADDV_B : MADDV_B_ENC, MADDV_B_DESC;
+def MADDV_H : MADDV_H_ENC, MADDV_H_DESC;
+def MADDV_W : MADDV_W_ENC, MADDV_W_DESC;
+def MADDV_D : MADDV_D_ENC, MADDV_D_DESC;
+
+def MAX_A_B : MAX_A_B_ENC, MAX_A_B_DESC;
+def MAX_A_H : MAX_A_H_ENC, MAX_A_H_DESC;
+def MAX_A_W : MAX_A_W_ENC, MAX_A_W_DESC;
+def MAX_A_D : MAX_A_D_ENC, MAX_A_D_DESC;
+
+def MAX_S_B : MAX_S_B_ENC, MAX_S_B_DESC;
+def MAX_S_H : MAX_S_H_ENC, MAX_S_H_DESC;
+def MAX_S_W : MAX_S_W_ENC, MAX_S_W_DESC;
+def MAX_S_D : MAX_S_D_ENC, MAX_S_D_DESC;
+
+def MAX_U_B : MAX_U_B_ENC, MAX_U_B_DESC;
+def MAX_U_H : MAX_U_H_ENC, MAX_U_H_DESC;
+def MAX_U_W : MAX_U_W_ENC, MAX_U_W_DESC;
+def MAX_U_D : MAX_U_D_ENC, MAX_U_D_DESC;
+
+def MAXI_S_B : MAXI_S_B_ENC, MAXI_S_B_DESC;
+def MAXI_S_H : MAXI_S_H_ENC, MAXI_S_H_DESC;
+def MAXI_S_W : MAXI_S_W_ENC, MAXI_S_W_DESC;
+def MAXI_S_D : MAXI_S_D_ENC, MAXI_S_D_DESC;
+
+def MAXI_U_B : MAXI_U_B_ENC, MAXI_U_B_DESC;
+def MAXI_U_H : MAXI_U_H_ENC, MAXI_U_H_DESC;
+def MAXI_U_W : MAXI_U_W_ENC, MAXI_U_W_DESC;
+def MAXI_U_D : MAXI_U_D_ENC, MAXI_U_D_DESC;
+
+def MIN_A_B : MIN_A_B_ENC, MIN_A_B_DESC;
+def MIN_A_H : MIN_A_H_ENC, MIN_A_H_DESC;
+def MIN_A_W : MIN_A_W_ENC, MIN_A_W_DESC;
+def MIN_A_D : MIN_A_D_ENC, MIN_A_D_DESC;
+
+def MIN_S_B : MIN_S_B_ENC, MIN_S_B_DESC;
+def MIN_S_H : MIN_S_H_ENC, MIN_S_H_DESC;
+def MIN_S_W : MIN_S_W_ENC, MIN_S_W_DESC;
+def MIN_S_D : MIN_S_D_ENC, MIN_S_D_DESC;
+
+def MIN_U_B : MIN_U_B_ENC, MIN_U_B_DESC;
+def MIN_U_H : MIN_U_H_ENC, MIN_U_H_DESC;
+def MIN_U_W : MIN_U_W_ENC, MIN_U_W_DESC;
+def MIN_U_D : MIN_U_D_ENC, MIN_U_D_DESC;
+
+def MINI_S_B : MINI_S_B_ENC, MINI_S_B_DESC;
+def MINI_S_H : MINI_S_H_ENC, MINI_S_H_DESC;
+def MINI_S_W : MINI_S_W_ENC, MINI_S_W_DESC;
+def MINI_S_D : MINI_S_D_ENC, MINI_S_D_DESC;
+
+def MINI_U_B : MINI_U_B_ENC, MINI_U_B_DESC;
+def MINI_U_H : MINI_U_H_ENC, MINI_U_H_DESC;
+def MINI_U_W : MINI_U_W_ENC, MINI_U_W_DESC;
+def MINI_U_D : MINI_U_D_ENC, MINI_U_D_DESC;
+
+def MOD_S_B : MOD_S_B_ENC, MOD_S_B_DESC;
+def MOD_S_H : MOD_S_H_ENC, MOD_S_H_DESC;
+def MOD_S_W : MOD_S_W_ENC, MOD_S_W_DESC;
+def MOD_S_D : MOD_S_D_ENC, MOD_S_D_DESC;
+
+def MOD_U_B : MOD_U_B_ENC, MOD_U_B_DESC;
+def MOD_U_H : MOD_U_H_ENC, MOD_U_H_DESC;
+def MOD_U_W : MOD_U_W_ENC, MOD_U_W_DESC;
+def MOD_U_D : MOD_U_D_ENC, MOD_U_D_DESC;
+
+def MOVE_V : MOVE_V_ENC, MOVE_V_DESC;
+
+def MSUB_Q_H : MSUB_Q_H_ENC, MSUB_Q_H_DESC;
+def MSUB_Q_W : MSUB_Q_W_ENC, MSUB_Q_W_DESC;
+
+def MSUBR_Q_H : MSUBR_Q_H_ENC, MSUBR_Q_H_DESC;
+def MSUBR_Q_W : MSUBR_Q_W_ENC, MSUBR_Q_W_DESC;
+
+def MSUBV_B : MSUBV_B_ENC, MSUBV_B_DESC;
+def MSUBV_H : MSUBV_H_ENC, MSUBV_H_DESC;
+def MSUBV_W : MSUBV_W_ENC, MSUBV_W_DESC;
+def MSUBV_D : MSUBV_D_ENC, MSUBV_D_DESC;
+
+def MUL_Q_H : MUL_Q_H_ENC, MUL_Q_H_DESC;
+def MUL_Q_W : MUL_Q_W_ENC, MUL_Q_W_DESC;
+
+def MULR_Q_H : MULR_Q_H_ENC, MULR_Q_H_DESC;
+def MULR_Q_W : MULR_Q_W_ENC, MULR_Q_W_DESC;
+
+def MULV_B : MULV_B_ENC, MULV_B_DESC;
+def MULV_H : MULV_H_ENC, MULV_H_DESC;
+def MULV_W : MULV_W_ENC, MULV_W_DESC;
+def MULV_D : MULV_D_ENC, MULV_D_DESC;
+
+def NLOC_B : NLOC_B_ENC, NLOC_B_DESC;
+def NLOC_H : NLOC_H_ENC, NLOC_H_DESC;
+def NLOC_W : NLOC_W_ENC, NLOC_W_DESC;
+def NLOC_D : NLOC_D_ENC, NLOC_D_DESC;
+
+def NLZC_B : NLZC_B_ENC, NLZC_B_DESC;
+def NLZC_H : NLZC_H_ENC, NLZC_H_DESC;
+def NLZC_W : NLZC_W_ENC, NLZC_W_DESC;
+def NLZC_D : NLZC_D_ENC, NLZC_D_DESC;
+
+def NOR_V : NOR_V_ENC, NOR_V_DESC;
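+// The _H, _W and _D pseudos below (and the matching OR_V/XOR_V ones) exist
+// only so that patterns for wider element types can reuse the byte-wide
+// encoding; each expands to the plain .V instruction on MSA128B operands.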
+def NOR_V_H_PSEUDO : NOR_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def NOR_V_W_PSEUDO : NOR_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def NOR_V_D_PSEUDO : NOR_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def NORI_B : NORI_B_ENC, NORI_B_DESC;
+
+def OR_V : OR_V_ENC, OR_V_DESC;
+def OR_V_H_PSEUDO : OR_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def OR_V_W_PSEUDO : OR_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def OR_V_D_PSEUDO : OR_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def ORI_B : ORI_B_ENC, ORI_B_DESC;
+
+def PCKEV_B : PCKEV_B_ENC, PCKEV_B_DESC;
+def PCKEV_H : PCKEV_H_ENC, PCKEV_H_DESC;
+def PCKEV_W : PCKEV_W_ENC, PCKEV_W_DESC;
+def PCKEV_D : PCKEV_D_ENC, PCKEV_D_DESC;
+
+def PCKOD_B : PCKOD_B_ENC, PCKOD_B_DESC;
+def PCKOD_H : PCKOD_H_ENC, PCKOD_H_DESC;
+def PCKOD_W : PCKOD_W_ENC, PCKOD_W_DESC;
+def PCKOD_D : PCKOD_D_ENC, PCKOD_D_DESC;
+
+def PCNT_B : PCNT_B_ENC, PCNT_B_DESC;
+def PCNT_H : PCNT_H_ENC, PCNT_H_DESC;
+def PCNT_W : PCNT_W_ENC, PCNT_W_DESC;
+def PCNT_D : PCNT_D_ENC, PCNT_D_DESC;
+
+def SAT_S_B : SAT_S_B_ENC, SAT_S_B_DESC;
+def SAT_S_H : SAT_S_H_ENC, SAT_S_H_DESC;
+def SAT_S_W : SAT_S_W_ENC, SAT_S_W_DESC;
+def SAT_S_D : SAT_S_D_ENC, SAT_S_D_DESC;
+
+def SAT_U_B : SAT_U_B_ENC, SAT_U_B_DESC;
+def SAT_U_H : SAT_U_H_ENC, SAT_U_H_DESC;
+def SAT_U_W : SAT_U_W_ENC, SAT_U_W_DESC;
+def SAT_U_D : SAT_U_D_ENC, SAT_U_D_DESC;
+
+def SHF_B : SHF_B_ENC, SHF_B_DESC;
+def SHF_H : SHF_H_ENC, SHF_H_DESC;
+def SHF_W : SHF_W_ENC, SHF_W_DESC;
+
+def SLD_B : SLD_B_ENC, SLD_B_DESC;
+def SLD_H : SLD_H_ENC, SLD_H_DESC;
+def SLD_W : SLD_W_ENC, SLD_W_DESC;
+def SLD_D : SLD_D_ENC, SLD_D_DESC;
+
+def SLDI_B : SLDI_B_ENC, SLDI_B_DESC;
+def SLDI_H : SLDI_H_ENC, SLDI_H_DESC;
+def SLDI_W : SLDI_W_ENC, SLDI_W_DESC;
+def SLDI_D : SLDI_D_ENC, SLDI_D_DESC;
+
+def SLL_B : SLL_B_ENC, SLL_B_DESC;
+def SLL_H : SLL_H_ENC, SLL_H_DESC;
+def SLL_W : SLL_W_ENC, SLL_W_DESC;
+def SLL_D : SLL_D_ENC, SLL_D_DESC;
+
+def SLLI_B : SLLI_B_ENC, SLLI_B_DESC;
+def SLLI_H : SLLI_H_ENC, SLLI_H_DESC;
+def SLLI_W : SLLI_W_ENC, SLLI_W_DESC;
+def SLLI_D : SLLI_D_ENC, SLLI_D_DESC;
+
+def SPLAT_B : SPLAT_B_ENC, SPLAT_B_DESC;
+def SPLAT_H : SPLAT_H_ENC, SPLAT_H_DESC;
+def SPLAT_W : SPLAT_W_ENC, SPLAT_W_DESC;
+def SPLAT_D : SPLAT_D_ENC, SPLAT_D_DESC;
+
+def SPLATI_B : SPLATI_B_ENC, SPLATI_B_DESC;
+def SPLATI_H : SPLATI_H_ENC, SPLATI_H_DESC;
+def SPLATI_W : SPLATI_W_ENC, SPLATI_W_DESC;
+def SPLATI_D : SPLATI_D_ENC, SPLATI_D_DESC;
+
+def SRA_B : SRA_B_ENC, SRA_B_DESC;
+def SRA_H : SRA_H_ENC, SRA_H_DESC;
+def SRA_W : SRA_W_ENC, SRA_W_DESC;
+def SRA_D : SRA_D_ENC, SRA_D_DESC;
+
+def SRAI_B : SRAI_B_ENC, SRAI_B_DESC;
+def SRAI_H : SRAI_H_ENC, SRAI_H_DESC;
+def SRAI_W : SRAI_W_ENC, SRAI_W_DESC;
+def SRAI_D : SRAI_D_ENC, SRAI_D_DESC;
+
+def SRAR_B : SRAR_B_ENC, SRAR_B_DESC;
+def SRAR_H : SRAR_H_ENC, SRAR_H_DESC;
+def SRAR_W : SRAR_W_ENC, SRAR_W_DESC;
+def SRAR_D : SRAR_D_ENC, SRAR_D_DESC;
+
+def SRARI_B : SRARI_B_ENC, SRARI_B_DESC;
+def SRARI_H : SRARI_H_ENC, SRARI_H_DESC;
+def SRARI_W : SRARI_W_ENC, SRARI_W_DESC;
+def SRARI_D : SRARI_D_ENC, SRARI_D_DESC;
+
+def SRL_B : SRL_B_ENC, SRL_B_DESC;
+def SRL_H : SRL_H_ENC, SRL_H_DESC;
+def SRL_W : SRL_W_ENC, SRL_W_DESC;
+def SRL_D : SRL_D_ENC, SRL_D_DESC;
+
+def SRLI_B : SRLI_B_ENC, SRLI_B_DESC;
+def SRLI_H : SRLI_H_ENC, SRLI_H_DESC;
+def SRLI_W : SRLI_W_ENC, SRLI_W_DESC;
+def SRLI_D : SRLI_D_ENC, SRLI_D_DESC;
+
+def SRLR_B : SRLR_B_ENC, SRLR_B_DESC;
+def SRLR_H : SRLR_H_ENC, SRLR_H_DESC;
+def SRLR_W : SRLR_W_ENC, SRLR_W_DESC;
+def SRLR_D : SRLR_D_ENC, SRLR_D_DESC;
+
+def SRLRI_B : SRLRI_B_ENC, SRLRI_B_DESC;
+def SRLRI_H : SRLRI_H_ENC, SRLRI_H_DESC;
+def SRLRI_W : SRLRI_W_ENC, SRLRI_W_DESC;
+def SRLRI_D : SRLRI_D_ENC, SRLRI_D_DESC;
+
+def ST_B : ST_B_ENC, ST_B_DESC;
+def ST_H : ST_H_ENC, ST_H_DESC;
+def ST_W : ST_W_ENC, ST_W_DESC;
+def ST_D : ST_D_ENC, ST_D_DESC;
+
+def SUBS_S_B : SUBS_S_B_ENC, SUBS_S_B_DESC;
+def SUBS_S_H : SUBS_S_H_ENC, SUBS_S_H_DESC;
+def SUBS_S_W : SUBS_S_W_ENC, SUBS_S_W_DESC;
+def SUBS_S_D : SUBS_S_D_ENC, SUBS_S_D_DESC;
+
+def SUBS_U_B : SUBS_U_B_ENC, SUBS_U_B_DESC;
+def SUBS_U_H : SUBS_U_H_ENC, SUBS_U_H_DESC;
+def SUBS_U_W : SUBS_U_W_ENC, SUBS_U_W_DESC;
+def SUBS_U_D : SUBS_U_D_ENC, SUBS_U_D_DESC;
+
+def SUBSUS_U_B : SUBSUS_U_B_ENC, SUBSUS_U_B_DESC;
+def SUBSUS_U_H : SUBSUS_U_H_ENC, SUBSUS_U_H_DESC;
+def SUBSUS_U_W : SUBSUS_U_W_ENC, SUBSUS_U_W_DESC;
+def SUBSUS_U_D : SUBSUS_U_D_ENC, SUBSUS_U_D_DESC;
+
+def SUBSUU_S_B : SUBSUU_S_B_ENC, SUBSUU_S_B_DESC;
+def SUBSUU_S_H : SUBSUU_S_H_ENC, SUBSUU_S_H_DESC;
+def SUBSUU_S_W : SUBSUU_S_W_ENC, SUBSUU_S_W_DESC;
+def SUBSUU_S_D : SUBSUU_S_D_ENC, SUBSUU_S_D_DESC;
+
+def SUBV_B : SUBV_B_ENC, SUBV_B_DESC;
+def SUBV_H : SUBV_H_ENC, SUBV_H_DESC;
+def SUBV_W : SUBV_W_ENC, SUBV_W_DESC;
+def SUBV_D : SUBV_D_ENC, SUBV_D_DESC;
+
+def SUBVI_B : SUBVI_B_ENC, SUBVI_B_DESC;
+def SUBVI_H : SUBVI_H_ENC, SUBVI_H_DESC;
+def SUBVI_W : SUBVI_W_ENC, SUBVI_W_DESC;
+def SUBVI_D : SUBVI_D_ENC, SUBVI_D_DESC;
+
+def VSHF_B : VSHF_B_ENC, VSHF_B_DESC;
+def VSHF_H : VSHF_H_ENC, VSHF_H_DESC;
+def VSHF_W : VSHF_W_ENC, VSHF_W_DESC;
+def VSHF_D : VSHF_D_ENC, VSHF_D_DESC;
+
+def XOR_V : XOR_V_ENC, XOR_V_DESC;
+def XOR_V_H_PSEUDO : XOR_V_H_PSEUDO_DESC,
+ PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def XOR_V_W_PSEUDO : XOR_V_W_PSEUDO_DESC,
+ PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+def XOR_V_D_PSEUDO : XOR_V_D_PSEUDO_DESC,
+ PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+ MSA128BOpnd:$ws,
+ MSA128BOpnd:$wt)>;
+
+def XORI_B : XORI_B_ENC, XORI_B_DESC;
+
+// Patterns.
+class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
+ Pat<pattern, result>, Requires<pred>;
+
+def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
+ (COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
+
+def : MSAPat<(v16i8 (load addr:$addr)), (LD_B addr:$addr)>;
+def : MSAPat<(v8i16 (load addr:$addr)), (LD_H addr:$addr)>;
+def : MSAPat<(v4i32 (load addr:$addr)), (LD_W addr:$addr)>;
+def : MSAPat<(v2i64 (load addr:$addr)), (LD_D addr:$addr)>;
+def : MSAPat<(v8f16 (load addr:$addr)), (LD_H addr:$addr)>;
+def : MSAPat<(v4f32 (load addr:$addr)), (LD_W addr:$addr)>;
+def : MSAPat<(v2f64 (load addr:$addr)), (LD_D addr:$addr)>;
+
+def : MSAPat<(v8f16 (load addrRegImm:$addr)), (LD_H addrRegImm:$addr)>;
+def : MSAPat<(v4f32 (load addrRegImm:$addr)), (LD_W addrRegImm:$addr)>;
+def : MSAPat<(v2f64 (load addrRegImm:$addr)), (LD_D addrRegImm:$addr)>;
+
+def : MSAPat<(store (v16i8 MSA128B:$ws), addr:$addr),
+ (ST_B MSA128B:$ws, addr:$addr)>;
+def : MSAPat<(store (v8i16 MSA128H:$ws), addr:$addr),
+ (ST_H MSA128H:$ws, addr:$addr)>;
+def : MSAPat<(store (v4i32 MSA128W:$ws), addr:$addr),
+ (ST_W MSA128W:$ws, addr:$addr)>;
+def : MSAPat<(store (v2i64 MSA128D:$ws), addr:$addr),
+ (ST_D MSA128D:$ws, addr:$addr)>;
+def : MSAPat<(store (v8f16 MSA128H:$ws), addr:$addr),
+ (ST_H MSA128H:$ws, addr:$addr)>;
+def : MSAPat<(store (v4f32 MSA128W:$ws), addr:$addr),
+ (ST_W MSA128W:$ws, addr:$addr)>;
+def : MSAPat<(store (v2f64 MSA128D:$ws), addr:$addr),
+ (ST_D MSA128D:$ws, addr:$addr)>;
+
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrRegImm:$addr),
+ (ST_H MSA128H:$ws, addrRegImm:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrRegImm:$addr),
+ (ST_W MSA128W:$ws, addrRegImm:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrRegImm:$addr),
+ (ST_D MSA128D:$ws, addrRegImm:$addr)>;
+
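+// The fabs pseudos below expand to FMAX_A.df $wd, $ws, $ws, since the
+// maximum of |x| and |x| is |x|.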
+class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
+ RegisterOperand ROWS = ROWD,
+ InstrItinClass itin = NoItinerary> :
+ MipsPseudo<(outs ROWD:$wd),
+ (ins ROWS:$ws),
+ [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+ InstrItinClass Itinerary = itin;
+}
+def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
+ PseudoInstExpansion<(FMAX_A_W MSA128WOpnd:$wd, MSA128WOpnd:$ws,
+ MSA128WOpnd:$ws)>;
+def FABS_D : MSA_FABS_PSEUDO_DESC_BASE<MSA128DOpnd>,
+ PseudoInstExpansion<(FMAX_A_D MSA128DOpnd:$wd, MSA128DOpnd:$ws,
+ MSA128DOpnd:$ws)>;
+
+class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC, list<Predicate> preds = [HasMSA]> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
+
+// These are endian-independent because the element size doesn't change.
+def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
+def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
+def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
+def : MSABitconvertPat<v8f16, v8i16, MSA128H>;
+def : MSABitconvertPat<v4f32, v4i32, MSA128W>;
+def : MSABitconvertPat<v2f64, v2i64, MSA128D>;
+
+// Little endian bitcasts are always no-ops
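+// (on a little-endian target the byte layout of a vector does not depend on
+// its element size, so reinterpreting the element type needs no data movement)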
+def : MSABitconvertPat<v16i8, v8i16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4i32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2i64, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v8f16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4f32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2f64, MSA128B, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v8i16, v16i8, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4i32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2i64, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4f32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2f64, MSA128H, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4i32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2i64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4f32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2f64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+// Big endian bitcasts expand to shuffle instructions.
+// This is because bitcast is defined to be a store/load sequence and the
+// vector store/load instructions are mixed-endian with respect to the vector
+// as a whole (little endian with respect to element order, but big endian
+// within each element).
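+// The SHF immediates used below encode one 2-bit source index per destination
+// element: 177 (0b10110001) swaps the elements of each adjacent pair, while 27
+// (0b00011011) reverses each group of four elements, so SHF_B 27 followed by
+// SHF_W 177 reverses all eight bytes of a doubleword.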
+
+class MSABitconvertReverseQuartersPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC, MSAInst Insn,
+ RegisterClass ViaRC> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27),
+ DstRC),
+ [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHalvesPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC, MSAInst Insn,
+ RegisterClass ViaRC> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177),
+ DstRC),
+ [HasMSA, IsBE]>;
+
+class MSABitconvertReverseBInHPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInWPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInDPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSAPat<(DstVT (bitconvert SrcVT:$src)),
+ (COPY_TO_REGCLASS
+ (SHF_W
+ (COPY_TO_REGCLASS
+ (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27),
+ MSA128W), 177),
+ DstRC),
+ [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHInWPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseHInDPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseWInDPat<ValueType DstVT, ValueType SrcVT,
+ RegisterClass DstRC> :
+ MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_W, MSA128W>;
+
+def : MSABitconvertReverseBInHPat<v8i16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInHPat<v8f16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInWPat<v4i32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInWPat<v4f32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInDPat<v2i64, v16i8, MSA128D>;
+def : MSABitconvertReverseBInDPat<v2f64, v16i8, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8i16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8i16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8i16, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8f16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8f16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8f16, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4i32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4i32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4i32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4i32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4i32, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4f32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4f32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4f32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4f32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4f32, MSA128D>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2i64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2i64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2i64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2i64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2i64, MSA128W>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2f64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2f64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2f64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2f64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
+
+// Pseudos used to implement BNZ.df and BZ.df.
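+// Each pseudo tests an MSA register and yields 0 or 1 in a GPR32 result; a
+// custom inserter later expands it into the corresponding BNZ.df/BZ.df branch
+// selecting between those two constants.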
+
+class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
+ RegisterClass RCWS,
+ InstrItinClass itin = NoItinerary> :
+ MipsPseudo<(outs GPR32:$dst),
+ (ins RCWS:$ws),
+ [(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
+ bit usesCustomInserter = 1;
+}
+
+def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
+ MSA128B, NoItinerary>;
+def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
+ MSA128H, NoItinerary>;
+def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
+ MSA128W, NoItinerary>;
+def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
+ MSA128D, NoItinerary>;
+def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
+ MSA128B, NoItinerary>;
+
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
+ MSA128B, NoItinerary>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
+ MSA128H, NoItinerary>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
+ MSA128W, NoItinerary>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
+ MSA128D, NoItinerary>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
+ MSA128B, NoItinerary>;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index a7299d7..dedf802 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -15,6 +15,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -22,6 +23,53 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
cl::desc("Always use $gp as the global base register."));
+// class MipsCallEntry.
+MipsCallEntry::MipsCallEntry(const StringRef &N) {
+#ifndef NDEBUG
+ Name = N;
+ Val = 0;
+#endif
+}
+
+MipsCallEntry::MipsCallEntry(const GlobalValue *V) {
+#ifndef NDEBUG
+ Val = V;
+#endif
+}
+
+bool MipsCallEntry::isConstant(const MachineFrameInfo *) const {
+ return false;
+}
+
+bool MipsCallEntry::isAliased(const MachineFrameInfo *) const {
+ return false;
+}
+
+bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const {
+ return false;
+}
+
+void MipsCallEntry::printCustom(raw_ostream &O) const {
+ O << "MipsCallEntry: ";
+#ifndef NDEBUG
+ if (Val)
+ O << Val->getName();
+ else
+ O << Name;
+#endif
+}
+
+MipsFunctionInfo::~MipsFunctionInfo() {
+ for (StringMap<const MipsCallEntry *>::iterator
+ I = ExternalCallEntries.begin(), E = ExternalCallEntries.end(); I != E;
+ ++I)
+ delete I->getValue();
+
+ for (ValueMap<const GlobalValue *, const MipsCallEntry *>::iterator
+ I = GlobalCallEntries.begin(), E = GlobalCallEntries.end(); I != E; ++I)
+ delete I->second;
+}
+
bool MipsFunctionInfo::globalBaseRegSet() const {
return GlobalBaseReg;
}
@@ -72,4 +120,22 @@ bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
|| FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
}
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const StringRef &Name) {
+ const MipsCallEntry *&E = ExternalCallEntries[Name];
+
+ if (!E)
+ E = new MipsCallEntry(Name);
+
+ return MachinePointerInfo(E);
+}
+
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) {
+ const MipsCallEntry *&E = GlobalCallEntries[Val];
+
+ if (!E)
+ E = new MipsCallEntry(Val);
+
+ return MachinePointerInfo(E);
+}
+
void MipsFunctionInfo::anchor() { }
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index b05b348..43bf682 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -15,56 +15,48 @@
#define MIPS_MACHINE_FUNCTION_INFO_H
#include "MipsSubtarget.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ValueMap.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include <utility>
namespace llvm {
+/// \brief A class derived from PseudoSourceValue that represents a GOT entry
+/// resolved by lazy-binding.
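+/// Instances are created by MipsFunctionInfo::callPtrInfo and used as the
+/// pointer value of the MachinePointerInfo attached to the GOT loads of call
+/// targets, letting the backend distinguish those loads from accesses to
+/// ordinary IR values (see the overridden queries below).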
+class MipsCallEntry : public PseudoSourceValue {
+public:
+ explicit MipsCallEntry(const StringRef &N);
+ explicit MipsCallEntry(const GlobalValue *V);
+ virtual bool isConstant(const MachineFrameInfo *) const;
+ virtual bool isAliased(const MachineFrameInfo *) const;
+ virtual bool mayAlias(const MachineFrameInfo *) const;
+
+private:
+ virtual void printCustom(raw_ostream &O) const;
+#ifndef NDEBUG
+ std::string Name;
+ const GlobalValue *Val;
+#endif
+};
+
/// MipsFunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private Mips target-specific information for each MachineFunction.
class MipsFunctionInfo : public MachineFunctionInfo {
- virtual void anchor();
-
- MachineFunction& MF;
- /// SRetReturnReg - Some subtargets require that sret lowering includes
- /// returning the value of the returned struct in a register. This field
- /// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg;
-
- /// GlobalBaseReg - keeps track of the virtual register initialized for
- /// use as the global base register. This is used for PIC in some PIC
- /// relocation models.
- unsigned GlobalBaseReg;
-
- /// Mips16SPAliasReg - keeps track of the virtual register initialized for
- /// use as an alias for SP for use in load/store of halfword/byte from/to
- /// the stack
- unsigned Mips16SPAliasReg;
-
- /// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
-
- /// True if function has a byval argument.
- bool HasByvalArg;
-
- /// Size of incoming argument area.
- unsigned IncomingArgSize;
-
- /// CallsEhReturn - Whether the function calls llvm.eh.return.
- bool CallsEhReturn;
-
- /// Frame objects for spilling eh data registers.
- int EhDataRegFI[4];
-
public:
MipsFunctionInfo(MachineFunction& MF)
: MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
VarArgsFrameIndex(0), CallsEhReturn(false)
{}
+ ~MipsFunctionInfo();
+
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
@@ -92,6 +84,51 @@ public:
int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
bool isEhDataRegFI(int FI) const;
+ /// \brief Create a MachinePointerInfo that has a MipsCallEntry object
+ /// representing a GOT entry for an external function.
+ MachinePointerInfo callPtrInfo(const StringRef &Name);
+
+ /// \brief Create a MachinePointerInfo that has a MipsCallEntry object
+ /// representing a GOT entry for a global function.
+ MachinePointerInfo callPtrInfo(const GlobalValue *Val);
+
+private:
+ virtual void anchor();
+
+ MachineFunction& MF;
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
+ /// GlobalBaseReg - keeps track of the virtual register initialized for
+ /// use as the global base register. This is used for PIC in some PIC
+ /// relocation models.
+ unsigned GlobalBaseReg;
+
+ /// Mips16SPAliasReg - keeps track of the virtual register initialized for
+ /// use as an alias for SP for use in load/store of halfword/byte from/to
+ /// the stack
+ unsigned Mips16SPAliasReg;
+
+ /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+ int VarArgsFrameIndex;
+
+ /// True if function has a byval argument.
+ bool HasByvalArg;
+
+ /// Size of incoming argument area.
+ unsigned IncomingArgSize;
+
+ /// CallsEhReturn - Whether the function calls llvm.eh.return.
+ bool CallsEhReturn;
+
+ /// Frame objects for spilling eh data registers.
+ int EhDataRegFI[4];
+
+ /// MipsCallEntry maps.
+ StringMap<const MipsCallEntry *> ExternalCallEntries;
+ ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries;
};
} // end of namespace llvm
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 1919077..fe60841 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -14,9 +14,17 @@
#define DEBUG_TYPE "mips-os16"
#include "MipsOs16.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+
+static cl::opt<std::string> Mips32FunctionMask(
+ "mips32-function-mask",
+ cl::init(""),
+ cl::desc("Force function to be mips32"),
+ cl::Hidden);
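+// The mask characters are applied to the module's function definitions in
+// order: '1' forces a function to be compiled as mips32 (nomips16), '.' stops
+// applying the mask, any other character leaves the function alone, and the
+// mask wraps around if it is shorter than the number of functions.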
+
namespace {
// Figure out if we need floating point based on the function signature.
@@ -85,18 +93,43 @@ namespace llvm {
bool MipsOs16::runOnModule(Module &M) {
- DEBUG(errs() << "Run on Module MipsOs16\n");
+ bool usingMask = Mips32FunctionMask.length() > 0;
+ bool doneUsingMask = false; // stop applying the mask once a '.' is seen
+ DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+ if (usingMask)
+ DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+ unsigned int functionIndex = 0;
bool modified = false;
for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
if (F->isDeclaration()) continue;
DEBUG(dbgs() << "Working on " << F->getName() << "\n");
- if (needsFP(*F)) {
- DEBUG(dbgs() << " need to compile as nomips16 \n");
- F->addFnAttr("nomips16");
+ if (usingMask) {
+ if (!doneUsingMask) {
+ if (functionIndex == Mips32FunctionMask.length())
+ functionIndex = 0;
+ switch (Mips32FunctionMask[functionIndex]) {
+ case '1':
+ DEBUG(dbgs() << "mask forced mips32: " << F->getName() << "\n");
+ F->addFnAttr("nomips16");
+ break;
+ case '.':
+ doneUsingMask = true;
+ break;
+ default:
+ break;
+ }
+ functionIndex++;
+ }
}
else {
- F->addFnAttr("mips16");
- DEBUG(dbgs() << " no need to compile as nomips16 \n");
+ if (needsFP(*F)) {
+ DEBUG(dbgs() << "os16 forced mips32: " << F->getName() << "\n");
+ F->addFnAttr("nomips16");
+ }
+ else {
+ DEBUG(dbgs() << "os16 forced mips16: " << F->getName() << "\n");
+ F->addFnAttr("mips16");
+ }
}
}
return modified;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 0b5fc33..3105b02 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -47,6 +47,11 @@ MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
+const TargetRegisterClass *
+MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const {
+ return Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+}
unsigned
MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
@@ -56,7 +61,7 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 0;
case Mips::GPR32RegClassID:
case Mips::GPR64RegClassID:
- case Mips::DSPRegsRegClassID: {
+ case Mips::DSPRRegClassID: {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
return 28 - TFI->hasFP(MF);
}
@@ -78,26 +83,34 @@ const uint16_t* MipsRegisterInfo::
getCalleeSavedRegs(const MachineFunction *MF) const {
if (Subtarget.isSingleFloat())
return CSR_SingleFloatOnly_SaveList;
- else if (!Subtarget.hasMips64())
- return CSR_O32_SaveList;
- else if (Subtarget.isABI_N32())
+
+ if (Subtarget.isABI_N64())
+ return CSR_N64_SaveList;
+
+ if (Subtarget.isABI_N32())
return CSR_N32_SaveList;
- assert(Subtarget.isABI_N64());
- return CSR_N64_SaveList;
+ if (Subtarget.isFP64bit())
+ return CSR_O32_FP64_SaveList;
+
+ return CSR_O32_SaveList;
}
const uint32_t*
MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
if (Subtarget.isSingleFloat())
return CSR_SingleFloatOnly_RegMask;
- else if (!Subtarget.hasMips64())
- return CSR_O32_RegMask;
- else if (Subtarget.isABI_N32())
+
+ if (Subtarget.isABI_N64())
+ return CSR_N64_RegMask;
+
+ if (Subtarget.isABI_N32())
return CSR_N32_RegMask;
- assert(Subtarget.isABI_N64());
- return CSR_N64_RegMask;
+ if (Subtarget.isFP64bit())
+ return CSR_O32_FP64_RegMask;
+
+ return CSR_O32_RegMask;
}
const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
@@ -123,7 +136,7 @@ getReservedRegs(const MachineFunction &MF) const {
for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
Reserved.set(ReservedGPR64[I]);
- if (Subtarget.hasMips64()) {
+ if (Subtarget.isFP64bit()) {
// Reserve all registers in AFGR64.
for (RegIter Reg = Mips::AFGR64RegClass.begin(),
EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
@@ -146,7 +159,6 @@ getReservedRegs(const MachineFunction &MF) const {
// Reserve hardware registers.
Reserved.set(Mips::HWR29);
- Reserved.set(Mips::HWR29_64);
// Reserve DSP control register.
Reserved.set(Mips::DSPPos);
@@ -155,6 +167,16 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::DSPEFI);
Reserved.set(Mips::DSPOutFlag);
+ // Reserve MSA control registers.
+ Reserved.set(Mips::MSAIR);
+ Reserved.set(Mips::MSACSR);
+ Reserved.set(Mips::MSAAccess);
+ Reserved.set(Mips::MSASave);
+ Reserved.set(Mips::MSAModify);
+ Reserved.set(Mips::MSARequest);
+ Reserved.set(Mips::MSAMap);
+ Reserved.set(Mips::MSAUnmap);
+
// Reserve RA if in mips16 mode.
if (Subtarget.inMips16Mode()) {
Reserved.set(Mips::RA);
@@ -218,12 +240,3 @@ getFrameRegister(const MachineFunction &MF) const {
}
-unsigned MipsRegisterInfo::
-getEHExceptionRegister() const {
- llvm_unreachable("What is the exception register");
-}
-
-unsigned MipsRegisterInfo::
-getEHHandlerRegister() const {
- llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 20ba41d..0450c6f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -42,6 +42,9 @@ public:
void adjustMipsStackFrame(MachineFunction &MF) const;
/// Code Generation virtual methods...
+ const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+ unsigned Kind) const;
+
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const;
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
@@ -65,10 +68,6 @@ public:
/// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const;
- /// Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
-
/// \brief Return GPR register class.
virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index c72c30d..3173d09 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -11,9 +11,8 @@
// Declarations that describe the MIPS register file
//===----------------------------------------------------------------------===//
let Namespace = "Mips" in {
-def sub_fpeven : SubRegIndex<32>;
-def sub_fpodd : SubRegIndex<32, 32>;
def sub_32 : SubRegIndex<32>;
+def sub_64 : SubRegIndex<64>;
def sub_lo : SubRegIndex<32>;
def sub_hi : SubRegIndex<32, 32>;
def sub_dsp16_19 : SubRegIndex<4, 16>;
@@ -54,17 +53,24 @@ class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
// Mips 64-bit (aliased) FPU Registers
class AFPR<bits<16> Enc, string n, list<Register> subregs>
: MipsRegWithSubRegs<Enc, n, subregs> {
- let SubRegIndices = [sub_fpeven, sub_fpodd];
+ let SubRegIndices = [sub_lo, sub_hi];
let CoveredBySubRegs = 1;
}
class AFPR64<bits<16> Enc, string n, list<Register> subregs>
: MipsRegWithSubRegs<Enc, n, subregs> {
- let SubRegIndices = [sub_32];
+ let SubRegIndices = [sub_lo, sub_hi];
+ let CoveredBySubRegs = 1;
+}
+
+// Mips 128-bit (aliased) MSA Registers
+class AFPR128<bits<16> Enc, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<Enc, n, subregs> {
+ let SubRegIndices = [sub_64];
}
// Accumulator Registers
-class ACC<bits<16> Enc, string n, list<Register> subregs>
+class ACCReg<bits<16> Enc, string n, list<Register> subregs>
: MipsRegWithSubRegs<Enc, n, subregs> {
let SubRegIndices = [sub_lo, sub_hi];
let CoveredBySubRegs = 1;
@@ -150,6 +156,10 @@ let Namespace = "Mips" in {
foreach I = 0-31 in
def F#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+ // Higher half of 64-bit FP registers.
+ foreach I = 0-31 in
+ def F_HI#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
/// Mips double-precision FPU Registers (aliased
/// with the single-precision registers to hold 64-bit values)
foreach I = 0-15 in
@@ -159,22 +169,28 @@ let Namespace = "Mips" in {
/// Mips double-precision FPU Registers in MFP64 mode.
foreach I = 0-31 in
- def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I)]>,
+ def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I), !cast<FPR>("F_HI"#I)]>,
DwarfRegNum<[!add(I, 32)]>;
+ /// Mips MSA registers
+ /// MSA and FPU cannot both be present unless the FPU has 64-bit registers
+ foreach I = 0-31 in
+ def W#I : AFPR128<I, "w"#I, [!cast<AFPR64>("D"#I#"_64")]>,
+ DwarfRegNum<[!add(I, 32)]>;
+
// Hi/Lo registers
- def HI : Register<"ac0">, DwarfRegNum<[64]>;
- def HI1 : Register<"ac1">, DwarfRegNum<[176]>;
- def HI2 : Register<"ac2">, DwarfRegNum<[178]>;
- def HI3 : Register<"ac3">, DwarfRegNum<[180]>;
- def LO : Register<"ac0">, DwarfRegNum<[65]>;
- def LO1 : Register<"ac1">, DwarfRegNum<[177]>;
- def LO2 : Register<"ac2">, DwarfRegNum<[179]>;
- def LO3 : Register<"ac3">, DwarfRegNum<[181]>;
+ def HI0 : MipsReg<0, "ac0">, DwarfRegNum<[64]>;
+ def HI1 : MipsReg<1, "ac1">, DwarfRegNum<[176]>;
+ def HI2 : MipsReg<2, "ac2">, DwarfRegNum<[178]>;
+ def HI3 : MipsReg<3, "ac3">, DwarfRegNum<[180]>;
+ def LO0 : MipsReg<0, "ac0">, DwarfRegNum<[65]>;
+ def LO1 : MipsReg<1, "ac1">, DwarfRegNum<[177]>;
+ def LO2 : MipsReg<2, "ac2">, DwarfRegNum<[179]>;
+ def LO3 : MipsReg<3, "ac3">, DwarfRegNum<[181]>;
let SubRegIndices = [sub_32] in {
- def HI64 : RegisterWithSubRegs<"hi", [HI]>;
- def LO64 : RegisterWithSubRegs<"lo", [LO]>;
+ def HI0_64 : RegisterWithSubRegs<"hi", [HI0]>;
+ def LO0_64 : RegisterWithSubRegs<"lo", [LO0]>;
}
// FP control registers.
@@ -185,20 +201,22 @@ let Namespace = "Mips" in {
foreach I = 0-7 in
def FCC#I : MipsReg<#I, "fcc"#I>;
+ // COP2 registers.
+ foreach I = 0-31 in
+ def COP2#I : MipsReg<#I, ""#I>;
+
// PC register
def PC : Register<"pc">;
// Hardware register $29
def HWR29 : MipsReg<29, "29">;
- def HWR29_64 : MipsReg<29, "29">;
// Accum registers
- def AC0 : ACC<0, "ac0", [LO, HI]>;
- def AC1 : ACC<1, "ac1", [LO1, HI1]>;
- def AC2 : ACC<2, "ac2", [LO2, HI2]>;
- def AC3 : ACC<3, "ac3", [LO3, HI3]>;
+ foreach I = 0-3 in
+ def AC#I : ACCReg<#I, "ac"#I,
+ [!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
- def AC0_64 : ACC<0, "ac0", [LO64, HI64]>;
+ def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
// DSP-ASE control register fields.
def DSPPos : Register<"">;
@@ -217,6 +235,16 @@ let Namespace = "Mips" in {
def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
DSPOutFlag21, DSPOutFlag22,
DSPOutFlag23]>;
+
+ // MSA-ASE control registers.
+ def MSAIR : MipsReg<0, "0">;
+ def MSACSR : MipsReg<1, "1">;
+ def MSAAccess : MipsReg<2, "2">;
+ def MSASave : MipsReg<3, "3">;
+ def MSAModify : MipsReg<4, "4">;
+ def MSARequest : MipsReg<5, "5">;
+ def MSAMap : MipsReg<6, "6">;
+ def MSAUnmap : MipsReg<7, "7">;
}
//===----------------------------------------------------------------------===//
@@ -239,7 +267,7 @@ class GPR32Class<list<ValueType> regTypes> :
K0, K1, GP, SP, FP, RA)>;
def GPR32 : GPR32Class<[i32]>;
-def DSPRegs : GPR32Class<[v4i8, v2i16]>;
+def DSPR : GPR32Class<[v4i8, v2i16]>;
def GPR64 : RegisterClass<"Mips", [i64], 64, (add
// Reserved
@@ -281,6 +309,9 @@ def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
// * FGR32 - 32 32-bit registers (single float only mode)
def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
+def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
+ Unallocatable;
+
def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
// Return Values and Arguments
D0, D1,
@@ -303,33 +334,48 @@ def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
Unallocatable;
+def MSA128B : RegisterClass<"Mips", [v16i8], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128H : RegisterClass<"Mips", [v8i16, v8f16], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128W : RegisterClass<"Mips", [v4i32, v4f32], 128,
+ (sequence "W%u", 0, 31)>;
+def MSA128D : RegisterClass<"Mips", [v2i64, v2f64], 128,
+ (sequence "W%u", 0, 31)>;
+
+def MSACtrl : RegisterClass<"Mips", [i32], 32, (add
+ MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
+
// Hi/Lo Registers
-def LORegs : RegisterClass<"Mips", [i32], 32, (add LO)>;
-def HIRegs : RegisterClass<"Mips", [i32], 32, (add HI)>;
-def LORegsDSP : RegisterClass<"Mips", [i32], 32, (add LO, LO1, LO2, LO3)>;
-def HIRegsDSP : RegisterClass<"Mips", [i32], 32, (add HI, HI1, HI2, HI3)>;
-def LORegs64 : RegisterClass<"Mips", [i64], 64, (add LO64)>;
-def HIRegs64 : RegisterClass<"Mips", [i64], 64, (add HI64)>;
+def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
+def HI32 : RegisterClass<"Mips", [i32], 32, (add HI0)>;
+def LO32DSP : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+def HI32DSP : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
+def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
// Hardware registers
def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>, Unallocatable;
-def HWRegs64 : RegisterClass<"Mips", [i64], 64, (add HWR29_64)>, Unallocatable;
// Accumulator Registers
-def ACRegs : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
let Size = 64;
}
-def ACRegs128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+def ACC128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
let Size = 128;
}
-def ACRegsDSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+def ACC64DSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
let Size = 64;
}
def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
+// Coprocessor 2 registers.
+def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
+ Unallocatable;
+
// Register Operands.
class MipsAsmRegOperand : AsmOperandClass {
@@ -345,9 +391,19 @@ def GPR64AsmOperand : MipsAsmRegOperand {
let ParserMethod = "parseGPR64";
}
-def ACRegsDSPAsmOperand : MipsAsmRegOperand {
- let Name = "ACRegsDSPAsm";
- let ParserMethod = "parseACRegsDSP";
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "ACC64DSPAsm";
+ let ParserMethod = "parseACC64DSP";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "LO32DSPAsm";
+ let ParserMethod = "parseLO32DSP";
+}
+
+def HI32DSPAsmOperand : MipsAsmRegOperand {
+ let Name = "HI32DSPAsm";
+ let ParserMethod = "parseHI32DSP";
}
def CCRAsmOperand : MipsAsmRegOperand {
@@ -370,11 +426,41 @@ def FGR32AsmOperand : MipsAsmRegOperand {
let ParserMethod = "parseFGR32Regs";
}
+def FGRH32AsmOperand : MipsAsmRegOperand {
+ let Name = "FGRH32Asm";
+ let ParserMethod = "parseFGRH32Regs";
+}
+
def FCCRegsAsmOperand : MipsAsmRegOperand {
let Name = "FCCRegsAsm";
let ParserMethod = "parseFCCRegs";
}
+def MSA128BAsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128BAsm";
+ let ParserMethod = "parseMSA128BRegs";
+}
+
+def MSA128HAsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128HAsm";
+ let ParserMethod = "parseMSA128HRegs";
+}
+
+def MSA128WAsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128WAsm";
+ let ParserMethod = "parseMSA128WRegs";
+}
+
+def MSA128DAsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128DAsm";
+ let ParserMethod = "parseMSA128DRegs";
+}
+
+def MSA128CRAsmOperand : MipsAsmRegOperand {
+ let Name = "MSA128CRAsm";
+ let ParserMethod = "parseMSA128CtrlRegs";
+}
+
def GPR32Opnd : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32AsmOperand;
}
@@ -383,6 +469,10 @@ def GPR64Opnd : RegisterOperand<GPR64> {
let ParserMatchClass = GPR64AsmOperand;
}
+def DSPROpnd : RegisterOperand<DSPR> {
+ let ParserMatchClass = GPR32AsmOperand;
+}
+
def CCROpnd : RegisterOperand<CCR> {
let ParserMatchClass = CCRAsmOperand;
}
@@ -392,35 +482,68 @@ def HWRegsAsmOperand : MipsAsmRegOperand {
let ParserMethod = "parseHWRegs";
}
-def HW64RegsAsmOperand : MipsAsmRegOperand {
- let Name = "HW64RegsAsm";
- let ParserMethod = "parseHW64Regs";
+def COP2AsmOperand : MipsAsmRegOperand {
+ let Name = "COP2Asm";
+ let ParserMethod = "parseCOP2";
}
def HWRegsOpnd : RegisterOperand<HWRegs> {
let ParserMatchClass = HWRegsAsmOperand;
}
-def HW64RegsOpnd : RegisterOperand<HWRegs64> {
- let ParserMatchClass = HW64RegsAsmOperand;
-}
-
-def AFGR64RegsOpnd : RegisterOperand<AFGR64> {
+def AFGR64Opnd : RegisterOperand<AFGR64> {
let ParserMatchClass = AFGR64AsmOperand;
}
-def FGR64RegsOpnd : RegisterOperand<FGR64> {
+def FGR64Opnd : RegisterOperand<FGR64> {
let ParserMatchClass = FGR64AsmOperand;
}
-def FGR32RegsOpnd : RegisterOperand<FGR32> {
+def FGR32Opnd : RegisterOperand<FGR32> {
let ParserMatchClass = FGR32AsmOperand;
}
+def FGRH32Opnd : RegisterOperand<FGRH32> {
+ let ParserMatchClass = FGRH32AsmOperand;
+}
+
def FCCRegsOpnd : RegisterOperand<FCC> {
let ParserMatchClass = FCCRegsAsmOperand;
}
-def ACRegsDSPOpnd : RegisterOperand<ACRegsDSP> {
- let ParserMatchClass = ACRegsDSPAsmOperand;
+def LO32DSPOpnd : RegisterOperand<LO32DSP> {
+ let ParserMatchClass = LO32DSPAsmOperand;
}
+
+def HI32DSPOpnd : RegisterOperand<HI32DSP> {
+ let ParserMatchClass = HI32DSPAsmOperand;
+}
+
+def ACC64DSPOpnd : RegisterOperand<ACC64DSP> {
+ let ParserMatchClass = ACC64DSPAsmOperand;
+}
+
+def COP2Opnd : RegisterOperand<COP2> {
+ let ParserMatchClass = COP2AsmOperand;
+}
+
+def MSA128BOpnd : RegisterOperand<MSA128B> {
+ let ParserMatchClass = MSA128BAsmOperand;
+}
+
+def MSA128HOpnd : RegisterOperand<MSA128H> {
+ let ParserMatchClass = MSA128HAsmOperand;
+}
+
+def MSA128WOpnd : RegisterOperand<MSA128W> {
+ let ParserMatchClass = MSA128WAsmOperand;
+}
+
+def MSA128DOpnd : RegisterOperand<MSA128D> {
+ let ParserMatchClass = MSA128DAsmOperand;
+}
+
+def MSA128CROpnd : RegisterOperand<MSACtrl> {
+ let ParserMatchClass = MSA128CRAsmOperand;
+}
+
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index d9e0fa4..33ed4b3 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -32,6 +32,21 @@ using namespace llvm;
namespace {
typedef MachineBasicBlock::iterator Iter;
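+/// Return the (MFHI, MFLO) opcode pair used to read the two halves of the
+/// accumulator register \p Src, or (0, 0) if \p Src is not an accumulator.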
+static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
+ if (Mips::ACC64RegClass.contains(Src))
+ return std::make_pair((unsigned)Mips::PseudoMFHI,
+ (unsigned)Mips::PseudoMFLO);
+
+ if (Mips::ACC64DSPRegClass.contains(Src))
+ return std::make_pair((unsigned)Mips::MFHI_DSP, (unsigned)Mips::MFLO_DSP);
+
+ if (Mips::ACC128RegClass.contains(Src))
+ return std::make_pair((unsigned)Mips::PseudoMFHI64,
+ (unsigned)Mips::PseudoMFLO64);
+
+ return std::make_pair(0, 0);
+}
+
/// Helper class to expand pseudos.
class ExpandPseudo {
public:
@@ -43,10 +58,11 @@ private:
void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
- void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+ void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+ unsigned MFLoOpc, unsigned RegSize);
bool expandCopy(MachineBasicBlock &MBB, Iter I);
- bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
- unsigned Src, unsigned RegSize);
+ bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+ unsigned MFLoOpc);
MachineFunction &MF;
MachineRegisterInfo &MRI;
@@ -70,32 +86,26 @@ bool ExpandPseudo::expand() {
bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
switch(I->getOpcode()) {
case Mips::LOAD_CCOND_DSP:
- case Mips::LOAD_CCOND_DSP_P8:
expandLoadCCond(MBB, I);
break;
case Mips::STORE_CCOND_DSP:
- case Mips::STORE_CCOND_DSP_P8:
expandStoreCCond(MBB, I);
break;
- case Mips::LOAD_AC64:
- case Mips::LOAD_AC64_P8:
- case Mips::LOAD_AC_DSP:
- case Mips::LOAD_AC_DSP_P8:
+ case Mips::LOAD_ACC64:
+ case Mips::LOAD_ACC64DSP:
expandLoadACC(MBB, I, 4);
break;
- case Mips::LOAD_AC128:
- case Mips::LOAD_AC128_P8:
+ case Mips::LOAD_ACC128:
expandLoadACC(MBB, I, 8);
break;
- case Mips::STORE_AC64:
- case Mips::STORE_AC64_P8:
- case Mips::STORE_AC_DSP:
- case Mips::STORE_AC_DSP_P8:
- expandStoreACC(MBB, I, 4);
+ case Mips::STORE_ACC64:
+ expandStoreACC(MBB, I, Mips::PseudoMFHI, Mips::PseudoMFLO, 4);
+ break;
+ case Mips::STORE_ACC64DSP:
+ expandStoreACC(MBB, I, Mips::MFHI_DSP, Mips::MFLO_DSP, 4);
break;
- case Mips::STORE_AC128:
- case Mips::STORE_AC128_P8:
- expandStoreACC(MBB, I, 8);
+ case Mips::STORE_ACC128:
+ expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
break;
case TargetOpcode::COPY:
if (!expandCopy(MBB, I))
@@ -179,10 +189,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
}
void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
+ unsigned MFHiOpc, unsigned MFLoOpc,
unsigned RegSize) {
- // copy $vr0, lo
+ // mflo $vr0, src
// store $vr0, FI
- // copy $vr1, hi
+ // mfhi $vr1, src
// store $vr1, FI + 4
assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
@@ -197,33 +208,29 @@ void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
unsigned VR1 = MRI.createVirtualRegister(RC);
unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
- unsigned Lo = RegInfo.getSubReg(Src, Mips::sub_lo);
- unsigned Hi = RegInfo.getSubReg(Src, Mips::sub_hi);
DebugLoc DL = I->getDebugLoc();
- BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(Lo, SrcKill);
+ BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
- BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(Hi, SrcKill);
+ BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
}
bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
- unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
-
- if (Mips::ACRegsDSPRegClass.contains(Dst, Src))
- return expandCopyACC(MBB, I, Dst, Src, 4);
+ unsigned Src = I->getOperand(1).getReg();
+ std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src);
- if (Mips::ACRegs128RegClass.contains(Dst, Src))
- return expandCopyACC(MBB, I, Dst, Src, 8);
+ if (!Opcodes.first)
+ return false;
- return false;
+ return expandCopyACC(MBB, I, Opcodes.first, Opcodes.second);
}
-bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
- unsigned Src, unsigned RegSize) {
- // copy $vr0, src_lo
+bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
+ unsigned MFHiOpc, unsigned MFLoOpc) {
+ // mflo $vr0, src
// copy dst_lo, $vr0
- // copy $vr1, src_hi
+ // mfhi $vr1, src
// copy dst_hi, $vr1
const MipsSEInstrInfo &TII =
@@ -231,20 +238,20 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
const MipsRegisterInfo &RegInfo =
*static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
- const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+ unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+ unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
+ const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
unsigned VR0 = MRI.createVirtualRegister(RC);
unsigned VR1 = MRI.createVirtualRegister(RC);
unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
- unsigned SrcLo = RegInfo.getSubReg(Src, Mips::sub_lo);
- unsigned SrcHi = RegInfo.getSubReg(Src, Mips::sub_hi);
DebugLoc DL = I->getDebugLoc();
- BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(SrcLo, SrcKill);
+ BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstLo)
.addReg(VR0, RegState::Kill);
- BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(SrcHi, SrcKill);
+ BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstHi)
.addReg(VR1, RegState::Kill);
return true;
@@ -321,9 +328,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
// one for each of the paired single precision registers.
if (Mips::AFGR64RegClass.contains(Reg)) {
unsigned Reg0 =
- MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_fpeven), true);
+ MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_lo), true);
unsigned Reg1 =
- MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_fpodd), true);
+ MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_hi), true);
if (!STI.isLittle())
std::swap(Reg0, Reg1);
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 193a66c..8fa9e46 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -21,7 +21,7 @@ namespace llvm {
class MipsSEFrameLowering : public MipsFrameLowering {
public:
explicit MipsSEFrameLowering(const MipsSubtarget &STI)
- : MipsFrameLowering(STI, STI.hasMips64() ? 16 : 8) {}
+ : MipsFrameLowering(STI, STI.stackAlignment()) {}
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 3b6480a..737660e 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -66,6 +66,21 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
MIB.addReg(Mips::DSPEFI, Flag);
}
+unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+ switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
+ default:
+ llvm_unreachable("Could not map int to register");
+ case 0: return Mips::MSAIR;
+ case 1: return Mips::MSACSR;
+ case 2: return Mips::MSAAccess;
+ case 3: return Mips::MSASave;
+ case 4: return Mips::MSAModify;
+ case 5: return Mips::MSARequest;
+ case 6: return Mips::MSAMap;
+ case 7: return Mips::MSAUnmap;
+ }
+}
+
bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
const MachineInstr& MI) {
unsigned DstReg = 0, ZeroReg = 0;
@@ -301,6 +316,20 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
return false;
}
+/// ComplexPattern used in MipsInstrInfo.
+/// Used on Mips load/store instructions.
+bool MipsSEDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ // Operand is a result from an ADD.
+ if (Addr.getOpcode() == ISD::ADD) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1);
+ return true;
+ }
+
+ return false;
+}
+
bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const {
Base = Addr;
@@ -314,6 +343,263 @@ bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
selectAddrDefault(Addr, Base, Offset);
}
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ EVT ValTy = Addr.getValueType();
+
+ // Addresses of the form FI+const or FI|const
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isInt<12>(CN->getSExtValue())) {
+
+ // If the first operand is a FI then get the TargetFI Node
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+ (Addr.getOperand(0)))
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ else
+ Base = Addr.getOperand(0);
+
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const {
+ return selectAddrRegImm12(Addr, Base, Offset) ||
+ selectAddrDefault(Addr, Base, Offset);
+}
+
+// Select constant vector splats.
+//
+// Returns true and sets Imm if:
+// * MSA is enabled
+// * N is an ISD::BUILD_VECTOR representing a constant splat
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+ if (!Subtarget.hasMSA())
+ return false;
+
+ BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
+
+ if (Node == NULL)
+ return false;
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, 8,
+ !Subtarget.isLittle()))
+ return false;
+
+ Imm = SplatValue;
+
+ return true;
+}
+
+// Select constant vector splats.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value fits in an integer with the specified signed-ness and
+// width.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+//
+// It's worth noting that this function is not used as part of the selection
+// of ldi.[bhwd] since it does not permit using the wrong-typed ldi.[bhwd]
+// instruction to achieve the desired bit pattern. ldi.[bhwd] is selected in
+// MipsSEDAGToDAGISel::selectNode.
+bool MipsSEDAGToDAGISel::
+selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+ unsigned ImmBitSize) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat (N.getNode(), ImmValue) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
+ (!Signed && ImmValue.isIntN(ImmBitSize))) {
+ Imm = CurDAG->getTargetConstant(ImmValue, EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
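For illustration, a minimal standalone sketch of the width check that selectVSplatCommon performs, using plain integers rather than APInt; the helper name fitsImm is made up for this sketch and mirrors APInt::isSignedIntN / APInt::isIntN:

    #include <cstdint>
    #include <cstdio>

    // Does Val fit in an ImmBitSize-bit immediate of the requested signedness?
    static bool fitsImm(int64_t Val, unsigned ImmBitSize, bool Signed) {
      if (Signed) {
        int64_t Min = -(INT64_C(1) << (ImmBitSize - 1));
        int64_t Max = (INT64_C(1) << (ImmBitSize - 1)) - 1;
        return Val >= Min && Val <= Max;
      }
      return Val >= 0 && Val < (INT64_C(1) << ImmBitSize);
    }

    int main() {
      printf("%d\n", fitsImm(-1, 5, true));  // 1: -1 is a valid simm5 splat
      printf("%d\n", fitsImm(-1, 5, false)); // 0: -1 is not a valid uimm5 splat
      printf("%d\n", fitsImm(31, 5, false)); // 1: 31 fits in uimm5
      printf("%d\n", fitsImm(31, 5, true));  // 0: 31 does not fit in simm5
      return 0;
    }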
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 1);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 2);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 3);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 4);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 5);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 6);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, false, 8);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+ return selectVSplatCommon(N, Imm, true, 5);
+}
+
+// Select constant vector splats whose value is a power of 2.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a power of two.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat (N.getNode(), ImmValue) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ int32_t Log2 = ImmValue.exactLogBase2();
+
+ if (Log2 != -1) {
+ Imm = CurDAG->getTargetConstant(Log2, EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of left-most bits set (e.g. 0b11...1100...00).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of left-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ // Extract the run of set bits starting with bit zero from the bitwise
+ // inverse of ImmValue, and test that the inverse of this is the same
+ // as the original value.
+ if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
+
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of right-most bits set (e.g. 0b00...0011...11).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of right-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+// sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ // Extract the run of set bits starting with bit zero, and test that the
+ // result is the same as the original value
+ if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
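The two bit tricks used by selectVSplatMaskL and selectVSplatMaskR are compact but easy to misread; here is a small standalone check, on ordinary uint32_t values outside LLVM, of what each expression accepts:

    #include <cstdint>
    #include <cstdio>

    // "Left mask": a run of set bits ending at the most significant bit,
    // e.g. 0b111...1100...00. Same test as selectVSplatMaskL.
    static bool isLeftMask(uint32_t X)  { return X == ~(~X & ~(~X + 1u)); }

    // "Right mask": a run of set bits starting at bit zero,
    // e.g. 0b000...0011...11. Same test as selectVSplatMaskR.
    static bool isRightMask(uint32_t X) { return X == (X & ~(X + 1u)); }

    int main() {
      printf("%d %d\n", isLeftMask(0xFFFF0000u), isRightMask(0xFFFF0000u)); // 1 0
      printf("%d %d\n", isLeftMask(0x000000FFu), isRightMask(0x000000FFu)); // 0 1
      printf("%d %d\n", isLeftMask(0x0FF00000u), isRightMask(0x0FF00000u)); // 0 0
      return 0;
    }

In both accepted cases the immediate that is actually emitted is the population count of the splat value, i.e. the length of the run.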
+bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
+ SDValue &Imm) const {
+ APInt ImmValue;
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ if (selectVSplat(N.getNode(), ImmValue) &&
+ ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+ int32_t Log2 = (~ImmValue).exactLogBase2();
+
+ if (Log2 != -1) {
+ Imm = CurDAG->getTargetConstant(Log2, EltTy);
+ return true;
+ }
+ }
+
+ return false;
+}
+
std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
unsigned Opcode = Node->getOpcode();
SDLoc DL(Node);
@@ -348,6 +634,11 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO_64, MVT::i64);
Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+ } else if (Subtarget.isFP64bit()) {
+ SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+ Mips::ZERO, MVT::i32);
+ Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
+ Zero, Zero);
} else {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
Mips::ZERO, MVT::i32);
@@ -401,24 +692,71 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
return std::make_pair(true, RegOpnd);
}
+ case ISD::INTRINSIC_W_CHAIN: {
+ switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+
+ case Intrinsic::mips_cfcmsa: {
+ SDValue ChainIn = Node->getOperand(0);
+ SDValue RegIdx = Node->getOperand(2);
+ SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
+ getMSACtrlReg(RegIdx), MVT::i32);
+ return std::make_pair(true, Reg.getNode());
+ }
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+ default:
+ break;
+
+ case Intrinsic::mips_move_v:
+ // Like an assignment but will always produce a move.v even if
+ // unnecessary.
+ return std::make_pair(true,
+ CurDAG->getMachineNode(Mips::MOVE_V, DL,
+ Node->getValueType(0),
+ Node->getOperand(1)));
+ }
+ break;
+ }
+
+ case ISD::INTRINSIC_VOID: {
+ switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ default:
+ break;
+
+ case Intrinsic::mips_ctcmsa: {
+ SDValue ChainIn = Node->getOperand(0);
+ SDValue RegIdx = Node->getOperand(2);
+ SDValue Value = Node->getOperand(3);
+ SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
+ getMSACtrlReg(RegIdx), Value);
+ return std::make_pair(true, ChainOut.getNode());
+ }
+ }
+ break;
+ }
+
case MipsISD::ThreadPointer: {
EVT PtrVT = getTargetLowering()->getPointerTy();
- unsigned RdhwrOpc, SrcReg, DestReg;
+ unsigned RdhwrOpc, DestReg;
if (PtrVT == MVT::i32) {
RdhwrOpc = Mips::RDHWR;
- SrcReg = Mips::HWR29;
DestReg = Mips::V1;
} else {
RdhwrOpc = Mips::RDHWR64;
- SrcReg = Mips::HWR29_64;
DestReg = Mips::V1_64;
}
SDNode *Rdhwr =
CurDAG->getMachineNode(RdhwrOpc, SDLoc(Node),
Node->getValueType(0),
- CurDAG->getRegister(SrcReg, PtrVT));
+ CurDAG->getRegister(Mips::HWR29, MVT::i32));
SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
SDValue(Rdhwr, 0));
SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
@@ -426,18 +764,81 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
return std::make_pair(true, ResNode.getNode());
}
- case MipsISD::InsertLOHI: {
- unsigned RCID = Subtarget.hasDSP() ? Mips::ACRegsDSPRegClassID :
- Mips::ACRegsRegClassID;
- SDValue RegClass = CurDAG->getTargetConstant(RCID, MVT::i32);
- SDValue LoIdx = CurDAG->getTargetConstant(Mips::sub_lo, MVT::i32);
- SDValue HiIdx = CurDAG->getTargetConstant(Mips::sub_hi, MVT::i32);
- const SDValue Ops[] = { RegClass, Node->getOperand(0), LoIdx,
- Node->getOperand(1), HiIdx };
- SDNode *Res = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
- MVT::Untyped, Ops);
+ case ISD::BUILD_VECTOR: {
+ // Select appropriate ldi.[bhwd] instructions for constant splats of
+ // 128-bit when MSA is enabled. Fixup any register class mismatches that
+ // occur as a result.
+ //
+ // This allows the compiler to use a wider range of immediates than would
+ // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
+ // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
+ // 0x01010101 } without using a constant pool. This would be sub-optimal
+ // when 'ldi.b wd, 1' is capable of producing that bit-pattern in the
+ // same set of registers. Similarly, ldi.h isn't capable of producing {
+ // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ unsigned LdiOp;
+ EVT ResVecTy = BVN->getValueType(0);
+ EVT ViaVecTy;
+
+ if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
+ return std::make_pair(false, (SDNode*)NULL);
+
+ if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, 8,
+ !Subtarget.isLittle()))
+ return std::make_pair(false, (SDNode*)NULL);
+
+ switch (SplatBitSize) {
+ default:
+ return std::make_pair(false, (SDNode*)NULL);
+ case 8:
+ LdiOp = Mips::LDI_B;
+ ViaVecTy = MVT::v16i8;
+ break;
+ case 16:
+ LdiOp = Mips::LDI_H;
+ ViaVecTy = MVT::v8i16;
+ break;
+ case 32:
+ LdiOp = Mips::LDI_W;
+ ViaVecTy = MVT::v4i32;
+ break;
+ case 64:
+ LdiOp = Mips::LDI_D;
+ ViaVecTy = MVT::v2i64;
+ break;
+ }
+
+ if (!SplatValue.isSignedIntN(10))
+ return std::make_pair(false, (SDNode*)NULL);
+
+ SDValue Imm = CurDAG->getTargetConstant(SplatValue,
+ ViaVecTy.getVectorElementType());
+
+ SDNode *Res = CurDAG->getMachineNode(LdiOp, SDLoc(Node), ViaVecTy, Imm);
+
+ if (ResVecTy != ViaVecTy) {
+ // If LdiOp is writing to a different register class to ResVecTy, then
+ // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
+ // since the source and destination register sets contain the same
+ // registers.
+ const TargetLowering *TLI = getTargetLowering();
+ MVT ResVecTySimple = ResVecTy.getSimpleVT();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
+ Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, SDLoc(Node),
+ ResVecTy, SDValue(Res, 0),
+ CurDAG->getTargetConstant(RC->getID(),
+ MVT::i32));
+ }
+
return std::make_pair(true, Res);
}
+
}
return std::make_pair(false, (SDNode*)NULL);
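To make the BUILD_VECTOR case above concrete, the following standalone sketch (a rough approximation, not the LLVM API) computes the smallest repeating chunk width of a 32-bit lane value, which is essentially what isConstantSplat reports as SplatBitSize for a v4i32 splat and hence which ldi.[bhwd] variant gets picked:

    #include <cstdint>
    #include <cstdio>

    // Smallest power-of-two chunk width (in bits) whose repetition rebuilds the
    // 32-bit lane value; this decides between LDI_B/LDI_H/LDI_W for v4i32 splats.
    static unsigned minSplatBits(uint32_t Lane) {
      for (unsigned Bits = 8; Bits < 32; Bits *= 2) {
        uint32_t Chunk = Lane & ((1u << Bits) - 1);
        uint32_t Rebuilt = 0;
        for (unsigned I = 0; I < 32; I += Bits)
          Rebuilt |= Chunk << I;
        if (Rebuilt == Lane)
          return Bits;
      }
      return 32;
    }

    int main() {
      printf("%u\n", minSplatBits(0x01010101u)); // 8  -> ldi.b wd, 1
      printf("%u\n", minSplatBits(0x00010001u)); // 16 -> ldi.h wd, 1
      printf("%u\n", minSplatBits(0x00000001u)); // 32 -> ldi.w wd, 1
      return 0;
    }

The 64-bit case (ldi.d), the 10-bit signed immediate limit, and the COPY_TO_REGCLASS fixup for mismatched result types are handled separately in the code above.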
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 03ed1f9..dc52064 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -30,6 +30,8 @@ private:
void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
MachineFunction &MF);
+ unsigned getMSACtrlReg(const SDValue RegIdx) const;
+
bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc dl,
@@ -41,12 +43,54 @@ private:
virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
SDValue &Offset) const;
+ virtual bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+ SDValue &Offset) const;
+
+ /// \brief Select constant vector splats.
+ virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a given integer.
+ virtual bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+ unsigned ImmBitSize) const;
+ /// \brief Select constant vector splats whose value fits in a uimm1.
+ virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm2.
+ virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm3.
+ virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm4.
+ virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm5.
+ virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm6.
+ virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a uimm8.
+ virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value fits in a simm5.
+ virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a power of 2.
+ virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is the inverse of a
+ /// power of 2.
+ virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// ending at the most significant bit.
+ virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+ /// \brief Select constant vector splats whose value is a run of set bits
+ /// starting at bit zero.
+ virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
virtual void processFunctionAfterISel(MachineFunction &MF);
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index a0aacb5..809adc0 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -10,6 +10,7 @@
// Subclass of MipsTargetLowering specialized for mips32/64.
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "mips-isel"
#include "MipsSEISelLowering.h"
#include "MipsRegisterInfo.h"
#include "MipsTargetMachine.h"
@@ -17,6 +18,8 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
@@ -25,22 +28,40 @@ static cl::opt<bool>
EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
cl::desc("MIPS: Enable tail calls."), cl::init(false));
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+ cl::desc("Expand double precision loads and "
+ "stores to their single precision "
+ "counterparts"));
+
MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
: MipsTargetLowering(TM) {
// Set up the register classes
-
- clearRegisterClasses();
-
addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
if (HasMips64)
addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
+ if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
+ // Expand all truncating stores and extending loads.
+ unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+
+ for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
+ for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
+ setTruncStoreAction((MVT::SimpleValueType)VT0,
+ (MVT::SimpleValueType)VT1, Expand);
+
+ setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+ setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+ }
+ }
+
if (Subtarget->hasDSP()) {
MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
- addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+ addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
// Expand all builtin opcodes.
for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
@@ -53,20 +74,6 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::BITCAST, VecTys[i], Legal);
}
- // Expand all truncating stores and extending loads.
- unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
-
- for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
- for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
- setTruncStoreAction((MVT::SimpleValueType)VT0,
- (MVT::SimpleValueType)VT1, Expand);
-
- setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
- setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
- }
-
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
@@ -77,12 +84,28 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
if (Subtarget->hasDSPR2())
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
- if (!TM.Options.UseSoftFloat) {
+ if (Subtarget->hasMSA()) {
+ addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
+ addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
+ addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
+ addMSAIntType(MVT::v2i64, &Mips::MSA128DRegClass);
+ addMSAFloatType(MVT::v8f16, &Mips::MSA128HRegClass);
+ addMSAFloatType(MVT::v4f32, &Mips::MSA128WRegClass);
+ addMSAFloatType(MVT::v2f64, &Mips::MSA128DRegClass);
+
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::VSELECT);
+ setTargetDAGCombine(ISD::XOR);
+ }
+
+ if (!Subtarget->mipsSEUsesSoftFloat()) {
addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
// When dealing with single precision only, use libcalls
if (!Subtarget->isSingleFloat()) {
- if (HasMips64)
+ if (Subtarget->isFP64bit())
addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
else
addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
@@ -115,6 +138,15 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
setTargetDAGCombine(ISD::SUBE);
setTargetDAGCombine(ISD::MUL);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ if (NoDPLoadStore) {
+ setOperationAction(ISD::LOAD, MVT::f64, Custom);
+ setOperationAction(ISD::STORE, MVT::f64, Custom);
+ }
+
computeRegisterProperties();
}
@@ -123,6 +155,93 @@ llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
return new MipsSETargetLowering(TM);
}
+// Enable MSA support for the given integer type and Register class.
+void MipsSETargetLowering::
+addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+ addRegisterClass(Ty, RC);
+
+ // Expand all builtin opcodes.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, Ty, Expand);
+
+ setOperationAction(ISD::BITCAST, Ty, Legal);
+ setOperationAction(ISD::LOAD, Ty, Legal);
+ setOperationAction(ISD::STORE, Ty, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+ setOperationAction(ISD::ADD, Ty, Legal);
+ setOperationAction(ISD::AND, Ty, Legal);
+ setOperationAction(ISD::CTLZ, Ty, Legal);
+ setOperationAction(ISD::CTPOP, Ty, Legal);
+ setOperationAction(ISD::MUL, Ty, Legal);
+ setOperationAction(ISD::OR, Ty, Legal);
+ setOperationAction(ISD::SDIV, Ty, Legal);
+ setOperationAction(ISD::SREM, Ty, Legal);
+ setOperationAction(ISD::SHL, Ty, Legal);
+ setOperationAction(ISD::SRA, Ty, Legal);
+ setOperationAction(ISD::SRL, Ty, Legal);
+ setOperationAction(ISD::SUB, Ty, Legal);
+ setOperationAction(ISD::UDIV, Ty, Legal);
+ setOperationAction(ISD::UREM, Ty, Legal);
+ setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
+ setOperationAction(ISD::VSELECT, Ty, Legal);
+ setOperationAction(ISD::XOR, Ty, Legal);
+
+ if (Ty == MVT::v4i32 || Ty == MVT::v2i64) {
+ setOperationAction(ISD::FP_TO_SINT, Ty, Legal);
+ setOperationAction(ISD::FP_TO_UINT, Ty, Legal);
+ setOperationAction(ISD::SINT_TO_FP, Ty, Legal);
+ setOperationAction(ISD::UINT_TO_FP, Ty, Legal);
+ }
+
+ setOperationAction(ISD::SETCC, Ty, Legal);
+ setCondCodeAction(ISD::SETNE, Ty, Expand);
+ setCondCodeAction(ISD::SETGE, Ty, Expand);
+ setCondCodeAction(ISD::SETGT, Ty, Expand);
+ setCondCodeAction(ISD::SETUGE, Ty, Expand);
+ setCondCodeAction(ISD::SETUGT, Ty, Expand);
+}
+
+// Enable MSA support for the given floating-point type and Register class.
+void MipsSETargetLowering::
+addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+ addRegisterClass(Ty, RC);
+
+ // Expand all builtin opcodes.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, Ty, Expand);
+
+ setOperationAction(ISD::LOAD, Ty, Legal);
+ setOperationAction(ISD::STORE, Ty, Legal);
+ setOperationAction(ISD::BITCAST, Ty, Legal);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+ if (Ty != MVT::v8f16) {
+ setOperationAction(ISD::FABS, Ty, Legal);
+ setOperationAction(ISD::FADD, Ty, Legal);
+ setOperationAction(ISD::FDIV, Ty, Legal);
+ setOperationAction(ISD::FEXP2, Ty, Legal);
+ setOperationAction(ISD::FLOG2, Ty, Legal);
+ setOperationAction(ISD::FMA, Ty, Legal);
+ setOperationAction(ISD::FMUL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FSQRT, Ty, Legal);
+ setOperationAction(ISD::FSUB, Ty, Legal);
+ setOperationAction(ISD::VSELECT, Ty, Legal);
+
+ setOperationAction(ISD::SETCC, Ty, Legal);
+ setCondCodeAction(ISD::SETOGE, Ty, Expand);
+ setCondCodeAction(ISD::SETOGT, Ty, Expand);
+ setCondCodeAction(ISD::SETUGE, Ty, Expand);
+ setCondCodeAction(ISD::SETUGT, Ty, Expand);
+ setCondCodeAction(ISD::SETGE, Ty, Expand);
+ setCondCodeAction(ISD::SETGT, Ty, Expand);
+ }
+}
bool
MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
@@ -142,6 +261,8 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch(Op.getOpcode()) {
+ case ISD::LOAD: return lowerLOAD(Op, DAG);
+ case ISD::STORE: return lowerSTORE(Op, DAG);
case ISD::SMUL_LOHI: return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
case ISD::UMUL_LOHI: return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
case ISD::MULHS: return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
@@ -152,6 +273,10 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
DAG);
case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return lowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID: return lowerINTRINSIC_VOID(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, DAG);
}
return MipsTargetLowering::LowerOperation(Op, DAG);
@@ -204,7 +329,7 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
SDLoc DL(ADDENode);
// Initialize accumulator.
- SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+ SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
ADDCNode->getOperand(1),
ADDENode->getOperand(1));
@@ -218,15 +343,11 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
// replace uses of adde and addc here
if (!SDValue(ADDCNode, 0).use_empty()) {
- SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
- SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
- LoIdx);
+ SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
}
if (!SDValue(ADDENode, 0).use_empty()) {
- SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
- SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
- HiIdx);
+ SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
}
@@ -280,7 +401,7 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
SDLoc DL(SUBENode);
// Initialize accumulator.
- SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+ SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
SUBCNode->getOperand(0),
SUBENode->getOperand(0));
@@ -294,15 +415,11 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
// replace uses of sube and subc here
if (!SDValue(SUBCNode, 0).use_empty()) {
- SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
- SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
- LoIdx);
+ SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
}
if (!SDValue(SUBENode, 0).use_empty()) {
- SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
- SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
- HiIdx);
+ SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
}
@@ -322,6 +439,248 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to zero extension if its
+// sign/zero-extension is completely overwritten by the new one performed by
+// the ISD::AND.
+// - Removes redundant zero extensions performed by an ISD::AND.
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget *Subtarget) {
+ if (!Subtarget->hasMSA())
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ unsigned Op0Opcode = Op0->getOpcode();
+
+ // (and (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d)
+ // where $d + 1 == 2^n and n == 32
+ // or $d + 1 == 2^n and n <= 32 and ZExt
+ // -> (MipsVExtractZExt $a, $b, $c)
+ if (Op0Opcode == MipsISD::VEXTRACT_SEXT_ELT ||
+ Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT) {
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Op1);
+
+ if (!Mask)
+ return SDValue();
+
+ int32_t Log2IfPositive = (Mask->getAPIntValue() + 1).exactLogBase2();
+
+ if (Log2IfPositive <= 0)
+ return SDValue(); // Mask+1 is not a power of 2
+
+ SDValue Op0Op2 = Op0->getOperand(2);
+ EVT ExtendTy = cast<VTSDNode>(Op0Op2)->getVT();
+ unsigned ExtendTySize = ExtendTy.getSizeInBits();
+ unsigned Log2 = Log2IfPositive;
+
+ if ((Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT && Log2 >= ExtendTySize) ||
+ Log2 == ExtendTySize) {
+ SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
+ DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT,
+ Op0->getVTList(), Ops, Op0->getNumOperands());
+ return Op0;
+ }
+ }
+
+ return SDValue();
+}
+
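A scalar analogue of the first transformation above, as a standalone check in plain C++ with nothing LLVM-specific: masking a sign-extended element with 2^n - 1, where n covers the extended width, is the same as zero-extending the element in the first place.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int8_t Elt = -3; // an i8 element extracted from a vector
      // (and (sext i8 -> i32), 0xFF) ...
      uint32_t Masked = (uint32_t)(int32_t)Elt & 0xFFu;
      // ... equals (zext i8 -> i32), i.e. what VEXTRACT_ZEXT_ELT produces.
      uint32_t ZExt = (uint32_t)(uint8_t)Elt;
      printf("%u %u\n", Masked, ZExt); // 253 253
      return 0;
    }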
+// Determine if the specified node is a constant vector splat.
+//
+// Returns true and sets Imm if:
+// * N is an ISD::BUILD_VECTOR representing a constant splat
+//
+// This function is quite similar to MipsSEDAGToDAGISel::selectVSplat. The
+// differences are that it assumes the MSA has already been checked and the
+// arbitrary requirement for a maximum of 32-bit integers isn't applied (and
+// must not be in order for binsri.d to be selectable).
+static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
+ BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N.getNode());
+
+ if (Node == NULL)
+ return false;
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+ 8, !IsLittleEndian))
+ return false;
+
+ Imm = SplatValue;
+
+ return true;
+}
+
+// Test whether the given node is an all-ones build_vector.
+static bool isVectorAllOnes(SDValue N) {
+ // Look through bitcasts. Endianness doesn't matter because we are looking
+ // for an all-ones value.
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+ if (!BVN)
+ return false;
+
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ // Endianness doesn't matter in this context because we are looking for
+ // an all-ones value.
+ if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
+ return SplatValue.isAllOnesValue();
+
+ return false;
+}
+
+// Test whether N is the bitwise inverse of OfNode.
+static bool isBitwiseInverse(SDValue N, SDValue OfNode) {
+ if (N->getOpcode() != ISD::XOR)
+ return false;
+
+ if (isVectorAllOnes(N->getOperand(0)))
+ return N->getOperand(1) == OfNode;
+
+ if (isVectorAllOnes(N->getOperand(1)))
+ return N->getOperand(0) == OfNode;
+
+ return false;
+}
+
+// Perform combines where ISD::OR is the root node.
+//
+// Performs the following transformations:
+// - (or (and $a, $mask), (and $b, $inv_mask)) => (vselect $mask, $a, $b)
+// where $inv_mask is the bitwise inverse of $mask and the 'or' has a 128-bit
+// vector type.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget *Subtarget) {
+ if (!Subtarget->hasMSA())
+ return SDValue();
+
+ EVT Ty = N->getValueType(0);
+
+ if (!Ty.is128BitVector())
+ return SDValue();
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ if (Op0->getOpcode() == ISD::AND && Op1->getOpcode() == ISD::AND) {
+ SDValue Op0Op0 = Op0->getOperand(0);
+ SDValue Op0Op1 = Op0->getOperand(1);
+ SDValue Op1Op0 = Op1->getOperand(0);
+ SDValue Op1Op1 = Op1->getOperand(1);
+ bool IsLittleEndian = !Subtarget->isLittle();
+
+ SDValue IfSet, IfClr, Cond;
+ bool IsConstantMask = false;
+ APInt Mask, InvMask;
+
+ // If Op0Op0 is an appropriate mask, try to find its inverse in either
+ // Op1Op0 or Op1Op1, keeping track of the Cond, IfSet, and IfClr nodes
+ // while looking.
+ // IfClr will be set if we find a valid match.
+ if (isVSplat(Op0Op0, Mask, IsLittleEndian)) {
+ Cond = Op0Op0;
+ IfSet = Op0Op1;
+
+ if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+ Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+ IfClr = Op1Op1;
+ else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+ Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+ IfClr = Op1Op0;
+
+ IsConstantMask = true;
+ }
+
+ // If IfClr is not yet set, and Op0Op1 is an appropriate mask, try the same
+ // thing again using this mask.
+ // IfClr will be set if we find a valid match.
+ if (!IfClr.getNode() && isVSplat(Op0Op1, Mask, IsLittleEndian)) {
+ Cond = Op0Op1;
+ IfSet = Op0Op0;
+
+ if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+ Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+ IfClr = Op1Op1;
+ else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+ Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+ IfClr = Op1Op0;
+
+ IsConstantMask = true;
+ }
+
+ // If IfClr is not yet set, try looking for a non-constant match.
+ // IfClr will be set if we find a valid match amongst the eight
+ // possibilities.
+ if (!IfClr.getNode()) {
+ if (isBitwiseInverse(Op0Op0, Op1Op0)) {
+ Cond = Op1Op0;
+ IfSet = Op1Op1;
+ IfClr = Op0Op1;
+ } else if (isBitwiseInverse(Op0Op1, Op1Op0)) {
+ Cond = Op1Op0;
+ IfSet = Op1Op1;
+ IfClr = Op0Op0;
+ } else if (isBitwiseInverse(Op0Op0, Op1Op1)) {
+ Cond = Op1Op1;
+ IfSet = Op1Op0;
+ IfClr = Op0Op1;
+ } else if (isBitwiseInverse(Op0Op1, Op1Op1)) {
+ Cond = Op1Op1;
+ IfSet = Op1Op0;
+ IfClr = Op0Op0;
+ } else if (isBitwiseInverse(Op1Op0, Op0Op0)) {
+ Cond = Op0Op0;
+ IfSet = Op0Op1;
+ IfClr = Op1Op1;
+ } else if (isBitwiseInverse(Op1Op1, Op0Op0)) {
+ Cond = Op0Op0;
+ IfSet = Op0Op1;
+ IfClr = Op1Op0;
+ } else if (isBitwiseInverse(Op1Op0, Op0Op1)) {
+ Cond = Op0Op1;
+ IfSet = Op0Op0;
+ IfClr = Op1Op1;
+ } else if (isBitwiseInverse(Op1Op1, Op0Op1)) {
+ Cond = Op0Op1;
+ IfSet = Op0Op0;
+ IfClr = Op1Op0;
+ }
+ }
+
+ // At this point, IfClr will be set if we have a valid match.
+ if (!IfClr.getNode())
+ return SDValue();
+
+ assert(Cond.getNode() && IfSet.getNode());
+
+ // Fold degenerate cases.
+ if (IsConstantMask) {
+ if (Mask.isAllOnesValue())
+ return IfSet;
+ else if (Mask == 0)
+ return IfClr;
+ }
+
+ // Transform the DAG into an equivalent VSELECT.
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfClr, IfSet);
+ }
+
+ return SDValue();
+}
+
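The rewrite in performORCombine relies on the usual bitwise-select identity; here is the scalar version as a quick sanity check (standalone C++, not tied to the LLVM APIs; MSA applies this per element):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t A = 0x12345678, B = 0x9ABCDEF0, Mask = 0x00FF00FF;

      // (or (and $a, $mask), (and $b, $inv_mask)) -- the pattern being matched.
      uint32_t OrForm = (A & Mask) | (B & ~Mask);

      // Bit-by-bit select: A where Mask is set, B where it is clear. This is
      // what (vselect $mask, $a, $b) expresses for the constant-mask case.
      uint32_t Select = 0;
      for (unsigned Bit = 0; Bit < 32; ++Bit) {
        uint32_t M = 1u << Bit;
        Select |= (Mask & M) ? (A & M) : (B & M);
      }

      printf("%s\n", OrForm == Select ? "equal" : "different"); // equal
      return 0;
    }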
static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget *Subtarget) {
@@ -396,6 +755,9 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
unsigned EltSize = Ty.getVectorElementType().getSizeInBits();
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+ if (!Subtarget->hasDSP())
+ return SDValue();
+
if (!BV ||
!BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
EltSize, !Subtarget->isLittle()) ||
@@ -418,11 +780,57 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
return performDSPShiftCombine(MipsISD::SHLL_DSP, N, Ty, DAG, Subtarget);
}
+// Fold sign-extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT for MSA and fold
+// constant splats into MipsISD::SHRA_DSP for DSPr2.
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to sign extension if its
+// sign/zero-extension is completely overwritten by the new one performed by
+// the ISD::SRA and ISD::SHL nodes.
+// - Removes redundant sign extensions performed by an ISD::SRA and ISD::SHL
+// sequence.
+//
+// See performDSPShiftCombine for more information about the transformation
+// used for DSPr2.
static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget *Subtarget) {
EVT Ty = N->getValueType(0);
+ if (Subtarget->hasMSA()) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // (sra (shl (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d), imm:$d)
+ // where $d + sizeof($c) == 32
+ // or $d + sizeof($c) <= 32 and SExt
+ // -> (MipsVExtractSExt $a, $b, $c)
+ if (Op0->getOpcode() == ISD::SHL && Op1 == Op0->getOperand(1)) {
+ SDValue Op0Op0 = Op0->getOperand(0);
+ ConstantSDNode *ShAmount = dyn_cast<ConstantSDNode>(Op1);
+
+ if (!ShAmount)
+ return SDValue();
+
+ if (Op0Op0->getOpcode() != MipsISD::VEXTRACT_SEXT_ELT &&
+ Op0Op0->getOpcode() != MipsISD::VEXTRACT_ZEXT_ELT)
+ return SDValue();
+
+ EVT ExtendTy = cast<VTSDNode>(Op0Op0->getOperand(2))->getVT();
+ unsigned TotalBits = ShAmount->getZExtValue() + ExtendTy.getSizeInBits();
+
+ if (TotalBits == 32 ||
+ (Op0Op0->getOpcode() == MipsISD::VEXTRACT_SEXT_ELT &&
+ TotalBits <= 32)) {
+ SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
+ Op0Op0->getOperand(2) };
+ DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT,
+ Op0Op0->getVTList(), Ops, Op0Op0->getNumOperands());
+ return Op0Op0;
+ }
+ }
+ }
+
if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget->hasDSPR2()))
return SDValue();
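A scalar analogue of the (sra (shl X, d), d) pattern folded above: when d plus the original element width equals the register width, the shift pair is exactly a sign extension of the extracted element. A standalone check (two's-complement arithmetic assumed):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t Elt = 0xF0;  // an i8 element with its sign bit set

      // (sra (shl X, 24), 24) with 24 + 8 == 32 ...
      int32_t ShiftPair = (int32_t)((uint32_t)Elt << 24) >> 24;

      // ... is the same as sign-extending the element directly,
      // i.e. what VEXTRACT_SEXT_ELT produces.
      int32_t SExt = (int8_t)Elt;

      printf("%d %d\n", ShiftPair, SExt); // -16 -16
      return 0;
    }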
@@ -475,17 +883,84 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
EVT Ty = N->getValueType(0);
- if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
- return SDValue();
+ if (Ty.is128BitVector() && Ty.isInteger()) {
+ // Try the following combines:
+ // (vselect (setcc $a, $b, SETLT), $b, $a) -> (vsmax $a, $b)
+ // (vselect (setcc $a, $b, SETLE), $b, $a) -> (vsmax $a, $b)
+ // (vselect (setcc $a, $b, SETLT), $a, $b) -> (vsmin $a, $b)
+ // (vselect (setcc $a, $b, SETLE), $a, $b) -> (vsmin $a, $b)
+ // (vselect (setcc $a, $b, SETULT), $b, $a) -> (vumax $a, $b)
+ // (vselect (setcc $a, $b, SETULE), $b, $a) -> (vumax $a, $b)
+ // (vselect (setcc $a, $b, SETULT), $a, $b) -> (vumin $a, $b)
+ // (vselect (setcc $a, $b, SETULE), $a, $b) -> (vumin $a, $b)
+ // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
+ // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
+ // legalizer.
+ SDValue Op0 = N->getOperand(0);
+
+ if (Op0->getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
+ bool Signed;
+
+ if (CondCode == ISD::SETLT || CondCode == ISD::SETLE)
+ Signed = true;
+ else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
+ Signed = false;
+ else
+ return SDValue();
+
+ SDValue Op1 = N->getOperand(1);
+ SDValue Op2 = N->getOperand(2);
+ SDValue Op0Op0 = Op0->getOperand(0);
+ SDValue Op0Op1 = Op0->getOperand(1);
+
+ if (Op1 == Op0Op0 && Op2 == Op0Op1)
+ return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
+ Ty, Op1, Op2);
+ else if (Op1 == Op0Op1 && Op2 == Op0Op0)
+ return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
+ Ty, Op1, Op2);
+ } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+ SDValue SetCC = N->getOperand(0);
+
+ if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
+ return SDValue();
+
+ return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
+ SetCC.getOperand(0), SetCC.getOperand(1),
+ N->getOperand(1), N->getOperand(2), SetCC.getOperand(2));
+ }
- SDValue SetCC = N->getOperand(0);
+ return SDValue();
+}
- if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
- return SDValue();
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
+ const MipsSubtarget *Subtarget) {
+ EVT Ty = N->getValueType(0);
- return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
- SetCC.getOperand(0), SetCC.getOperand(1), N->getOperand(1),
- N->getOperand(2), SetCC.getOperand(2));
+ if (Subtarget->hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
+ // Try the following combines:
+ // (xor (or $a, $b), (build_vector allones)) -> (vnor $a, $b)
+ // (xor (or $a, $b), (bitcast (build_vector allones))) -> (vnor $a, $b)
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue NotOp;
+
+ if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+ NotOp = Op1;
+ else if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+ NotOp = Op0;
+ else
+ return SDValue();
+
+ if (NotOp->getOpcode() == ISD::OR)
+ return DAG.getNode(MipsISD::VNOR, SDLoc(N), Ty, NotOp->getOperand(0),
+ NotOp->getOperand(1));
+ }
+
+ return SDValue();
}
SDValue
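Scalar versions of the two rewrites added above (the vselect min/max combine and the xor-with-allones combine), as a quick standalone sanity check; MSA applies these element-wise:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t A = -5, B = 7;

      // (vselect (setcc $a, $b, SETLT), $b, $a) -> max($a, $b)
      int32_t Max = (A < B) ? B : A;
      // (vselect (setcc $a, $b, SETLT), $a, $b) -> min($a, $b)
      int32_t Min = (A < B) ? A : B;
      printf("%d %d\n", Max, Min); // 7 -5

      // (xor (or $a, $b), all-ones) -> nor($a, $b), i.e. MipsISD::VNOR
      uint32_t X = 0x0000FFFF, Y = 0x00FF00FF;
      uint32_t Xor = (X | Y) ^ 0xFFFFFFFFu;
      uint32_t Nor = ~(X | Y);
      printf("0x%08X 0x%08X\n", Xor, Nor); // identical
      return 0;
    }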
@@ -496,6 +971,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
case ISD::ADDE:
return performADDECombine(N, DAG, DCI, Subtarget);
+ case ISD::AND:
+ Val = performANDCombine(N, DAG, DCI, Subtarget);
+ break;
+ case ISD::OR:
+ Val = performORCombine(N, DAG, DCI, Subtarget);
+ break;
case ISD::SUBE:
return performSUBECombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
@@ -508,14 +989,22 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
return performSRLCombine(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
return performVSELECTCombine(N, DAG);
- case ISD::SETCC: {
+ case ISD::XOR:
+ Val = performXORCombine(N, DAG, Subtarget);
+ break;
+ case ISD::SETCC:
Val = performSETCCCombine(N, DAG);
break;
}
- }
- if (Val.getNode())
+ if (Val.getNode()) {
+ DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+ N->printrWithDepth(dbgs(), &DAG);
+ dbgs() << "\n=> \n";
+ Val.getNode()->printrWithDepth(dbgs(), &DAG);
+ dbgs() << "\n");
return Val;
+ }
return MipsTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -528,6 +1017,42 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
case Mips::BPOSGE32_PSEUDO:
return emitBPOSGE32(MI, BB);
+ case Mips::SNZ_B_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BNZ_B);
+ case Mips::SNZ_H_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BNZ_H);
+ case Mips::SNZ_W_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BNZ_W);
+ case Mips::SNZ_D_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BNZ_D);
+ case Mips::SNZ_V_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BNZ_V);
+ case Mips::SZ_B_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BZ_B);
+ case Mips::SZ_H_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BZ_H);
+ case Mips::SZ_W_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BZ_W);
+ case Mips::SZ_D_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
+ case Mips::SZ_V_PSEUDO:
+ return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+ case Mips::COPY_FW_PSEUDO:
+ return emitCOPY_FW(MI, BB);
+ case Mips::COPY_FD_PSEUDO:
+ return emitCOPY_FD(MI, BB);
+ case Mips::INSERT_FW_PSEUDO:
+ return emitINSERT_FW(MI, BB);
+ case Mips::INSERT_FD_PSEUDO:
+ return emitINSERT_FD(MI, BB);
+ case Mips::FILL_FW_PSEUDO:
+ return emitFILL_FW(MI, BB);
+ case Mips::FILL_FD_PSEUDO:
+ return emitFILL_FD(MI, BB);
+ case Mips::FEXP2_W_1_PSEUDO:
+ return emitFEXP2_W_1(MI, BB);
+ case Mips::FEXP2_D_1_PSEUDO:
+ return emitFEXP2_D_1(MI, BB);
}
}
@@ -564,6 +1089,68 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
InternalLinkage, CLI, Callee, Chain);
}
+SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ LoadSDNode &Nd = *cast<LoadSDNode>(Op);
+
+ if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+ return MipsTargetLowering::lowerLOAD(Op, DAG);
+
+ // Replace a double-precision load with two i32 loads and a BuildPairF64.
+ SDLoc DL(Op);
+ SDValue Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+ EVT PtrVT = Ptr.getValueType();
+
+ // i32 load from lower address.
+ SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
+ MachinePointerInfo(), Nd.isVolatile(),
+ Nd.isNonTemporal(), Nd.isInvariant(),
+ Nd.getAlignment());
+
+ // i32 load from higher address.
+ Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+ SDValue Hi = DAG.getLoad(MVT::i32, DL, Lo.getValue(1), Ptr,
+ MachinePointerInfo(), Nd.isVolatile(),
+ Nd.isNonTemporal(), Nd.isInvariant(),
+ std::min(Nd.getAlignment(), 4U));
+
+ if (!Subtarget->isLittle())
+ std::swap(Lo, Hi);
+
+ SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+ SDValue Ops[2] = {BP, Hi.getValue(1)};
+ return DAG.getMergeValues(Ops, 2, DL);
+}
+
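At the value level, the -mno-ldc1-sdc1 lowering above reassembles the 64-bit bit pattern of the double from two 32-bit words, swapping the pair for big-endian targets. A purely illustrative standalone sketch of that reassembly, assuming a little-endian host (which is why no swap appears here):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      double D = 3.141592653589793;

      // The two i32 "loads": word at the lower address, then the word 4 bytes up.
      uint32_t Words[2];
      memcpy(Words, &D, sizeof(D));

      // Little-endian host: Words[0] is the low half of the bit pattern,
      // Words[1] the high half. The code above swaps Lo and Hi when the
      // target is big-endian before building the pair.
      uint64_t Bits = ((uint64_t)Words[1] << 32) | Words[0];

      double Rebuilt;                          // the BuildPairF64 analogue
      memcpy(&Rebuilt, &Bits, sizeof(Rebuilt));
      printf("%.15f\n", Rebuilt);              // 3.141592653589793
      return 0;
    }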
+SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode &Nd = *cast<StoreSDNode>(Op);
+
+ if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+ return MipsTargetLowering::lowerSTORE(Op, DAG);
+
+ // Replace a double-precision store with two ExtractElementF64 nodes and two i32 stores.
+ SDLoc DL(Op);
+ SDValue Val = Nd.getValue(), Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+ EVT PtrVT = Ptr.getValueType();
+ SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+ Val, DAG.getConstant(0, MVT::i32));
+ SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+ Val, DAG.getConstant(1, MVT::i32));
+
+ if (!Subtarget->isLittle())
+ std::swap(Lo, Hi);
+
+ // i32 store to lower address.
+ Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(),
+ Nd.isVolatile(), Nd.isNonTemporal(), Nd.getAlignment(),
+ Nd.getTBAAInfo());
+
+ // i32 store to higher address.
+ Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+ return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
+ Nd.isVolatile(), Nd.isNonTemporal(),
+ std::min(Nd.getAlignment(), 4U), Nd.getTBAAInfo());
+}
+
SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
bool HasLo, bool HasHi,
SelectionDAG &DAG) const {
@@ -574,11 +1161,9 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
SDValue Lo, Hi;
if (HasLo)
- Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
- DAG.getConstant(Mips::sub_lo, MVT::i32));
+ Lo = DAG.getNode(MipsISD::MFLO, DL, Ty, Mult);
if (HasHi)
- Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
- DAG.getConstant(Mips::sub_hi, MVT::i32));
+ Hi = DAG.getNode(MipsISD::MFHI, DL, Ty, Mult);
if (!HasLo || !HasHi)
return HasLo ? Lo : Hi;
@@ -593,14 +1178,12 @@ static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
DAG.getConstant(0, MVT::i32));
SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
DAG.getConstant(1, MVT::i32));
- return DAG.getNode(MipsISD::InsertLOHI, DL, MVT::Untyped, InLo, InHi);
+ return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, InLo, InHi);
}
static SDValue extractLOHI(SDValue Op, SDLoc DL, SelectionDAG &DAG) {
- SDValue Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
- DAG.getConstant(Mips::sub_lo, MVT::i32));
- SDValue Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
- DAG.getConstant(Mips::sub_hi, MVT::i32));
+ SDValue Lo = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
+ SDValue Hi = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
}
@@ -664,8 +1247,156 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
return DAG.getMergeValues(Vals, 2, DL);
}
+// Lower an MSA copy intrinsic into the specified SelectionDAG node
+static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+ SDLoc DL(Op);
+ SDValue Vec = Op->getOperand(1);
+ SDValue Idx = Op->getOperand(2);
+ EVT ResTy = Op->getValueType(0);
+ EVT EltTy = Vec->getValueType(0).getVectorElementType();
+
+ SDValue Result = DAG.getNode(Opc, DL, ResTy, Vec, Idx,
+ DAG.getValueType(EltTy));
+
+ return Result;
+}
+
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+ EVT ResVecTy = Op->getValueType(0);
+ EVT ViaVecTy = ResVecTy;
+ SDLoc DL(Op);
+
+ // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+ // LaneB is the lower 32 bits. Otherwise LaneA and LaneB are alternating
+ // lanes.
+ SDValue LaneA;
+ SDValue LaneB = Op->getOperand(2);
+
+ if (ResVecTy == MVT::v2i64) {
+ LaneA = DAG.getConstant(0, MVT::i32);
+ ViaVecTy = MVT::v4i32;
+ } else
+ LaneA = LaneB;
+
+ SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+ LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
+
+ SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+ ViaVecTy.getVectorNumElements());
+
+ if (ViaVecTy != ResVecTy)
+ Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+
+ return Result;
+}
+
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+ return DAG.getConstant(Op->getConstantOperandVal(ImmOp), Op->getValueType(0));
+}
+
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+ bool BigEndian, SelectionDAG &DAG) {
+ EVT ViaVecTy = VecTy;
+ SDValue SplatValueA = SplatValue;
+ SDValue SplatValueB = SplatValue;
+ SDLoc DL(SplatValue);
+
+ if (VecTy == MVT::v2i64) {
+ // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+ ViaVecTy = MVT::v4i32;
+
+ SplatValueA = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+ SplatValueB = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+ DAG.getConstant(32, MVT::i32));
+ SplatValueB = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValueB);
+ }
+
+ // We currently hold the parts in little endian order. Swap them if
+ // necessary.
+ if (BigEndian)
+ std::swap(SplatValueA, SplatValueB);
+
+ SDValue Ops[16] = { SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+ SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+ SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+ SplatValueA, SplatValueB, SplatValueA, SplatValueB };
+
+ SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+ ViaVecTy.getVectorNumElements());
+
+ if (VecTy != ViaVecTy)
+ Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
+
+ return Result;
+}
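
For the v2i64 case above, the splat value is cut into two 32-bit halves (SplatValueA/SplatValueB), swapped on big-endian targets, and replicated as an {A, B} word pattern for the v4i32 BUILD_VECTOR. A minimal standalone sketch of that splitting, for illustration only (not patch code; splatWords64 is a made-up name):

#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<uint32_t, 4> splatWords64(uint64_t Value, bool BigEndian) {
  uint32_t A = static_cast<uint32_t>(Value);        // low half (SplatValueA)
  uint32_t B = static_cast<uint32_t>(Value >> 32);  // high half (SplatValueB)
  if (BigEndian)
    std::swap(A, B);   // parts are held in little-endian order first
  return {A, B, A, B}; // repeating {A, B} word pattern for the via-v4i32 vector
}

int main() {
  for (uint32_t W : splatWords64(0x1122334455667788ULL, /*BigEndian=*/false))
    std::printf("0x%08x ", W); // 0x55667788 0x11223344 0x55667788 0x11223344
  std::printf("\n");
  return 0;
}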
+
+static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
+ unsigned Opc, SDValue Imm,
+ bool BigEndian) {
+ EVT VecTy = Op->getValueType(0);
+ SDValue Exp2Imm;
+ SDLoc DL(Op);
+
+ // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+ // here for now.
+ if (VecTy == MVT::v2i64) {
+ if (ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(Imm)) {
+ APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
+
+ SDValue BitImmHiOp = DAG.getConstant(BitImm.lshr(32).trunc(32), MVT::i32);
+ SDValue BitImmLoOp = DAG.getConstant(BitImm.trunc(32), MVT::i32);
+
+ if (BigEndian)
+ std::swap(BitImmLoOp, BitImmHiOp);
+
+ Exp2Imm =
+ DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, BitImmLoOp,
+ BitImmHiOp, BitImmLoOp, BitImmHiOp));
+ }
+ }
+
+ if (Exp2Imm.getNode() == NULL) {
+ // We couldn't constant fold, so do a vector shift instead.
+
+ // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+ // only values 0-63 are valid.
+ if (VecTy == MVT::v2i64)
+ Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+ Exp2Imm = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+
+ Exp2Imm =
+ DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, VecTy), Exp2Imm);
+ }
+
+ return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
+}
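
The value combined with operand 1 by these bit-immediate intrinsics is 2^imm per element: either constant-folded (the v2i64 APInt path above) or rebuilt in the DAG as splat(1) << splat(imm). A scalar sketch of that value, for illustration only (exp2Imm is a made-up helper, not patch code):

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t exp2Imm(uint64_t Imm) {
  assert(Imm < 64 && "only shift amounts 0-63 are meaningful");
  return uint64_t(1) << Imm; // same value as APInt(64, 1) << CImm above
}

int main() {
  // The non-constant fallback builds the same thing as splat(1) << splat(Imm).
  std::printf("0x%016llx\n", (unsigned long long)exp2Imm(35)); // 0x0000000800000000
  return 0;
}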
+
+static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
+ EVT ResTy = Op->getValueType(0);
+ SDLoc DL(Op);
+ SDValue One = DAG.getConstant(1, ResTy);
+ SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
+
+ return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1),
+ DAG.getNOT(DL, Bit, ResTy));
+}
+
+static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT ResTy = Op->getValueType(0);
+ APInt BitImm = APInt(ResTy.getVectorElementType().getSizeInBits(), 1)
+ << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+ SDValue BitMask = DAG.getConstant(~BitImm, ResTy);
+
+ return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
+}
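
The bclr/bclri lowering above is, per element, an AND with the inverted single-bit mask. A scalar model of that operation (a sketch for illustration, not patch code):

#include <cstdint>
#include <cstdio>

static uint32_t bitClear(uint32_t Val, unsigned Bit) {
  return Val & ~(uint32_t(1) << Bit); // AND with NOT(1 << bit)
}

int main() {
  std::printf("0x%08x\n", bitClear(0xffffffffu, 4)); // 0xffffffef
  return 0;
}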
+
SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
default:
return SDValue();
@@ -701,12 +1432,610 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerDSPIntr(Op, DAG, MipsISD::MSub);
case Intrinsic::mips_msubu:
return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+ case Intrinsic::mips_addv_b:
+ case Intrinsic::mips_addv_h:
+ case Intrinsic::mips_addv_w:
+ case Intrinsic::mips_addv_d:
+ return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_addvi_b:
+ case Intrinsic::mips_addvi_h:
+ case Intrinsic::mips_addvi_w:
+ case Intrinsic::mips_addvi_d:
+ return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_and_v:
+ return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_andi_b:
+ return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_bclr_b:
+ case Intrinsic::mips_bclr_h:
+ case Intrinsic::mips_bclr_w:
+ case Intrinsic::mips_bclr_d:
+ return lowerMSABitClear(Op, DAG);
+ case Intrinsic::mips_bclri_b:
+ case Intrinsic::mips_bclri_h:
+ case Intrinsic::mips_bclri_w:
+ case Intrinsic::mips_bclri_d:
+ return lowerMSABitClearImm(Op, DAG);
+ case Intrinsic::mips_binsli_b:
+ case Intrinsic::mips_binsli_h:
+ case Intrinsic::mips_binsli_w:
+ case Intrinsic::mips_binsli_d: {
+ EVT VecTy = Op->getValueType(0);
+ EVT EltTy = VecTy.getVectorElementType();
+ APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
+ Op->getConstantOperandVal(3));
+ return DAG.getNode(ISD::VSELECT, DL, VecTy,
+ DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_binsri_b:
+ case Intrinsic::mips_binsri_h:
+ case Intrinsic::mips_binsri_w:
+ case Intrinsic::mips_binsri_d: {
+ EVT VecTy = Op->getValueType(0);
+ EVT EltTy = VecTy.getVectorElementType();
+ APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
+ Op->getConstantOperandVal(3));
+ return DAG.getNode(ISD::VSELECT, DL, VecTy,
+ DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
+ Op->getOperand(2));
+ }
+ case Intrinsic::mips_bmnz_v:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+ Op->getOperand(2), Op->getOperand(1));
+ case Intrinsic::mips_bmnzi_b:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ lowerMSASplatImm(Op, 3, DAG), Op->getOperand(2),
+ Op->getOperand(1));
+ case Intrinsic::mips_bmz_v:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_bmzi_b:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ lowerMSASplatImm(Op, 3, DAG), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_bneg_b:
+ case Intrinsic::mips_bneg_h:
+ case Intrinsic::mips_bneg_w:
+ case Intrinsic::mips_bneg_d: {
+ EVT VecTy = Op->getValueType(0);
+ SDValue One = DAG.getConstant(1, VecTy);
+
+ return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
+ DAG.getNode(ISD::SHL, DL, VecTy, One,
+ Op->getOperand(2)));
+ }
+ case Intrinsic::mips_bnegi_b:
+ case Intrinsic::mips_bnegi_h:
+ case Intrinsic::mips_bnegi_w:
+ case Intrinsic::mips_bnegi_d:
+ return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
+ !Subtarget->isLittle());
+ case Intrinsic::mips_bnz_b:
+ case Intrinsic::mips_bnz_h:
+ case Intrinsic::mips_bnz_w:
+ case Intrinsic::mips_bnz_d:
+ return DAG.getNode(MipsISD::VALL_NONZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_bnz_v:
+ return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_bsel_v:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2),
+ Op->getOperand(3));
+ case Intrinsic::mips_bseli_b:
+ return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2),
+ lowerMSASplatImm(Op, 3, DAG));
+ case Intrinsic::mips_bset_b:
+ case Intrinsic::mips_bset_h:
+ case Intrinsic::mips_bset_w:
+ case Intrinsic::mips_bset_d: {
+ EVT VecTy = Op->getValueType(0);
+ SDValue One = DAG.getConstant(1, VecTy);
+
+ return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
+ DAG.getNode(ISD::SHL, DL, VecTy, One,
+ Op->getOperand(2)));
+ }
+ case Intrinsic::mips_bseti_b:
+ case Intrinsic::mips_bseti_h:
+ case Intrinsic::mips_bseti_w:
+ case Intrinsic::mips_bseti_d:
+ return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
+ !Subtarget->isLittle());
+ case Intrinsic::mips_bz_b:
+ case Intrinsic::mips_bz_h:
+ case Intrinsic::mips_bz_w:
+ case Intrinsic::mips_bz_d:
+ return DAG.getNode(MipsISD::VALL_ZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_bz_v:
+ return DAG.getNode(MipsISD::VANY_ZERO, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ceq_b:
+ case Intrinsic::mips_ceq_h:
+ case Intrinsic::mips_ceq_w:
+ case Intrinsic::mips_ceq_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETEQ);
+ case Intrinsic::mips_ceqi_b:
+ case Intrinsic::mips_ceqi_h:
+ case Intrinsic::mips_ceqi_w:
+ case Intrinsic::mips_ceqi_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+ case Intrinsic::mips_cle_s_b:
+ case Intrinsic::mips_cle_s_h:
+ case Intrinsic::mips_cle_s_w:
+ case Intrinsic::mips_cle_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETLE);
+ case Intrinsic::mips_clei_s_b:
+ case Intrinsic::mips_clei_s_h:
+ case Intrinsic::mips_clei_s_w:
+ case Intrinsic::mips_clei_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+ case Intrinsic::mips_cle_u_b:
+ case Intrinsic::mips_cle_u_h:
+ case Intrinsic::mips_cle_u_w:
+ case Intrinsic::mips_cle_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULE);
+ case Intrinsic::mips_clei_u_b:
+ case Intrinsic::mips_clei_u_h:
+ case Intrinsic::mips_clei_u_w:
+ case Intrinsic::mips_clei_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETULE);
+ case Intrinsic::mips_clt_s_b:
+ case Intrinsic::mips_clt_s_h:
+ case Intrinsic::mips_clt_s_w:
+ case Intrinsic::mips_clt_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETLT);
+ case Intrinsic::mips_clti_s_b:
+ case Intrinsic::mips_clti_s_h:
+ case Intrinsic::mips_clti_s_w:
+ case Intrinsic::mips_clti_s_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+ case Intrinsic::mips_clt_u_b:
+ case Intrinsic::mips_clt_u_h:
+ case Intrinsic::mips_clt_u_w:
+ case Intrinsic::mips_clt_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULT);
+ case Intrinsic::mips_clti_u_b:
+ case Intrinsic::mips_clti_u_h:
+ case Intrinsic::mips_clti_u_w:
+ case Intrinsic::mips_clti_u_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG), ISD::SETULT);
+ case Intrinsic::mips_copy_s_b:
+ case Intrinsic::mips_copy_s_h:
+ case Intrinsic::mips_copy_s_w:
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+ case Intrinsic::mips_copy_s_d:
+ // Don't lower directly into VEXTRACT_SEXT_ELT since i64 might be illegal.
+ // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
+ // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_copy_u_b:
+ case Intrinsic::mips_copy_u_h:
+ case Intrinsic::mips_copy_u_w:
+ return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+ case Intrinsic::mips_copy_u_d:
+ // Don't lower directly into VEXTRACT_ZEXT_ELT since i64 might be illegal.
+ // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
+ // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+ //
+ // Note: When i64 is illegal, this results in copy_s.w instructions instead
+ // of copy_u.w instructions. This makes no difference to the behaviour
+ // since i64 is only illegal when the register file is 32-bit.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_div_s_b:
+ case Intrinsic::mips_div_s_h:
+ case Intrinsic::mips_div_s_w:
+ case Intrinsic::mips_div_s_d:
+ return DAG.getNode(ISD::SDIV, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_div_u_b:
+ case Intrinsic::mips_div_u_h:
+ case Intrinsic::mips_div_u_w:
+ case Intrinsic::mips_div_u_d:
+ return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_fadd_w:
+ case Intrinsic::mips_fadd_d:
+ return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
+ case Intrinsic::mips_fceq_w:
+ case Intrinsic::mips_fceq_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETOEQ);
+ case Intrinsic::mips_fcle_w:
+ case Intrinsic::mips_fcle_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETOLE);
+ case Intrinsic::mips_fclt_w:
+ case Intrinsic::mips_fclt_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETOLT);
+ case Intrinsic::mips_fcne_w:
+ case Intrinsic::mips_fcne_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETONE);
+ case Intrinsic::mips_fcor_w:
+ case Intrinsic::mips_fcor_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETO);
+ case Intrinsic::mips_fcueq_w:
+ case Intrinsic::mips_fcueq_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETUEQ);
+ case Intrinsic::mips_fcule_w:
+ case Intrinsic::mips_fcule_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULE);
+ case Intrinsic::mips_fcult_w:
+ case Intrinsic::mips_fcult_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETULT);
+ case Intrinsic::mips_fcun_w:
+ case Intrinsic::mips_fcun_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETUO);
+ case Intrinsic::mips_fcune_w:
+ case Intrinsic::mips_fcune_d:
+ return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2), ISD::SETUNE);
+ case Intrinsic::mips_fdiv_w:
+ case Intrinsic::mips_fdiv_d:
+ return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_ffint_u_w:
+ case Intrinsic::mips_ffint_u_d:
+ return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ffint_s_w:
+ case Intrinsic::mips_ffint_s_d:
+ return DAG.getNode(ISD::SINT_TO_FP, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_fill_b:
+ case Intrinsic::mips_fill_h:
+ case Intrinsic::mips_fill_w:
+ case Intrinsic::mips_fill_d: {
+ SmallVector<SDValue, 16> Ops;
+ EVT ResTy = Op->getValueType(0);
+
+ for (unsigned i = 0; i < ResTy.getVectorNumElements(); ++i)
+ Ops.push_back(Op->getOperand(1));
+
+ // If ResTy is v2i64 then the type legalizer will break this node down into
+ // an equivalent v4i32.
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, &Ops[0], Ops.size());
+ }
+ case Intrinsic::mips_fexp2_w:
+ case Intrinsic::mips_fexp2_d: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(
+ ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::FEXP2, SDLoc(Op), ResTy, Op->getOperand(2)));
+ }
+ case Intrinsic::mips_flog2_w:
+ case Intrinsic::mips_flog2_d:
+ return DAG.getNode(ISD::FLOG2, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_fmadd_w:
+ case Intrinsic::mips_fmadd_d:
+ return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+ case Intrinsic::mips_fmul_w:
+ case Intrinsic::mips_fmul_d:
+ return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_fmsub_w:
+ case Intrinsic::mips_fmsub_d: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_frint_w:
+ case Intrinsic::mips_frint_d:
+ return DAG.getNode(ISD::FRINT, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_fsqrt_w:
+ case Intrinsic::mips_fsqrt_d:
+ return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_fsub_w:
+ case Intrinsic::mips_fsub_d:
+ return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_ftrunc_u_w:
+ case Intrinsic::mips_ftrunc_u_d:
+ return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ftrunc_s_w:
+ case Intrinsic::mips_ftrunc_s_d:
+ return DAG.getNode(ISD::FP_TO_SINT, DL, Op->getValueType(0),
+ Op->getOperand(1));
+ case Intrinsic::mips_ilvev_b:
+ case Intrinsic::mips_ilvev_h:
+ case Intrinsic::mips_ilvev_w:
+ case Intrinsic::mips_ilvev_d:
+ return DAG.getNode(MipsISD::ILVEV, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_ilvl_b:
+ case Intrinsic::mips_ilvl_h:
+ case Intrinsic::mips_ilvl_w:
+ case Intrinsic::mips_ilvl_d:
+ return DAG.getNode(MipsISD::ILVL, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_ilvod_b:
+ case Intrinsic::mips_ilvod_h:
+ case Intrinsic::mips_ilvod_w:
+ case Intrinsic::mips_ilvod_d:
+ return DAG.getNode(MipsISD::ILVOD, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_ilvr_b:
+ case Intrinsic::mips_ilvr_h:
+ case Intrinsic::mips_ilvr_w:
+ case Intrinsic::mips_ilvr_d:
+ return DAG.getNode(MipsISD::ILVR, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_insert_b:
+ case Intrinsic::mips_insert_h:
+ case Intrinsic::mips_insert_w:
+ case Intrinsic::mips_insert_d:
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+ case Intrinsic::mips_ldi_b:
+ case Intrinsic::mips_ldi_h:
+ case Intrinsic::mips_ldi_w:
+ case Intrinsic::mips_ldi_d:
+ return lowerMSASplatImm(Op, 1, DAG);
+ case Intrinsic::mips_lsa: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_maddv_b:
+ case Intrinsic::mips_maddv_h:
+ case Intrinsic::mips_maddv_w:
+ case Intrinsic::mips_maddv_d: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_max_s_b:
+ case Intrinsic::mips_max_s_h:
+ case Intrinsic::mips_max_s_w:
+ case Intrinsic::mips_max_s_d:
+ return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_max_u_b:
+ case Intrinsic::mips_max_u_h:
+ case Intrinsic::mips_max_u_w:
+ case Intrinsic::mips_max_u_d:
+ return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_maxi_s_b:
+ case Intrinsic::mips_maxi_s_h:
+ case Intrinsic::mips_maxi_s_w:
+ case Intrinsic::mips_maxi_s_d:
+ return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_maxi_u_b:
+ case Intrinsic::mips_maxi_u_h:
+ case Intrinsic::mips_maxi_u_w:
+ case Intrinsic::mips_maxi_u_d:
+ return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_min_s_b:
+ case Intrinsic::mips_min_s_h:
+ case Intrinsic::mips_min_s_w:
+ case Intrinsic::mips_min_s_d:
+ return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_min_u_b:
+ case Intrinsic::mips_min_u_h:
+ case Intrinsic::mips_min_u_w:
+ case Intrinsic::mips_min_u_d:
+ return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_mini_s_b:
+ case Intrinsic::mips_mini_s_h:
+ case Intrinsic::mips_mini_s_w:
+ case Intrinsic::mips_mini_s_d:
+ return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_mini_u_b:
+ case Intrinsic::mips_mini_u_h:
+ case Intrinsic::mips_mini_u_w:
+ case Intrinsic::mips_mini_u_d:
+ return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_mod_s_b:
+ case Intrinsic::mips_mod_s_h:
+ case Intrinsic::mips_mod_s_w:
+ case Intrinsic::mips_mod_s_d:
+ return DAG.getNode(ISD::SREM, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_mod_u_b:
+ case Intrinsic::mips_mod_u_h:
+ case Intrinsic::mips_mod_u_w:
+ case Intrinsic::mips_mod_u_d:
+ return DAG.getNode(ISD::UREM, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_mulv_b:
+ case Intrinsic::mips_mulv_h:
+ case Intrinsic::mips_mulv_w:
+ case Intrinsic::mips_mulv_d:
+ return DAG.getNode(ISD::MUL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_msubv_b:
+ case Intrinsic::mips_msubv_h:
+ case Intrinsic::mips_msubv_w:
+ case Intrinsic::mips_msubv_d: {
+ EVT ResTy = Op->getValueType(0);
+ return DAG.getNode(ISD::SUB, SDLoc(Op), ResTy, Op->getOperand(1),
+ DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+ Op->getOperand(2), Op->getOperand(3)));
+ }
+ case Intrinsic::mips_nlzc_b:
+ case Intrinsic::mips_nlzc_h:
+ case Intrinsic::mips_nlzc_w:
+ case Intrinsic::mips_nlzc_d:
+ return DAG.getNode(ISD::CTLZ, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_nor_v: {
+ SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ return DAG.getNOT(DL, Res, Res->getValueType(0));
+ }
+ case Intrinsic::mips_nori_b: {
+ SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+ Op->getOperand(1),
+ lowerMSASplatImm(Op, 2, DAG));
+ return DAG.getNOT(DL, Res, Res->getValueType(0));
+ }
+ case Intrinsic::mips_or_v:
+ return DAG.getNode(ISD::OR, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_ori_b:
+ return DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_pckev_b:
+ case Intrinsic::mips_pckev_h:
+ case Intrinsic::mips_pckev_w:
+ case Intrinsic::mips_pckev_d:
+ return DAG.getNode(MipsISD::PCKEV, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_pckod_b:
+ case Intrinsic::mips_pckod_h:
+ case Intrinsic::mips_pckod_w:
+ case Intrinsic::mips_pckod_d:
+ return DAG.getNode(MipsISD::PCKOD, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2));
+ case Intrinsic::mips_pcnt_b:
+ case Intrinsic::mips_pcnt_h:
+ case Intrinsic::mips_pcnt_w:
+ case Intrinsic::mips_pcnt_d:
+ return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+ case Intrinsic::mips_shf_b:
+ case Intrinsic::mips_shf_h:
+ case Intrinsic::mips_shf_w:
+ return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
+ Op->getOperand(2), Op->getOperand(1));
+ case Intrinsic::mips_sll_b:
+ case Intrinsic::mips_sll_h:
+ case Intrinsic::mips_sll_w:
+ case Intrinsic::mips_sll_d:
+ return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_slli_b:
+ case Intrinsic::mips_slli_h:
+ case Intrinsic::mips_slli_w:
+ case Intrinsic::mips_slli_d:
+ return DAG.getNode(ISD::SHL, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_splat_b:
+ case Intrinsic::mips_splat_h:
+ case Intrinsic::mips_splat_w:
+ case Intrinsic::mips_splat_d:
+ // We can't lower via VECTOR_SHUFFLE because it requires constant shuffle
+ // masks, nor can we lower via BUILD_VECTOR & EXTRACT_VECTOR_ELT because
+ // EXTRACT_VECTOR_ELT can't extract i64s on MIPS32.
+ // Instead we lower to MipsISD::VSHF and match from there.
+ return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+ lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
+ Op->getOperand(1));
+ case Intrinsic::mips_splati_b:
+ case Intrinsic::mips_splati_h:
+ case Intrinsic::mips_splati_w:
+ case Intrinsic::mips_splati_d:
+ return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+ lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+ Op->getOperand(1));
+ case Intrinsic::mips_sra_b:
+ case Intrinsic::mips_sra_h:
+ case Intrinsic::mips_sra_w:
+ case Intrinsic::mips_sra_d:
+ return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_srai_b:
+ case Intrinsic::mips_srai_h:
+ case Intrinsic::mips_srai_w:
+ case Intrinsic::mips_srai_d:
+ return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_srl_b:
+ case Intrinsic::mips_srl_h:
+ case Intrinsic::mips_srl_w:
+ case Intrinsic::mips_srl_d:
+ return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_srli_b:
+ case Intrinsic::mips_srli_h:
+ case Intrinsic::mips_srli_w:
+ case Intrinsic::mips_srli_d:
+ return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_subv_b:
+ case Intrinsic::mips_subv_h:
+ case Intrinsic::mips_subv_w:
+ case Intrinsic::mips_subv_d:
+ return DAG.getNode(ISD::SUB, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_subvi_b:
+ case Intrinsic::mips_subvi_h:
+ case Intrinsic::mips_subvi_w:
+ case Intrinsic::mips_subvi_d:
+ return DAG.getNode(ISD::SUB, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+ case Intrinsic::mips_vshf_b:
+ case Intrinsic::mips_vshf_h:
+ case Intrinsic::mips_vshf_w:
+ case Intrinsic::mips_vshf_d:
+ return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+ Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+ case Intrinsic::mips_xor_v:
+ return DAG.getNode(ISD::XOR, DL, Op->getValueType(0), Op->getOperand(1),
+ Op->getOperand(2));
+ case Intrinsic::mips_xori_b:
+ return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
+ Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
}
}
+static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+ SDLoc DL(Op);
+ SDValue ChainIn = Op->getOperand(0);
+ SDValue Address = Op->getOperand(2);
+ SDValue Offset = Op->getOperand(3);
+ EVT ResTy = Op->getValueType(0);
+ EVT PtrTy = Address->getValueType(0);
+
+ Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+ return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), false,
+ false, false, 16);
+}
+
SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
- switch (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue()) {
+ unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+ switch (Intr) {
default:
return SDValue();
case Intrinsic::mips_extp:
@@ -749,7 +2078,522 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH);
case Intrinsic::mips_dpsqx_sa_w_ph:
return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH);
+ case Intrinsic::mips_ld_b:
+ case Intrinsic::mips_ld_h:
+ case Intrinsic::mips_ld_w:
+ case Intrinsic::mips_ld_d:
+ return lowerMSALoadIntr(Op, DAG, Intr);
+ }
+}
+
+static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+ SDLoc DL(Op);
+ SDValue ChainIn = Op->getOperand(0);
+ SDValue Value = Op->getOperand(2);
+ SDValue Address = Op->getOperand(3);
+ SDValue Offset = Op->getOperand(4);
+ EVT PtrTy = Address->getValueType(0);
+
+ Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+ return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), false,
+ false, 16);
+}
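
Per the two helpers above, the ld.[bhwd] and st.[bhwd] intrinsics become a plain 16-byte load or store at (pointer + byte offset); the DAG nodes additionally carry 16-byte alignment. Roughly, in scalar terms (a sketch only, not patch code; names are made up):

#include <cstddef>
#include <cstdint>
#include <cstring>

struct V128 { uint8_t Bytes[16]; };

// ld.[bhwd]: one 16-byte load from (base + byte offset).
static V128 msaLoad(const void *Base, std::ptrdiff_t Offset) {
  V128 V;
  std::memcpy(&V, static_cast<const uint8_t *>(Base) + Offset, sizeof(V));
  return V;
}

// st.[bhwd]: one 16-byte store to (base + byte offset).
static void msaStore(const V128 &V, void *Base, std::ptrdiff_t Offset) {
  std::memcpy(static_cast<uint8_t *>(Base) + Offset, &V, sizeof(V));
}

int main() {
  uint8_t Buf[32] = {};
  V128 V = msaLoad(Buf, 16);
  msaStore(V, Buf, 0);
  return 0;
}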
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+ switch (Intr) {
+ default:
+ return SDValue();
+ case Intrinsic::mips_st_b:
+ case Intrinsic::mips_st_h:
+ case Intrinsic::mips_st_w:
+ case Intrinsic::mips_st_d:
+ return lowerMSAStoreIntr(Op, DAG, Intr);
+ }
+}
+
+/// \brief Check if the given BuildVectorSDNode is a splat.
+/// This method currently relies on DAG nodes being reused when equivalent,
+/// so it's possible for this to return false even when isConstantSplat returns
+/// true.
+static bool isSplatVector(const BuildVectorSDNode *N) {
+ unsigned int nOps = N->getNumOperands();
+ assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector");
+
+ SDValue Operand0 = N->getOperand(0);
+
+ for (unsigned int i = 1; i < nOps; ++i) {
+ if (N->getOperand(i) != Operand0)
+ return false;
+ }
+
+ return true;
+}
+
+// Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
+//
+// The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We
+// choose to sign-extend but we could have equally chosen zero-extend. The
+// DAGCombiner will fold any sign/zero extension of the ISD::EXTRACT_VECTOR_ELT
+// result into this node later (possibly changing it to a zero-extend in the
+// process).
+SDValue MipsSETargetLowering::
+lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT ResTy = Op->getValueType(0);
+ SDValue Op0 = Op->getOperand(0);
+ EVT VecTy = Op0->getValueType(0);
+
+ if (!VecTy.is128BitVector())
+ return SDValue();
+
+ if (ResTy.isInteger()) {
+ SDValue Op1 = Op->getOperand(1);
+ EVT EltTy = VecTy.getVectorElementType();
+ return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
+ DAG.getValueType(EltTy));
+ }
+
+ return Op;
+}
+
+static bool isConstantOrUndef(const SDValue Op) {
+ if (Op->getOpcode() == ISD::UNDEF)
+ return true;
+ if (dyn_cast<ConstantSDNode>(Op))
+ return true;
+ if (dyn_cast<ConstantFPSDNode>(Op))
+ return true;
+ return false;
+}
+
+static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+ for (unsigned i = 0; i < Op->getNumOperands(); ++i)
+ if (isConstantOrUndef(Op->getOperand(i)))
+ return true;
+ return false;
+}
+
+// Lowers ISD::BUILD_VECTOR into appropriate SelectionDAG nodes for the
+// backend.
+//
+// Lowers according to the following rules:
+// - Constant splats are legal as-is as long as the SplatBitSize is a power of
+// 2 less than or equal to 64 and the value fits into a signed 10-bit
+// immediate
+// - Constant splats are lowered to bitconverted BUILD_VECTORs if SplatBitSize
+// is a power of 2 less than or equal to 64 and the value does not fit into a
+// signed 10-bit immediate
+// - Non-constant splats are legal as-is.
+// - Non-constant non-splats are lowered to sequences of INSERT_VECTOR_ELT.
+// - All others are illegal and must be expanded.
+SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+ EVT ResTy = Op->getValueType(0);
+ SDLoc DL(Op);
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ if (!Subtarget->hasMSA() || !ResTy.is128BitVector())
+ return SDValue();
+
+ if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+ HasAnyUndefs, 8,
+ !Subtarget->isLittle()) && SplatBitSize <= 64) {
+ // We can only cope with 8, 16, 32, or 64-bit elements
+ if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 &&
+ SplatBitSize != 64)
+ return SDValue();
+
+ // If the value fits into a simm10 then we can use ldi.[bhwd]
+ // However, if it isn't an integer type we will have to bitcast from an
+ // integer type first. Also, if there are any undefs, we must lower them
+ // to defined values first.
+ if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+ return Op;
+
+ EVT ViaVecTy;
+
+ switch (SplatBitSize) {
+ default:
+ return SDValue();
+ case 8:
+ ViaVecTy = MVT::v16i8;
+ break;
+ case 16:
+ ViaVecTy = MVT::v8i16;
+ break;
+ case 32:
+ ViaVecTy = MVT::v4i32;
+ break;
+ case 64:
+ // There's no fill.d to fall back on for 64-bit values
+ return SDValue();
+ }
+
+ // SelectionDAG::getConstant will promote SplatValue appropriately.
+ SDValue Result = DAG.getConstant(SplatValue, ViaVecTy);
+
+ // Bitcast to the type we originally wanted
+ if (ViaVecTy != ResTy)
+ Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
+
+ return Result;
+ } else if (isSplatVector(Node))
+ return Op;
+ else if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+ // Use INSERT_VECTOR_ELT operations rather than expand to stores.
+ // The resulting code is the same length as the expansion, but it doesn't
+ // use memory operations.
+ EVT ResTy = Node->getValueType(0);
+
+ assert(ResTy.isVector());
+
+ unsigned NumElts = ResTy.getVectorNumElements();
+ SDValue Vector = DAG.getUNDEF(ResTy);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
+ Node->getOperand(i),
+ DAG.getConstant(i, MVT::i32));
+ }
+ return Vector;
+ }
+
+ return SDValue();
+}
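
The "fits into a signed 10-bit immediate" test above decides whether a constant splat can remain legal for ldi.[bhwd]. A tiny sketch of that range check, for illustration only (fitsSimm10 is a made-up name mirroring APInt::isSignedIntN(10)):

#include <cstdint>
#include <cstdio>

static bool fitsSimm10(int64_t V) {
  return V >= -512 && V <= 511; // signed 10-bit range
}

int main() {
  std::printf("%d %d %d\n", fitsSimm10(511), fitsSimm10(-512), fitsSimm10(512)); // 1 1 0
  return 0;
}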
+
+// Lower VECTOR_SHUFFLE into SHF (if possible).
+//
+// SHF splits the vector into blocks of four elements, then shuffles these
+// elements according to a <4 x i2> constant (encoded as an integer immediate).
+//
+// It is therefore possible to lower into SHF when the mask takes the form:
+// <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+// When undefs appear they are treated as if they were whatever value is
+// necessary in order to fit the above form.
+//
+// For example:
+// %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+// <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+// i32 7, i32 6, i32 5, i32 4>
+// is lowered to:
+// (SHF_H $w0, $w1, 27)
+// where the 27 comes from:
+// 3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ int SHFIndices[4] = { -1, -1, -1, -1 };
+
+ if (Indices.size() < 4)
+ return SDValue();
+
+ for (unsigned i = 0; i < 4; ++i) {
+ for (unsigned j = i; j < Indices.size(); j += 4) {
+ int Idx = Indices[j];
+
+ // Convert from vector index to 4-element subvector index
+ // If an index refers to an element outside of the subvector then give up
+ if (Idx != -1) {
+ Idx -= 4 * (j / 4);
+ if (Idx < 0 || Idx >= 4)
+ return SDValue();
+ }
+
+ // If the mask has an undef, replace it with the current index.
+ // Note that it might still be undef if the current index is also undef
+ if (SHFIndices[i] == -1)
+ SHFIndices[i] = Idx;
+
+ // Check that non-undef values are the same as in the mask. If they
+ // aren't then give up
+ if (!(Idx == -1 || Idx == SHFIndices[i]))
+ return SDValue();
+ }
+ }
+
+ // Calculate the immediate. Replace any remaining undefs with zero
+ APInt Imm(32, 0);
+ for (int i = 3; i >= 0; --i) {
+ int Idx = SHFIndices[i];
+
+ if (Idx == -1)
+ Idx = 0;
+
+ Imm <<= 2;
+ Imm |= Idx & 0x3;
+ }
+
+ return DAG.getNode(MipsISD::SHF, SDLoc(Op), ResTy,
+ DAG.getConstant(Imm, MVT::i32), Op->getOperand(0));
+}
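
The immediate-packing loop above can be modelled directly in scalar code; it reproduces the <3, 2, 1, 0> -> 27 example from the comment (a sketch for illustration, not patch code):

#include <cstdio>

static unsigned shfImmediate(const int Idx[4]) {
  unsigned Imm = 0;
  for (int i = 3; i >= 0; --i) {
    int I = Idx[i] < 0 ? 0 : Idx[i]; // any remaining undef becomes 0
    Imm = (Imm << 2) | (I & 0x3);    // two bits per element position
  }
  return Imm;
}

int main() {
  const int Mask[4] = {3, 2, 1, 0};
  std::printf("%u\n", shfImmediate(Mask)); // 27, as in the example above
  return 0;
}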
+
+// Lower VECTOR_SHUFFLE into ILVEV (if possible).
+//
+// ILVEV interleaves the even elements from each vector.
+//
+// It is possible to lower into ILVEV when the mask takes the form:
+// <0, n, 2, n+2, 4, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undefs appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ assert ((Indices.size() % 2) == 0);
+ int WsIdx = 0;
+ int WtIdx = ResTy.getVectorNumElements();
+
+ for (unsigned i = 0; i < Indices.size(); i += 2) {
+ if (Indices[i] != -1 && Indices[i] != WsIdx)
+ return SDValue();
+ if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+ return SDValue();
+ WsIdx += 2;
+ WtIdx += 2;
+ }
+
+ return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Op->getOperand(0),
+ Op->getOperand(1));
+}
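
The ILVEV match above walks the index pairs expecting <0, n, 2, n+2, ...>, with -1 (undef) matching anything. A standalone sketch of that check (matchesILVEV is a made-up name; not patch code):

#include <cstddef>
#include <cstdio>
#include <vector>

static bool matchesILVEV(const std::vector<int> &Indices, int NumElts) {
  int Ws = 0, Wt = NumElts; // even lanes of the first and second operand
  for (std::size_t i = 0; i + 1 < Indices.size(); i += 2, Ws += 2, Wt += 2) {
    if (Indices[i] != -1 && Indices[i] != Ws)
      return false;
    if (Indices[i + 1] != -1 && Indices[i + 1] != Wt)
      return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", matchesILVEV({0, 4, 2, 6}, 4)); // 1 for a v4i32 shuffle
  return 0;
}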
+
+// Lower VECTOR_SHUFFLE into ILVOD (if possible).
+//
+// ILVOD interleaves the odd elements from each vector.
+//
+// It is possible to lower into ILVOD when the mask takes the form:
+// <1, n+1, 3, n+3, 5, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undefs appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ assert ((Indices.size() % 2) == 0);
+ int WsIdx = 1;
+ int WtIdx = ResTy.getVectorNumElements() + 1;
+
+ for (unsigned i = 0; i < Indices.size(); i += 2) {
+ if (Indices[i] != -1 && Indices[i] != WsIdx)
+ return SDValue();
+ if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+ return SDValue();
+ WsIdx += 2;
+ WtIdx += 2;
+ }
+
+ return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Op->getOperand(0),
+ Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
+//
+// ILVL interleaves consecutive elements from the left half of each vector.
+//
+// It is possible to lower into ILVL when the mask takes the form:
+// <0, n, 1, n+1, 2, n+2, ...>
+// where n is the number of elements in the vector.
+//
+// When undefs appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ assert ((Indices.size() % 2) == 0);
+ int WsIdx = 0;
+ int WtIdx = ResTy.getVectorNumElements();
+
+ for (unsigned i = 0; i < Indices.size(); i += 2) {
+ if (Indices[i] != -1 && Indices[i] != WsIdx)
+ return SDValue();
+ if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+ return SDValue();
+ WsIdx ++;
+ WtIdx ++;
}
+
+ return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Op->getOperand(0),
+ Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
+//
+// ILVR interleaves consecutive elements from the right half of each vector.
+//
+// It is possible to lower into ILVR when the mask takes the form:
+// <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+// where n is the number of elements in the vector and x is half n.
+//
+// When undefs appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ assert ((Indices.size() % 2) == 0);
+ unsigned NumElts = ResTy.getVectorNumElements();
+ int WsIdx = NumElts / 2;
+ int WtIdx = NumElts + NumElts / 2;
+
+ for (unsigned i = 0; i < Indices.size(); i += 2) {
+ if (Indices[i] != -1 && Indices[i] != WsIdx)
+ return SDValue();
+ if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+ return SDValue();
+ WsIdx ++;
+ WtIdx ++;
+ }
+
+ return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Op->getOperand(0),
+ Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKEV (if possible).
+//
+// PCKEV copies the even elements of each vector into the result vector.
+//
+// It is possible to lower into PCKEV when the mask takes the form:
+// <0, 2, 4, ..., n, n+2, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undefs appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ assert ((Indices.size() % 2) == 0);
+ int Idx = 0;
+
+ for (unsigned i = 0; i < Indices.size(); ++i) {
+ if (Indices[i] != -1 && Indices[i] != Idx)
+ return SDValue();
+ Idx += 2;
+ }
+
+ return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Op->getOperand(0),
+ Op->getOperand(1));
+}
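
The PCKEV match above simply expects the i-th index to be 2*i, undefs excepted; PCKOD below is the same with an offset of 1. A sketch of the even-lane check, for illustration only (matchesPCKEV is a made-up name):

#include <cstddef>
#include <cstdio>
#include <vector>

static bool matchesPCKEV(const std::vector<int> &Indices) {
  int Expected = 0; // even lanes: 0, 2, 4, ..., n, n+2, ...
  for (std::size_t i = 0; i < Indices.size(); ++i, Expected += 2)
    if (Indices[i] != -1 && Indices[i] != Expected)
      return false;
  return true;
}

int main() {
  std::printf("%d\n", matchesPCKEV({0, 2, 4, 6})); // 1: even lanes of both operands
  return 0;
}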
+
+// Lower VECTOR_SHUFFLE into PCKOD (if possible).
+//
+// PCKOD copies the odd elements of each vector into the result vector.
+//
+// It is possible to lower into PCKOD when the mask takes the form:
+// <1, 3, 5, ..., n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undefs appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ assert ((Indices.size() % 2) == 0);
+ int Idx = 1;
+
+ for (unsigned i = 0; i < Indices.size(); ++i) {
+ if (Indices[i] != -1 && Indices[i] != Idx)
+ return SDValue();
+ Idx += 2;
+ }
+
+ return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Op->getOperand(0),
+ Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into VSHF.
+//
+// This mostly consists of converting the shuffle indices in Indices into a
+// BUILD_VECTOR and adding it as an operand to the resulting VSHF. There is
+// also code to eliminate unused operands of the VECTOR_SHUFFLE. For example,
+// if the type is v8i16 and all the indices are less than 8 then the second
+// operand is unused and can be replaced with anything. We choose to replace it
+// with the used operand since this reduces the number of instructions overall.
+static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
+ SmallVector<int, 16> Indices,
+ SelectionDAG &DAG) {
+ SmallVector<SDValue, 16> Ops;
+ SDValue Op0;
+ SDValue Op1;
+ EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger();
+ EVT MaskEltTy = MaskVecTy.getVectorElementType();
+ bool Using1stVec = false;
+ bool Using2ndVec = false;
+ SDLoc DL(Op);
+ int ResTyNumElts = ResTy.getVectorNumElements();
+
+ for (int i = 0; i < ResTyNumElts; ++i) {
+ // Idx == -1 means UNDEF
+ int Idx = Indices[i];
+
+ if (0 <= Idx && Idx < ResTyNumElts)
+ Using1stVec = true;
+ if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2)
+ Using2ndVec = true;
+ }
+
+ for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end();
+ ++I)
+ Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy));
+
+ SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, &Ops[0],
+ Ops.size());
+
+ if (Using1stVec && Using2ndVec) {
+ Op0 = Op->getOperand(0);
+ Op1 = Op->getOperand(1);
+ } else if (Using1stVec)
+ Op0 = Op1 = Op->getOperand(0);
+ else if (Using2ndVec)
+ Op0 = Op1 = Op->getOperand(1);
+ else
+ llvm_unreachable("shuffle vector mask references neither vector operand?");
+
+ return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op0, Op1);
+}
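
The operand-use scan above decides whether the VSHF needs one or both source vectors, so an unused operand can be replaced by the used one as described. A small sketch of that analysis (operandUse is a made-up name; not patch code):

#include <cstdio>
#include <vector>

static void operandUse(const std::vector<int> &Indices, int NumElts,
                       bool &Uses1st, bool &Uses2nd) {
  Uses1st = Uses2nd = false;
  for (int Idx : Indices) { // -1 means undef and references neither operand
    if (0 <= Idx && Idx < NumElts)
      Uses1st = true;
    else if (NumElts <= Idx && Idx < 2 * NumElts)
      Uses2nd = true;
  }
}

int main() {
  bool A, B;
  operandUse({1, 0, 3, 2}, 4, A, B);
  std::printf("%d %d\n", A, B); // 1 0: only the first operand is needed
  return 0;
}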
+
+// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the
+// indices in the shuffle.
+SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+ SelectionDAG &DAG) const {
+ ShuffleVectorSDNode *Node = cast<ShuffleVectorSDNode>(Op);
+ EVT ResTy = Op->getValueType(0);
+
+ if (!ResTy.is128BitVector())
+ return SDValue();
+
+ int ResTyNumElts = ResTy.getVectorNumElements();
+ SmallVector<int, 16> Indices;
+
+ for (int i = 0; i < ResTyNumElts; ++i)
+ Indices.push_back(Node->getMaskElt(i));
+
+ SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG);
+ if (Result.getNode())
+ return Result;
+ return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
}
MachineBasicBlock * MipsSETargetLowering::
@@ -814,3 +2658,318 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
MI->eraseFromParent(); // The pseudo instruction is gone now.
return Sink;
}
+
+MachineBasicBlock * MipsSETargetLowering::
+emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned BranchOp) const{
+ // $bb:
+ // vany_nonzero $rd, $ws
+ // =>
+ // $bb:
+ // bnz.b $ws, $tbb
+ // b $fbb
+ // $fbb:
+ // li $rd1, 0
+ // b $sink
+ // $tbb:
+ // li $rd2, 1
+ // $sink:
+ // $rd = phi($rd1, $fbb, $rd2, $tbb)
+
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+ DebugLoc DL = MI->getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *Sink = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, FBB);
+ F->insert(It, TBB);
+ F->insert(It, Sink);
+
+ // Transfer the remainder of BB and its successor edges to Sink.
+ Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add successors.
+ BB->addSuccessor(FBB);
+ BB->addSuccessor(TBB);
+ FBB->addSuccessor(Sink);
+ TBB->addSuccessor(Sink);
+
+ // Insert the real bnz.b instruction to $BB.
+ BuildMI(BB, DL, TII->get(BranchOp))
+ .addReg(MI->getOperand(1).getReg())
+ .addMBB(TBB);
+
+ // Fill $FBB.
+ unsigned RD1 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
+ .addReg(Mips::ZERO).addImm(0);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+ // Fill $TBB.
+ unsigned RD2 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
+ .addReg(Mips::ZERO).addImm(1);
+
+ // Insert phi function to $Sink.
+ BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(RD1).addMBB(FBB).addReg(RD2).addMBB(TBB);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return Sink;
+}
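
The pseudo expansion above is a branch-based materialization of a boolean into a GPR, following the diamond shown in the comment. In plain C++ terms (a sketch only, not patch code):

static int materializeBool(bool BranchTaken) {
  int Rd;
  if (BranchTaken) // bnz.b/bz.b branches to $tbb
    Rd = 1;        // $tbb: li $rd2, 1
  else
    Rd = 0;        // $fbb: li $rd1, 0
  return Rd;       // $sink: $rd = phi($rd1, $rd2)
}

int main() { return materializeBool(true) - 1; }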
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// copy_u_w $rt, $ws, $n
+// mtc1 $rt, $fd
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Fd = MI->getOperand(0).getReg();
+ unsigned Ws = MI->getOperand(1).getReg();
+ unsigned Lane = MI->getOperand(2).getImm();
+
+ if (Lane == 0)
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
+ else {
+ unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws, $n
+// copy $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode is the only mode supported by MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
+ assert(Subtarget->isFP64bit());
+
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ unsigned Fd = MI->getOperand(0).getReg();
+ unsigned Ws = MI->getOperand(1).getReg();
+ unsigned Lane = MI->getOperand(2).getImm() * 2;
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (Lane == 0)
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+ else {
+ unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+ }
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the INSERT_FW pseudo instruction.
+//
+// insert_fw_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_lo, $fs
+// insve_w $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Wd = MI->getOperand(0).getReg();
+ unsigned Wd_in = MI->getOperand(1).getReg();
+ unsigned Lane = MI->getOperand(2).getImm();
+ unsigned Fs = MI->getOperand(3).getReg();
+ unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+ .addImm(0)
+ .addReg(Fs)
+ .addImm(Mips::sub_lo);
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
+ .addReg(Wd_in)
+ .addImm(Lane)
+ .addReg(Wt);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the INSERT_FD pseudo instruction.
+//
+// insert_fd_pseudo $wd, $fs, n
+// =>
+// subreg_to_reg $wt:sub_64, $fs
+// insve_d $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ assert(Subtarget->isFP64bit());
+
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Wd = MI->getOperand(0).getReg();
+ unsigned Wd_in = MI->getOperand(1).getReg();
+ unsigned Lane = MI->getOperand(2).getImm();
+ unsigned Fs = MI->getOperand(3).getReg();
+ unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+ .addImm(0)
+ .addReg(Fs)
+ .addImm(Mips::sub_64);
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
+ .addReg(Wd_in)
+ .addImm(Lane)
+ .addReg(Wt);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the FILL_FW pseudo instruction.
+//
+// fill_fw_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_lo, $wt1, $fs
+// splati.w $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Wd = MI->getOperand(0).getReg();
+ unsigned Fs = MI->getOperand(1).getReg();
+ unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+ unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+ .addReg(Wt1)
+ .addReg(Fs)
+ .addImm(Mips::sub_lo);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the FILL_FD pseudo instruction.
+//
+// fill_fd_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_64, $wt1, $fs
+// splati.d $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ assert(Subtarget->isFP64bit());
+
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Wd = MI->getOperand(0).getReg();
+ unsigned Fs = MI->getOperand(1).getReg();
+ unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+ unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+ BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+ .addReg(Wt1)
+ .addReg(Fs)
+ .addImm(Mips::sub_64);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the FEXP2_W_1 pseudo instructions.
+//
+// fexp2_w_1_pseudo $wd, $wt
+// =>
+// ldi.w $ws, 1
+// fexp2.w $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
+ unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+ unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Splat 1.0 into a vector
+ BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), Ws1).addImm(1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), Ws2).addReg(Ws1);
+
+ // Emit 1.0 * fexp2(Wt)
+ BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI->getOperand(0).getReg())
+ .addReg(Ws2)
+ .addReg(MI->getOperand(1).getReg());
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
+
+// Emit the FEXP2_D_1 pseudo instructions.
+//
+// fexp2_d_1_pseudo $wd, $wt
+// =>
+// ldi.d $ws, 1
+// fexp2.d $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
+ unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+ unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+ DebugLoc DL = MI->getDebugLoc();
+
+ // Splat 1.0 into a vector
+ BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), Ws1).addImm(1);
+ BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), Ws2).addReg(Ws1);
+
+ // Emit 1.0 * fexp2(Wt)
+ BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI->getOperand(0).getReg())
+ .addReg(Ws2)
+ .addReg(MI->getOperand(1).getReg());
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return BB;
+}
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index ec8a5c7..c5210d9 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -22,6 +22,14 @@ namespace llvm {
public:
explicit MipsSETargetLowering(MipsTargetMachine &TM);
+ /// \brief Enable MSA support for the given integer type and Register
+ /// class.
+ void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
+ /// \brief Enable MSA support for the given floating-point type and
+ /// Register class.
+ void addMSAFloatType(MVT::SimpleValueType Ty,
+ const TargetRegisterClass *RC);
+
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
@@ -38,8 +46,8 @@ namespace llvm {
virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
if (VT == MVT::Untyped)
- return Subtarget->hasDSP() ? &Mips::ACRegsDSPRegClass :
- &Mips::ACRegsRegClass;
+ return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass :
+ &Mips::ACC64RegClass;
return TargetLowering::getRepRegClassFor(VT);
}
@@ -56,14 +64,50 @@ namespace llvm {
bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+ SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+ /// depending on the indices in the shuffle.
+ SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned BranchOp) const;
+ /// \brief Emit the COPY_FW pseudo instruction
+ MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the COPY_FD pseudo instruction
+ MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the INSERT_FW pseudo instruction
+ MachineBasicBlock *emitINSERT_FW(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the INSERT_FD pseudo instruction
+ MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FILL_FW pseudo instruction
+ MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FILL_FD pseudo instruction
+ MachineBasicBlock *emitFILL_FD(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FEXP2_W_1 pseudo instructions.
+ MachineBasicBlock *emitFEXP2_W_1(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
+ /// \brief Emit the FEXP2_D_1 pseudo instructions.
+ MachineBasicBlock *emitFEXP2_D_1(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
};
}
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 9521043..02931a3 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -24,11 +24,6 @@
using namespace llvm;
-static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
- cl::desc("Expand double precision loads and "
- "stores to their single precision "
- "counterparts."));
-
MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
: MipsInstrInfo(tm,
tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
@@ -49,10 +44,8 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
{
unsigned Opc = MI->getOpcode();
- if ((Opc == Mips::LW) || (Opc == Mips::LW_P8) || (Opc == Mips::LD) ||
- (Opc == Mips::LD_P8) || (Opc == Mips::LWC1) || (Opc == Mips::LWC1_P8) ||
- (Opc == Mips::LDC1) || (Opc == Mips::LDC164) ||
- (Opc == Mips::LDC164_P8)) {
+ if ((Opc == Mips::LW) || (Opc == Mips::LD) ||
+ (Opc == Mips::LWC1) || (Opc == Mips::LDC1) || (Opc == Mips::LDC164)) {
if ((MI->getOperand(1).isFI()) && // is a stack slot
(MI->getOperand(2).isImm()) && // the imm is zero
(isZeroImm(MI->getOperand(2)))) {
@@ -74,10 +67,8 @@ isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
{
unsigned Opc = MI->getOpcode();
- if ((Opc == Mips::SW) || (Opc == Mips::SW_P8) || (Opc == Mips::SD) ||
- (Opc == Mips::SD_P8) || (Opc == Mips::SWC1) || (Opc == Mips::SWC1_P8) ||
- (Opc == Mips::SDC1) || (Opc == Mips::SDC164) ||
- (Opc == Mips::SDC164_P8)) {
+ if ((Opc == Mips::SW) || (Opc == Mips::SD) ||
+ (Opc == Mips::SWC1) || (Opc == Mips::SDC1) || (Opc == Mips::SDC164)) {
if ((MI->getOperand(1).isFI()) && // is a stack slot
(MI->getOperand(2).isImm()) && // the imm is zero
(isZeroImm(MI->getOperand(2)))) {
@@ -101,32 +92,34 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = Mips::CFC1;
else if (Mips::FGR32RegClass.contains(SrcReg))
Opc = Mips::MFC1;
- else if (Mips::HIRegsRegClass.contains(SrcReg))
+ else if (Mips::HI32RegClass.contains(SrcReg))
Opc = Mips::MFHI, SrcReg = 0;
- else if (Mips::LORegsRegClass.contains(SrcReg))
+ else if (Mips::LO32RegClass.contains(SrcReg))
Opc = Mips::MFLO, SrcReg = 0;
- else if (Mips::HIRegsDSPRegClass.contains(SrcReg))
+ else if (Mips::HI32DSPRegClass.contains(SrcReg))
Opc = Mips::MFHI_DSP;
- else if (Mips::LORegsDSPRegClass.contains(SrcReg))
+ else if (Mips::LO32DSPRegClass.contains(SrcReg))
Opc = Mips::MFLO_DSP;
else if (Mips::DSPCCRegClass.contains(SrcReg)) {
BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
.addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
return;
}
+ else if (Mips::MSACtrlRegClass.contains(SrcReg))
+ Opc = Mips::CFCMSA;
}
else if (Mips::GPR32RegClass.contains(SrcReg)) { // Copy from CPU Reg.
if (Mips::CCRRegClass.contains(DestReg))
Opc = Mips::CTC1;
else if (Mips::FGR32RegClass.contains(DestReg))
Opc = Mips::MTC1;
- else if (Mips::HIRegsRegClass.contains(DestReg))
+ else if (Mips::HI32RegClass.contains(DestReg))
Opc = Mips::MTHI, DestReg = 0;
- else if (Mips::LORegsRegClass.contains(DestReg))
+ else if (Mips::LO32RegClass.contains(DestReg))
Opc = Mips::MTLO, DestReg = 0;
- else if (Mips::HIRegsDSPRegClass.contains(DestReg))
+ else if (Mips::HI32DSPRegClass.contains(DestReg))
Opc = Mips::MTHI_DSP;
- else if (Mips::LORegsDSPRegClass.contains(DestReg))
+ else if (Mips::LO32DSPRegClass.contains(DestReg))
Opc = Mips::MTLO_DSP;
else if (Mips::DSPCCRegClass.contains(DestReg)) {
BuildMI(MBB, I, DL, get(Mips::WRDSP))
@@ -134,6 +127,8 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(DestReg, RegState::ImplicitDefine);
return;
}
+ else if (Mips::MSACtrlRegClass.contains(DestReg))
+ Opc = Mips::CTCMSA;
}
else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
Opc = Mips::FMOV_S;
@@ -144,21 +139,25 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
if (Mips::GPR64RegClass.contains(SrcReg))
Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
- else if (Mips::HIRegs64RegClass.contains(SrcReg))
+ else if (Mips::HI64RegClass.contains(SrcReg))
Opc = Mips::MFHI64, SrcReg = 0;
- else if (Mips::LORegs64RegClass.contains(SrcReg))
+ else if (Mips::LO64RegClass.contains(SrcReg))
Opc = Mips::MFLO64, SrcReg = 0;
else if (Mips::FGR64RegClass.contains(SrcReg))
Opc = Mips::DMFC1;
}
else if (Mips::GPR64RegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
- if (Mips::HIRegs64RegClass.contains(DestReg))
+ if (Mips::HI64RegClass.contains(DestReg))
Opc = Mips::MTHI64, DestReg = 0;
- else if (Mips::LORegs64RegClass.contains(DestReg))
+ else if (Mips::LO64RegClass.contains(DestReg))
Opc = Mips::MTLO64, DestReg = 0;
else if (Mips::FGR64RegClass.contains(DestReg))
Opc = Mips::DMTC1;
}
+ else if (Mips::MSA128BRegClass.contains(DestReg)) { // Copy to MSA reg
+ if (Mips::MSA128BRegClass.contains(SrcReg))
+ Opc = Mips::MOVE_V;
+ }
assert(Opc && "Cannot copy registers");
@@ -186,23 +185,31 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned Opc = 0;
if (Mips::GPR32RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
+ Opc = Mips::SW;
else if (Mips::GPR64RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
- else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::STORE_AC64_P8 : Mips::STORE_AC64;
- else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::STORE_AC_DSP_P8 : Mips::STORE_AC_DSP;
- else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::STORE_AC128_P8 : Mips::STORE_AC128;
+ Opc = Mips::SD;
+ else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_ACC64;
+ else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_ACC64DSP;
+ else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+ Opc = Mips::STORE_ACC128;
else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::STORE_CCOND_DSP_P8 : Mips::STORE_CCOND_DSP;
+ Opc = Mips::STORE_CCOND_DSP;
else if (Mips::FGR32RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+ Opc = Mips::SWC1;
else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
Opc = Mips::SDC1;
else if (Mips::FGR64RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+ Opc = Mips::SDC164;
+ else if (RC->hasType(MVT::v16i8))
+ Opc = Mips::ST_B;
+ else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+ Opc = Mips::ST_H;
+ else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+ Opc = Mips::ST_W;
+ else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+ Opc = Mips::ST_D;
assert(Opc && "Register class not handled!");
BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
@@ -219,23 +226,31 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned Opc = 0;
if (Mips::GPR32RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
+ Opc = Mips::LW;
else if (Mips::GPR64RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
- else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LOAD_AC64_P8 : Mips::LOAD_AC64;
- else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LOAD_AC_DSP_P8 : Mips::LOAD_AC_DSP;
- else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LOAD_AC128_P8 : Mips::LOAD_AC128;
+ Opc = Mips::LD;
+ else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_ACC64;
+ else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_ACC64DSP;
+ else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+ Opc = Mips::LOAD_ACC128;
else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LOAD_CCOND_DSP_P8 : Mips::LOAD_CCOND_DSP;
+ Opc = Mips::LOAD_CCOND_DSP;
else if (Mips::FGR32RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+ Opc = Mips::LWC1;
else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
Opc = Mips::LDC1;
else if (Mips::FGR64RegClass.hasSubClassEq(RC))
- Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+ Opc = Mips::LDC164;
+ else if (RC->hasType(MVT::v16i8))
+ Opc = Mips::LD_B;
+ else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+ Opc = Mips::LD_H;
+ else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+ Opc = Mips::LD_W;
+ else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+ Opc = Mips::LD_D;
assert(Opc && "Register class not handled!");
BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
@@ -251,6 +266,27 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case Mips::RetRA:
expandRetRA(MBB, MI, Mips::RET);
break;
+ case Mips::PseudoMFHI:
+ expandPseudoMFHiLo(MBB, MI, Mips::MFHI);
+ break;
+ case Mips::PseudoMFLO:
+ expandPseudoMFHiLo(MBB, MI, Mips::MFLO);
+ break;
+ case Mips::PseudoMFHI64:
+ expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
+ break;
+ case Mips::PseudoMFLO64:
+ expandPseudoMFHiLo(MBB, MI, Mips::MFLO64);
+ break;
+ case Mips::PseudoMTLOHI:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO, Mips::MTHI, false);
+ break;
+ case Mips::PseudoMTLOHI64:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO64, Mips::MTHI64, false);
+ break;
+ case Mips::PseudoMTLOHI_DSP:
+ expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
+ break;
case Mips::PseudoCVT_S_W:
expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
break;
@@ -267,16 +303,16 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
break;
case Mips::BuildPairF64:
- expandBuildPairF64(MBB, MI);
+ expandBuildPairF64(MBB, MI, false);
break;
- case Mips::ExtractElementF64:
- expandExtractElementF64(MBB, MI);
+ case Mips::BuildPairF64_64:
+ expandBuildPairF64(MBB, MI, true);
break;
- case Mips::PseudoLDC1:
- expandDPLoadStore(MBB, MI, Mips::LDC1, Mips::LWC1);
+ case Mips::ExtractElementF64:
+ expandExtractElementF64(MBB, MI, false);
break;
- case Mips::PseudoSDC1:
- expandDPLoadStore(MBB, MI, Mips::SDC1, Mips::SWC1);
+ case Mips::ExtractElementF64_64:
+ expandExtractElementF64(MBB, MI, true);
break;
case Mips::MIPSeh_return32:
case Mips::MIPSeh_return64:
@@ -399,6 +435,41 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc,
return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
}
+void MipsSEInstrInfo::expandPseudoMFHiLo(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned NewOpc) const {
+ BuildMI(MBB, I, I->getDebugLoc(), get(NewOpc), I->getOperand(0).getReg());
+}
+
+void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned LoOpc,
+ unsigned HiOpc,
+ bool HasExplicitDef) const {
+ // Expand
+ // lo_hi pseudomtlohi $gpr0, $gpr1
+ // to these two instructions:
+ // mtlo $gpr0
+ // mthi $gpr1
+
+ DebugLoc DL = I->getDebugLoc();
+ const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2);
+ MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc));
+ MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc));
+ LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill()));
+ HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill()));
+
+ // Add lo/hi registers if the mtlo/hi instructions created have explicit
+ // def registers.
+ if (HasExplicitDef) {
+ unsigned DstReg = I->getOperand(0).getReg();
+ unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+ unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
+ LoInst.addReg(DstLo, RegState::Define);
+ HiInst.addReg(DstHi, RegState::Define);
+ }
+}
+
void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned CvtOpc, unsigned MovOpc,
@@ -408,100 +479,63 @@ void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
unsigned DstReg = Dst.getReg(), SrcReg = Src.getReg(), TmpReg = DstReg;
unsigned KillSrc = getKillRegState(Src.isKill());
DebugLoc DL = I->getDebugLoc();
- unsigned SubIdx = (IsI64 ? Mips::sub_32 : Mips::sub_fpeven);
bool DstIsLarger, SrcIsLarger;
tie(DstIsLarger, SrcIsLarger) = compareOpndSize(CvtOpc, *MBB.getParent());
if (DstIsLarger)
- TmpReg = getRegisterInfo().getSubReg(DstReg, SubIdx);
+ TmpReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
if (SrcIsLarger)
- DstReg = getRegisterInfo().getSubReg(DstReg, SubIdx);
+ DstReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
BuildMI(MBB, I, DL, MovDesc, TmpReg).addReg(SrcReg, KillSrc);
BuildMI(MBB, I, DL, CvtDesc, DstReg).addReg(TmpReg, RegState::Kill);
}
void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
+ MachineBasicBlock::iterator I,
+ bool FP64) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned SrcReg = I->getOperand(1).getReg();
unsigned N = I->getOperand(2).getImm();
- const MCInstrDesc& Mfc1Tdd = get(Mips::MFC1);
DebugLoc dl = I->getDebugLoc();
assert(N < 2 && "Invalid immediate");
- unsigned SubIdx = N ? Mips::sub_fpodd : Mips::sub_fpeven;
+ unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
- BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg);
+ if (SubIdx == Mips::sub_hi && FP64)
+ BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg);
+ else
+ BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
}
void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
+ MachineBasicBlock::iterator I,
+ bool FP64) const {
unsigned DstReg = I->getOperand(0).getReg();
unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
DebugLoc dl = I->getDebugLoc();
const TargetRegisterInfo &TRI = getRegisterInfo();
- // mtc1 Lo, $fp
- // mtc1 Hi, $fp + 1
- BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpeven))
- .addReg(LoReg);
- BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpodd))
- .addReg(HiReg);
-}
-
-/// Add 4 to the displacement of operand MO.
-static void fixDisp(MachineOperand &MO) {
- switch (MO.getType()) {
- default:
- llvm_unreachable("Unhandled operand type.");
- case MachineOperand::MO_Immediate:
- MO.setImm(MO.getImm() + 4);
- break;
- case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_BlockAddress:
- case MachineOperand::MO_TargetIndex:
- case MachineOperand::MO_ExternalSymbol:
- MO.setOffset(MO.getOffset() + 4);
- break;
- }
-}
-
-void MipsSEInstrInfo::expandDPLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- unsigned OpcD, unsigned OpcS) const {
- // If NoDPLoadStore is false, just change the opcode.
- if (!NoDPLoadStore) {
- genInstrWithNewOpc(OpcD, I);
- return;
- }
+ // For FP32 mode:
+ // mtc1 Lo, $fp
+ // mtc1 Hi, $fp + 1
+ // For FP64 mode:
+ // mtc1 Lo, $fp
+ // mthc1 Hi, $fp
- // Expand a double precision FP load or store to two single precision
- // instructions.
+ BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
+ .addReg(LoReg);
- const TargetRegisterInfo &TRI = getRegisterInfo();
- const MachineOperand &ValReg = I->getOperand(0);
- unsigned LoReg = TRI.getSubReg(ValReg.getReg(), Mips::sub_fpeven);
- unsigned HiReg = TRI.getSubReg(ValReg.getReg(), Mips::sub_fpodd);
-
- if (!TM.getSubtarget<MipsSubtarget>().isLittle())
- std::swap(LoReg, HiReg);
-
- // Create an instruction which loads from or stores to the lower memory
- // address.
- MachineInstrBuilder MIB = genInstrWithNewOpc(OpcS, I);
- MIB->getOperand(0).setReg(LoReg);
-
- // Create an instruction which loads from or stores to the higher memory
- // address.
- MIB = genInstrWithNewOpc(OpcS, I);
- MIB->getOperand(0).setReg(HiReg);
- fixDisp(MIB->getOperand(2));
+ if (FP64)
+ BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi))
+ .addReg(HiReg);
+ else
+ BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
+ .addReg(HiReg);
}
void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index d962ef0..6d2dd90 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -87,6 +87,13 @@ private:
std::pair<bool, bool> compareOpndSize(unsigned Opc,
const MachineFunction &MF) const;
+ void expandPseudoMFHiLo(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned NewOpc) const;
+
+ void expandPseudoMTLoHi(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ unsigned LoOpc, unsigned HiOpc,
+ bool HasExplicitDef) const;
+
/// Expand pseudo Int-to-FP conversion instructions.
///
/// For example, the following pseudo instruction
@@ -101,12 +108,9 @@ private:
unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
void expandExtractElementF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ MachineBasicBlock::iterator I, bool FP64) const;
void expandBuildPairF64(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
- void expandDPLoadStore(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, unsigned OpcD,
- unsigned OpcS) const;
+ MachineBasicBlock::iterator I, bool FP64) const;
void expandEhReturn(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
};
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 286a2e2..2d44084 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -62,6 +62,24 @@ MipsSERegisterInfo::intRegClass(unsigned Size) const {
return &Mips::GPR64RegClass;
}
+/// Determine whether a given opcode is an MSA load/store (supporting 10-bit
+/// offsets) or a non-MSA load/store (supporting 16-bit offsets).
+static inline bool isMSALoadOrStore(const unsigned Opcode) {
+ switch (Opcode) {
+ case Mips::LD_B:
+ case Mips::LD_H:
+ case Mips::LD_W:
+ case Mips::LD_D:
+ case Mips::ST_B:
+ case Mips::ST_H:
+ case Mips::ST_W:
+ case Mips::ST_D:
+ return true;
+ default:
+ return false;
+ }
+}
+
void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
unsigned OpNo, int FrameIndex,
uint64_t StackSize,
@@ -111,23 +129,49 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
- // If MI is not a debug value, make sure Offset fits in the 16-bit immediate
- // field.
- if (!MI.isDebugValue() && !isInt<16>(Offset)) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = II->getDebugLoc();
- unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
- unsigned NewImm;
- const MipsSEInstrInfo &TII =
- *static_cast<const MipsSEInstrInfo*>(
- MBB.getParent()->getTarget().getInstrInfo());
- unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
- BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
- .addReg(Reg, RegState::Kill);
-
- FrameReg = Reg;
- Offset = SignExtend64<16>(NewImm);
- IsKill = true;
+ if (!MI.isDebugValue()) {
+ // Make sure Offset fits within the field available.
+ // For MSA instructions, this is a 10-bit signed immediate, otherwise it is
+ // a 16-bit signed immediate.
+ unsigned OffsetBitSize = isMSALoadOrStore(MI.getOpcode()) ? 10 : 16;
+
+ if (OffsetBitSize == 10 && !isInt<10>(Offset) && isInt<16>(Offset)) {
+ // If we have an offset that needs to fit into a signed 10-bit immediate
+ // and doesn't, but does fit into 16-bits then use an ADDiu
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = II->getDebugLoc();
+ unsigned ADDiu = Subtarget.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+ const TargetRegisterClass *RC =
+ Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ unsigned Reg = RegInfo.createVirtualRegister(RC);
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo *>(
+ MBB.getParent()->getTarget().getInstrInfo());
+ BuildMI(MBB, II, DL, TII.get(ADDiu), Reg).addReg(FrameReg).addImm(Offset);
+
+ FrameReg = Reg;
+ Offset = 0;
+ IsKill = true;
+ } else if (!isInt<16>(Offset)) {
+ // Otherwise split the offset into 16-bit pieces and add it in multiple
+ // instructions.
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = II->getDebugLoc();
+ unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+ unsigned NewImm = 0;
+ const MipsSEInstrInfo &TII =
+ *static_cast<const MipsSEInstrInfo *>(
+ MBB.getParent()->getTarget().getInstrInfo());
+ unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
+ OffsetBitSize == 16 ? &NewImm : NULL);
+ BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
+ .addReg(Reg, RegState::Kill);
+
+ FrameReg = Reg;
+ Offset = SignExtend64<16>(NewImm);
+ IsKill = true;
+ }
}
MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
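
The eliminateFI() change above reduces to a three-way decision on the frame-index offset: use it directly when it fits the MSA 10-bit field, fold it into a single ADDiu when it at least fits 16 bits, and otherwise materialize it with loadImmediate and add it with ADDu. A minimal standalone sketch of just that classification, assuming only that llvm::isInt<N>() tests for an N-bit signed immediate (isIntN below is a stand-in, not the LLVM helper):

#include <cstdint>
#include <cstdio>

// Stand-in for llvm::isInt<N>(): true iff X fits in an N-bit signed immediate.
static bool isIntN(unsigned Bits, int64_t X) {
  return X >= -(INT64_C(1) << (Bits - 1)) && X < (INT64_C(1) << (Bits - 1));
}

// 0 = offset fits the MSA 10-bit field directly,
// 1 = fold the offset into one ADDiu first (it fits 16 bits),
// 2 = materialize the offset with loadImmediate + ADDu.
static int msaOffsetStrategy(int64_t Offset) {
  if (isIntN(10, Offset))
    return 0;
  if (isIntN(16, Offset))
    return 1;
  return 2;
}

int main() {
  std::printf("%d %d %d\n",
              msaOffsetStrategy(500),     // 0: fits the 10-bit field
              msaOffsetStrategy(1000),    // 1: needs one ADDiu
              msaOffsetStrategy(70000));  // 2: needs loadImmediate + ADDu
}

Non-MSA loads and stores skip the first case entirely, since their 16-bit immediate field already covers everything the 10-bit path would catch.
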
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 541e2ca..0a81072 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -53,6 +53,12 @@ Mips16HardFloat("mips16-hard-float", cl::NotHidden,
cl::desc("MIPS: mips16 hard float enable."),
cl::init(false));
+static cl::opt<bool>
+Mips16ConstantIslands(
+ "mips16-constant-islands", cl::Hidden,
+ cl::desc("MIPS: mips16 constant islands enable. experimental feature"),
+ cl::init(false));
+
void MipsSubtarget::anchor() { }
MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
@@ -65,7 +71,7 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
HasBitCount(false), HasFPIdx(false),
InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
- AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+ AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
RM(_RM), OverrideMode(NoOverride), TM(_TM)
{
std::string CPUName = CPU;
@@ -89,12 +95,20 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
(hasMips64() && (isABI_N32() || isABI_N64()))) &&
"Invalid Arch & ABI pair.");
+ if (hasMSA() && !isFP64bit())
+ report_fatal_error("MSA requires a 64-bit FPU register file (FR=1 mode). "
+ "See -mattr=+fp64.",
+ false);
+
// Is the target system Linux ?
if (TT.find("linux") == std::string::npos)
IsLinux = false;
// Set UseSmallSection.
UseSmallSection = !IsLinux && (RM == Reloc::Static);
+ // set some subtarget specific features
+ if (inMips16Mode())
+ HasBitCount=false;
}
bool
@@ -152,3 +166,11 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) {
}
}
+bool MipsSubtarget::mipsSEUsesSoftFloat() const {
+ return TM->Options.UseSoftFloat && !InMips16HardFloat;
+}
+
+bool MipsSubtarget::useConstantIslands() {
+ DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+ return Mips16ConstantIslands;
+}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index bfb13bb..6b2ab12 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -113,6 +113,9 @@ protected:
// compiled as Mips32
bool Os16;
+ // HasMSA -- supports MSA ASE.
+ bool HasMSA;
+
InstrItineraryData InstrItins;
// The instance to the register info section object
@@ -157,6 +160,7 @@ public:
bool isLittle() const { return IsLittle; }
bool isFP64bit() const { return IsFP64bit; }
+ bool isNotFP64bit() const { return !IsFP64bit; }
bool isGP64bit() const { return IsGP64bit; }
bool isGP32bit() const { return !IsGP64bit; }
bool isSingleFloat() const { return IsSingleFloat; }
@@ -182,17 +186,25 @@ public:
bool inMicroMipsMode() const { return InMicroMipsMode; }
bool hasDSP() const { return HasDSP; }
bool hasDSPR2() const { return HasDSPR2; }
+ bool hasMSA() const { return HasMSA; }
bool isLinux() const { return IsLinux; }
bool useSmallSection() const { return UseSmallSection; }
bool hasStandardEncoding() const { return !inMips16Mode(); }
+ bool mipsSEUsesSoftFloat() const;
+
+ bool enableLongBranchPass() const {
+ return hasStandardEncoding() || allowMixed16_32();
+ }
+
/// Features related to the presence of specific instructions.
bool hasSEInReg() const { return HasSEInReg; }
bool hasCondMov() const { return HasCondMov; }
bool hasSwap() const { return HasSwap; }
bool hasBitCount() const { return HasBitCount; }
bool hasFPIdx() const { return HasFPIdx; }
+ bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
bool allowMixed16_32() const { return inMips16ModeDefault() |
@@ -200,6 +212,13 @@ public:
bool os16() const { return Os16;};
+// for now constant islands are on for the whole compilation unit but we only
+// really use them if in addition we are in mips16 mode
+//
+static bool useConstantIslands();
+
+ unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
+
// Grab MipsRegInfo object
const MipsReginfo &getMReginfo() const { return MRI; }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index ced6a09..5046c1b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -32,6 +32,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -134,7 +135,13 @@ namespace {
class MipsPassConfig : public TargetPassConfig {
public:
MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ // The current implementation of long branch pass requires a scratch
+ // register ($at) to be available before branch instructions. Tail merging
+ // can break this requirement, so disable it when long branch pass is
+ // enabled.
+ EnableTailMerge = !getMipsSubtarget().enableLongBranchPass();
+ }
MipsTargetMachine &getMipsTargetMachine() const {
return getTM<MipsTargetMachine>();
@@ -160,7 +167,7 @@ void MipsPassConfig::addIRPasses() {
addPass(createMipsOs16(getMipsTargetMachine()));
if (getMipsSubtarget().inMips16HardFloat())
addPass(createMips16HardFloat(getMipsTargetMachine()));
- addPass(createMipsOptimizeMathLibCalls(getMipsTargetMachine()));
+ addPass(createPartiallyInlineLibCallsPass());
}
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
@@ -196,8 +203,7 @@ bool MipsPassConfig::addPreEmitPass() {
const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
addPass(createMipsDelaySlotFillerPass(TM));
- if (Subtarget.hasStandardEncoding() ||
- Subtarget.allowMixed16_32())
+ if (Subtarget.enableLongBranchPass())
addPass(createMipsLongBranchPass(TM));
if (Subtarget.inMips16Mode() ||
Subtarget.allowMixed16_32())
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
new file mode 100644
index 0000000..96966fd
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -0,0 +1,44 @@
+//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETSTREAMER_H
+#define MIPSTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class MipsTargetStreamer : public MCTargetStreamer {
+ virtual void anchor();
+
+public:
+ virtual void emitMipsHackELFFlags(unsigned Flags) = 0;
+ virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) = 0;
+};
+
+// This part is for ascii assembly output
+class MipsTargetAsmStreamer : public MipsTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ MipsTargetAsmStreamer(formatted_raw_ostream &OS);
+ virtual void emitMipsHackELFFlags(unsigned Flags);
+ virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+};
+
+// This part is for ELF object output
+class MipsTargetELFStreamer : public MipsTargetStreamer {
+public:
+ MCELFStreamer &getStreamer();
+ virtual void emitMipsHackELFFlags(unsigned Flags);
+ virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+};
+}
+
+#endif
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index c7b8aa4..d5be0e4 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
@@ -277,3 +278,12 @@ void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
printOperand(MI, OpNum + 1, O);
}
}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier) {
+ const MCOperand &Op = MI->getOperand(OpNum);
+ assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+ const MCExpr *Expr = Op.getExpr();
+ const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+ O << Sym.getName();
+}
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
index e0f44da..93029ae 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -44,7 +44,8 @@ public:
raw_ostream &O, const char *Modifier = 0);
void printMemOperand(const MCInst *MI, int OpNum,
raw_ostream &O, const char *Modifier = 0);
-
+ void printProtoIdent(const MCInst *MI, int OpNum,
+ raw_ostream &O, const char *Modifier = 0);
};
}
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index dfa1ff5..f2784b8 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -35,8 +35,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
PrivateGlobalPrefix = "$L__";
- AllowPeriodsInName = false;
-
HasSetDirective = false;
HasSingleParameterDotFile = false;
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index a2b9bec..7552fe7 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -20,6 +20,7 @@
#include "NVPTXRegisterInfo.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
#include "cl_common_defines.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ConstantFolding.h"
@@ -47,21 +48,17 @@
#include <sstream>
using namespace llvm;
-bool RegAllocNilUsed = true;
-
#define DEPOTNAME "__local_depot"
static cl::opt<bool>
-EmitLineNumbers("nvptx-emit-line-numbers",
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
cl::init(true));
-namespace llvm { bool InterleaveSrcInPtx = false; }
-
-static cl::opt<bool, true>
-InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore,
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
cl::desc("NVPTX Specific: Emit source line in ptx file"),
- cl::location(llvm::InterleaveSrcInPtx));
+ cl::init(false));
namespace {
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
@@ -129,7 +126,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
- return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+ return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
@@ -294,7 +291,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
return;
// Emit the line from the source file.
- if (llvm::InterleaveSrcInPtx)
+ if (InterleaveSrc)
this->emitSrcInText(fileName.str(), curLoc.getLine());
std::stringstream temp;
@@ -317,6 +314,14 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
OutMI.setOpcode(MI->getOpcode());
+ // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+ if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+ const MachineOperand &MO = MI->getOperand(0);
+ OutMI.addOperand(GetSymbolRef(MO,
+ OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName()))));
+ return;
+ }
+
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
@@ -344,7 +349,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
break;
case MachineOperand::MO_GlobalAddress:
- MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+ MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
break;
case MachineOperand::MO_FPImmediate: {
const ConstantFP *Cnt = MO.getFPImm();
@@ -550,6 +555,19 @@ void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
VRegMapping.clear();
}
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+ unsigned RegNo = MI->getOperand(0).getReg();
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ if (TRI->isVirtualRegister(RegNo)) {
+ OutStreamer.AddComment(Twine("implicit-def: ") +
+ getVirtualRegisterName(RegNo));
+ } else {
+ OutStreamer.AddComment(Twine("implicit-def: ") +
+ TM.getRegisterInfo()->getName(RegNo));
+ }
+ OutStreamer.AddBlankLine();
+}
+
void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
raw_ostream &O) const {
// If the NVVM IR has some of reqntid* specified, then output
@@ -601,23 +619,30 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
O << ".minnctapersm " << mincta << "\n";
}
-void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
- raw_ostream &O) {
- const TargetRegisterClass *RC = MRI->getRegClass(vr);
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
- unsigned mapped_vr = regmap[vr];
+ std::string Name;
+ raw_string_ostream NameStr(Name);
- if (!isVec) {
- O << getNVPTXRegClassStr(RC) << mapped_vr;
- return;
- }
- report_fatal_error("Bad register!");
+ VRegRCMap::const_iterator I = VRegMapping.find(RC);
+ assert(I != VRegMapping.end() && "Bad register class");
+ const DenseMap<unsigned, unsigned> &RegMap = I->second;
+
+ VRegMap::const_iterator VI = RegMap.find(Reg);
+ assert(VI != RegMap.end() && "Bad virtual register");
+ unsigned MappedVR = VI->second;
+
+ NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+
+ NameStr.flush();
+ return Name;
}
-void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec,
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
raw_ostream &O) {
- getVirtualRegisterName(vr, isVec, O);
+ O << getVirtualRegisterName(vr);
}
void NVPTXAsmPrinter::printVecModifiedImmediate(
@@ -660,7 +685,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
else
O << ".func ";
printReturnValStr(F, O);
- O << *Mang->getSymbol(F) << "\n";
+ O << *getSymbol(F) << "\n";
emitFunctionParamList(F, O);
O << ";\n";
}
@@ -870,7 +895,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
.Initialize(OutContext, TM);
- Mang = new Mangler(OutContext, &TM);
+ Mang = new Mangler(&TM);
// Emit header before any dwarf directives are emitted below.
emitHeader(M, OS1);
@@ -1190,7 +1215,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
else
O << getPTXFundamentalTypeStr(ETy, false);
O << " ";
- O << *Mang->getSymbol(GVar);
+ O << *getSymbol(GVar);
// Ptx allows variable initilization only for constant and global state
// spaces.
@@ -1226,15 +1251,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
bufferAggregateConstant(Initializer, &aggBuffer);
if (aggBuffer.numSymbols) {
if (nvptxSubtarget.is64Bit()) {
- O << " .u64 " << *Mang->getSymbol(GVar) << "[";
+ O << " .u64 " << *getSymbol(GVar) << "[";
O << ElementSize / 8;
} else {
- O << " .u32 " << *Mang->getSymbol(GVar) << "[";
+ O << " .u32 " << *getSymbol(GVar) << "[";
O << ElementSize / 4;
}
O << "]";
} else {
- O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+ O << " .b8 " << *getSymbol(GVar) << "[";
O << ElementSize;
O << "]";
}
@@ -1242,7 +1267,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
aggBuffer.print();
O << "}";
} else {
- O << " .b8 " << *Mang->getSymbol(GVar);
+ O << " .b8 " << *getSymbol(GVar);
if (ElementSize) {
O << "[";
O << ElementSize;
@@ -1250,7 +1275,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
}
}
} else {
- O << " .b8 " << *Mang->getSymbol(GVar);
+ O << " .b8 " << *getSymbol(GVar);
if (ElementSize) {
O << "[";
O << ElementSize;
@@ -1357,7 +1382,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
O << " .";
O << getPTXFundamentalTypeStr(ETy);
O << " ";
- O << *Mang->getSymbol(GVar);
+ O << *getSymbol(GVar);
return;
}
@@ -1372,7 +1397,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
case Type::ArrayTyID:
case Type::VectorTyID:
ElementSize = TD->getTypeStoreSize(ETy);
- O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+ O << " .b8 " << *getSymbol(GVar) << "[";
if (ElementSize) {
O << itostr(ElementSize);
}
@@ -1427,7 +1452,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
int paramIndex, raw_ostream &O) {
if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
(nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
- O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex;
+ O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
else {
std::string argName = I->getName();
const char *p = argName.c_str();
@@ -1486,13 +1511,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (llvm::isImage(*I)) {
std::string sname = I->getName();
if (llvm::isImageWriteOnly(*I))
- O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_"
+ O << "\t.param .surfref " << *getSymbol(F) << "_param_"
<< paramIndex;
else // Default image is read_only
- O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_"
+ O << "\t.param .texref " << *getSymbol(F) << "_param_"
<< paramIndex;
} else // Should be llvm::isSampler(*I)
- O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_"
+ O << "\t.param .samplerref " << *getSymbol(F) << "_param_"
<< paramIndex;
continue;
}
@@ -1739,13 +1764,13 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
return;
}
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
- O << *Mang->getSymbol(GVar);
+ O << *getSymbol(GVar);
return;
}
if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
const Value *v = Cexpr->stripPointerCasts();
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
- O << *Mang->getSymbol(GVar);
+ O << *getSymbol(GVar);
return;
} else {
O << *LowerConstant(CPV, *this);
@@ -1863,7 +1888,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::VectorTyID:
case Type::StructTyID: {
if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
- isa<ConstantStruct>(CPV)) {
+ isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
int ElementSize = TD->getTypeAllocSize(CPV->getType());
bufferAggregateConstant(CPV, aggBuffer);
if (Bytes > ElementSize)
@@ -1993,6 +2018,116 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
return false;
}
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0)
+ return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default:
+ // See if this is a generic print operand
+ return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+ case 'r':
+ break;
+ }
+ }
+
+ printOperand(MI, OpNo, O);
+
+ return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+ const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier
+
+ O << '[';
+ printMemOperand(MI, OpNo, O);
+ O << ']';
+
+ return false;
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ const MachineOperand &MO = MI->getOperand(opNum);
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MO.getReg() == NVPTX::VRDepot)
+ O << DEPOTNAME << getFunctionNumber();
+ else
+ O << NVPTXInstPrinter::getRegisterName(MO.getReg());
+ } else {
+ emitVirtualRegister(MO.getReg(), O);
+ }
+ return;
+
+ case MachineOperand::MO_Immediate:
+ if (!Modifier)
+ O << MO.getImm();
+ else if (strstr(Modifier, "vec") == Modifier)
+ printVecModifiedImmediate(MO, Modifier, O);
+ else
+ llvm_unreachable(
+ "Don't know how to handle modifier on immediate operand");
+ return;
+
+ case MachineOperand::MO_FPImmediate:
+ printFPConstant(MO.getFPImm(), O);
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ O << *getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_ExternalSymbol: {
+ const char *symbname = MO.getSymbolName();
+ if (strstr(symbname, ".PARAM") == symbname) {
+ unsigned index;
+ sscanf(symbname + 6, "%u[];", &index);
+ printParamName(index, O);
+ } else if (strstr(symbname, ".HLPPARAM") == symbname) {
+ unsigned index;
+ sscanf(symbname + 9, "%u[];", &index);
+ O << *CurrentFnSym << "_param_" << index << "_offset";
+ } else
+ O << symbname;
+ break;
+ }
+
+ case MachineOperand::MO_MachineBasicBlock:
+ O << *MO.getMBB()->getSymbol();
+ return;
+
+ default:
+ llvm_unreachable("Operand type not supported.");
+ }
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+ raw_ostream &O, const char *Modifier) {
+ printOperand(MI, opNum, O);
+
+ if (Modifier && !strcmp(Modifier, "add")) {
+ O << ", ";
+ printOperand(MI, opNum + 1, O);
+ } else {
+ if (MI->getOperand(opNum + 1).isImm() &&
+ MI->getOperand(opNum + 1).getImm() == 0)
+ return; // don't print ',0' or '+0'
+ O << "+";
+ printOperand(MI, opNum + 1, O);
+ }
+}
+
+
// Force static initialization.
extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 27bfa54..3abe5d1 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -155,7 +155,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
if (pos == nextSymbolPos) {
const Value *v = Symbols[nSym];
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
- MCSymbol *Name = AP.Mang->getSymbol(GVar);
+ MCSymbol *Name = AP.getSymbol(GVar);
O << *Name;
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
O << *nvptx::LowerConstant(Cexpr, AP);
@@ -188,6 +188,7 @@ private:
void EmitFunctionEntryLabel();
void EmitFunctionBodyStart();
void EmitFunctionBodyEnd();
+ void emitImplicitDef(const MachineInstr *MI) const;
void EmitInstruction(const MachineInstr *);
void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
@@ -213,7 +214,7 @@ private:
void emitGlobals(const Module &M);
void emitHeader(Module &M, raw_ostream &O);
void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
- void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
+ void emitVirtualRegister(unsigned int vr, raw_ostream &);
void emitFunctionExternParamList(const MachineFunction &MF);
void emitFunctionParamList(const Function *, raw_ostream &O);
void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
@@ -222,7 +223,14 @@ private:
bool isImageType(const Type *Ty);
void printReturnValStr(const Function *, raw_ostream &O);
void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
-
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &);
+ void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+ const char *Modifier = 0);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &);
protected:
bool doInitialization(Module &M);
bool doFinalization(Module &M);
@@ -287,7 +295,7 @@ public:
bool ignoreLoc(const MachineInstr &);
- virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+ std::string getVirtualRegisterName(unsigned) const;
DebugLoc prevDebugLoc;
void emitLineNumberAsDotLoc(const MachineInstr &);
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 9f92a5b..9fb0dd8 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -142,7 +142,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
GlobalVariable *GV = I->first;
GlobalVariable *NewGV = I->second;
++I;
- Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+ Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
// At this point, the remaining uses of GV should be found only in global
// variable initializers, as other uses have been already been removed
// while walking through the instructions in function definitions.
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ba85e35..4b8b306 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,24 +26,24 @@
using namespace llvm;
static cl::opt<int>
-FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
+FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
" 1: do it 2: do it aggressively"),
cl::init(2));
static cl::opt<int> UsePrecDivF32(
- "nvptx-prec-divf32", cl::ZeroOrMore,
+ "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
" IEEE Compliant F32 div.rnd if avaiable."),
cl::init(2));
static cl::opt<bool>
-UsePrecSqrtF32("nvptx-prec-sqrtf32",
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
cl::init(true));
static cl::opt<bool>
-FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore,
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
cl::init(false));
@@ -118,8 +118,10 @@ bool NVPTXDAGToDAGISel::useF32FTZ() const {
/// expanded, promoted and normal instructions.
SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
- if (N->isMachineOpcode())
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
return NULL; // Already selected.
+ }
SDNode *ResNode = NULL;
switch (N->getOpcode()) {
@@ -249,7 +251,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
SDValue Addr;
SDValue Offset, Base;
unsigned Opcode;
- MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy;
+ MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
if (SelectDirectAddr(N1, Addr)) {
switch (TargetVT) {
@@ -1347,8 +1349,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
SDValue Addr;
SDValue Offset, Base;
unsigned Opcode;
- MVT::SimpleValueType SourceVT =
- N1.getNode()->getValueType(0).getSimpleVT().SimpleTy;
+ MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
if (SelectDirectAddr(N2, Addr)) {
switch (SourceVT) {
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 828242d..6a8be75 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -310,6 +310,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "NVPTXISD::CallSeqBegin";
case NVPTXISD::CallSeqEnd:
return "NVPTXISD::CallSeqEnd";
+ case NVPTXISD::CallPrototype:
+ return "NVPTXISD::CallPrototype";
case NVPTXISD::LoadV2:
return "NVPTXISD::LoadV2";
case NVPTXISD::LoadV4:
@@ -471,22 +473,47 @@ NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
Type *Ty,
unsigned Idx) const {
const DataLayout *TD = getDataLayout();
- unsigned align = 0;
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+ unsigned Align = 0;
+ const Value *DirectCallee = CS->getCalledFunction();
+
+ if (!DirectCallee) {
+ // We don't have a direct function symbol, but that may be because of
+ // constant cast instructions in the call.
+ const Instruction *CalleeI = CS->getInstruction();
+ assert(CalleeI && "Call target is not a function or derived value?");
+
+ // With bitcast'd call targets, the instruction will be the call
+ if (isa<CallInst>(CalleeI)) {
+ // Check if we have call alignment metadata
+ if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+ return Align;
+
+ const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+ // Ignore any bitcast instructions
+ while(isa<ConstantExpr>(CalleeV)) {
+ const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+ if (!CE->isCast())
+ break;
+ // Look through the bitcast
+ CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+ }
- if (Func) { // direct call
- assert(CS->getCalledFunction() &&
- "direct call cannot find callee");
- if (!llvm::getAlign(*(CS->getCalledFunction()), Idx, align))
- align = TD->getABITypeAlignment(Ty);
- }
- else { // indirect call
- const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction());
- if (!llvm::getAlign(*CallI, Idx, align))
- align = TD->getABITypeAlignment(Ty);
+ // We have now looked past all of the bitcasts. Do we finally have a
+ // Function?
+ if (isa<Function>(CalleeV))
+ DirectCallee = CalleeV;
+ }
}
- return align;
+ // Check for function alignment information if we found that the
+ // ultimate target is a Function
+ if (DirectCallee)
+ if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+ return Align;
+
+ // Call is indirect or alignment information is not available, fall back to
+ // the ABI type alignment
+ return TD->getABITypeAlignment(Ty);
}
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -860,18 +887,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
// to be emitted, and the label has to used as the last arg of call
// instruction.
- // The prototype is embedded in a string and put as the operand for an
- // INLINEASM SDNode.
- SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string proto_string =
- getPrototype(retTy, Args, Outs, retAlignment, CS);
- const char *asmstr = nvTM->getManagedStrPool()
- ->getManagedString(proto_string.c_str())->c_str();
- SDValue InlineAsmOps[] = {
- Chain, DAG.getTargetExternalSymbol(asmstr, getPointerTy()),
- DAG.getMDNode(0), DAG.getTargetConstant(0, MVT::i32), InFlag
+ // The prototype is embedded in a string and put as the operand for a
+ // CallPrototype SDNode which will print out to the value of the string.
+ SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
+ const char *ProtoStr =
+ nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+ SDValue ProtoOps[] = {
+ Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
};
- Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5);
+ Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, &ProtoOps[0], 3);
InFlag = Chain.getValue(1);
}
// Op to just print "call"
@@ -1595,7 +1620,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
}
Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
}
- InsIdx += VecSize;
+ InsIdx += NumElts;
}
if (NumElts > 0)
@@ -2263,3 +2288,29 @@ void NVPTXTargetLowering::ReplaceNodeResults(
return;
}
}
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+ delete TextSection;
+ delete DataSection;
+ delete BSSSection;
+ delete ReadOnlySection;
+
+ delete StaticCtorSection;
+ delete StaticDtorSection;
+ delete LSDASection;
+ delete EHFrameSection;
+ delete DwarfAbbrevSection;
+ delete DwarfInfoSection;
+ delete DwarfLineSection;
+ delete DwarfFrameSection;
+ delete DwarfPubTypesSection;
+ delete DwarfDebugInlineSection;
+ delete DwarfStrSection;
+ delete DwarfLocSection;
+ delete DwarfARangesSection;
+ delete DwarfRangesSection;
+ delete DwarfMacroInfoSection;
+}
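
The getArgumentAlignment() rewrite above first consults call-site alignment metadata and then tries to recover a direct callee by peeling constant cast expressions off the called value. A minimal sketch of just the cast-peeling step, assuming LLVM 3.4-era IR headers (findDirectCallee is a hypothetical helper, not part of this patch):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Strip constant cast expressions from a call target; returns the underlying
// Function for a bitcast'd direct call, or null for a genuinely indirect one.
static const Function *findDirectCallee(const CallInst *CI) {
  const Value *V = CI->getCalledValue();
  while (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
    if (!CE->isCast())
      break;                 // some other constant expression; give up
    V = CE->getOperand(0);   // look through the cast
  }
  return dyn_cast<Function>(V);
}

When this yields a Function, llvm::getAlign() on it can supply the parameter alignment; otherwise the code falls back to the ABI type alignment, exactly as in the hunk above.
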
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 3418437..66e708f 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -49,6 +49,7 @@ enum NodeType {
RETURN,
CallSeqBegin,
CallSeqEnd,
+ CallPrototype,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index b406aa9..86ddd38 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -14,17 +14,19 @@
#include "NVPTX.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXTargetMachine.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "NVPTXGenInstrInfo.inc"
#include "llvm/IR/Function.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include <cstdio>
using namespace llvm;
+// Pin the vtable to this file.
+void NVPTXInstrInfo::anchor() {}
+
// FIXME: Add the subtarget support on this constructor.
NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
: NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index b1972e9..600fc5c 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -26,6 +26,7 @@ namespace llvm {
class NVPTXInstrInfo : public NVPTXGenInstrInfo {
NVPTXTargetMachine &TM;
const NVPTXRegisterInfo RegInfo;
+ virtual void anchor();
public:
explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3e430bf..b23f1e4 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2607,6 +2607,20 @@ def trapinst : NVPTXInst<(outs), (ins),
"trap;",
[(trap)]>;
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype
+ : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+ let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE
+ : NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+
include "NVPTXIntrinsics.td"
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index e57ace9..f8a692e 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -24,10 +24,10 @@ namespace llvm {
/// the ASMPrint interface.
///
class NVPTXSection : public MCSection {
-
+ virtual void anchor();
public:
NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
- ~NVPTXSection() {}
+ virtual ~NVPTXSection() {}
/// Override this as NVPTX has its own way of printing switching
/// to a section.
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
index 83dfe12..b64c308 100644
--- a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
+++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
@@ -36,7 +36,7 @@ bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
BasicBlock::iterator II = IB;
BasicBlock::iterator IE = BI->end();
- // Skit the first intruction. No splitting is needed at this
+  // Skip the first instruction. No splitting is needed at this
// point even if this is a bar.
while (II != IE) {
if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index c4d0d6e..9771a17 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -20,6 +20,9 @@
using namespace llvm;
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
+
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64Bit)
: NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 670077d..004be11 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -25,7 +25,7 @@
namespace llvm {
class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
-
+ virtual void anchor();
std::string TargetName;
NVPTX::DrvInterface drvInterface;
bool Is64Bit;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 72afe8d..46edd6d 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -57,9 +57,6 @@ extern "C" void LLVMInitializeNVPTXTarget() {
RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
- RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32);
- RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64);
-
// FIXME: This pass is really intended to be invoked during IR optimization,
// but it's very NVPTX-specific.
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
@@ -129,6 +126,7 @@ void NVPTXPassConfig::addIRPasses() {
disablePass(&PrologEpilogCodeInserterID);
disablePass(&MachineCopyPropagationID);
disablePass(&BranchFolderPassID);
+ disablePass(&TailDuplicateID);
TargetPassConfig::addIRPasses();
addPass(createGenericToNVVMPass());
@@ -154,10 +152,30 @@ FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
assert(!RegAllocPass && "NVPTX uses no regalloc!");
- addPass(&StrongPHIEliminationID);
+ addPass(&PHIEliminationID);
+ addPass(&TwoAddressInstructionPassID);
}
void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
assert(!RegAllocPass && "NVPTX uses no regalloc!");
- addPass(&StrongPHIEliminationID);
+
+ addPass(&ProcessImplicitDefsID);
+ addPass(&LiveVariablesID);
+ addPass(&MachineLoopInfoID);
+ addPass(&PHIEliminationID);
+
+ addPass(&TwoAddressInstructionPassID);
+ addPass(&RegisterCoalescerID);
+
+ // PreRA instruction scheduling.
+ if (addPass(&MachineSchedulerID))
+ printAndVerify("After Machine Scheduling");
+
+
+ addPass(&StackSlotColoringID);
+
+ // FIXME: Needs physical registers
+ //addPass(&PostRAMachineLICMID);
+
+ printAndVerify("After StackSlotColoring");
}
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index bfd6ab1..2a7394b 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -44,30 +44,10 @@ public:
DwarfMacroInfoSection = 0;
}
- ~NVPTXTargetObjectFile() {
- delete TextSection;
- delete DataSection;
- delete BSSSection;
- delete ReadOnlySection;
-
- delete StaticCtorSection;
- delete StaticDtorSection;
- delete LSDASection;
- delete EHFrameSection;
- delete DwarfAbbrevSection;
- delete DwarfInfoSection;
- delete DwarfLineSection;
- delete DwarfFrameSection;
- delete DwarfPubTypesSection;
- delete DwarfDebugInlineSection;
- delete DwarfStrSection;
- delete DwarfLocSection;
- delete DwarfARangesSection;
- delete DwarfRangesSection;
- delete DwarfMacroInfoSection;
- }
+ virtual ~NVPTXTargetObjectFile();
virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+ TargetLoweringObjectFile::Initialize(ctx, TM);
TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
DataSection =
new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 3cc324b..7406207 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -79,7 +79,7 @@ ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
}
static cl::opt<bool>
-NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true),
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
cl::desc("NVVM reflection, enabled by default"));
char NVVMReflect::ID = 0;
@@ -88,7 +88,7 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
false)
static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"),
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
cl::desc("A list of string=num assignments"),
cl::ValueRequired);
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index a8f7509..fe83fe1 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -13,6 +13,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
@@ -174,6 +175,7 @@ struct PPCOperand;
class PPCAsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+ const MCInstrInfo &MII;
bool IsPPC64;
MCAsmParser &getParser() const { return Parser; }
@@ -218,8 +220,9 @@ class PPCAsmParser : public MCTargetAsmParser {
public:
- PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+ const MCInstrInfo &_MII)
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
// Check for 64-bit vs. 32-bit pointer mode.
Triple TheTriple(STI.getTargetTriple());
IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
@@ -235,6 +238,10 @@ public:
virtual bool ParseDirective(AsmToken DirectiveID);
unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+
+ virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
+ MCSymbolRefExpr::VariantKind,
+ MCContext &Ctx);
};
/// PPCOperand - Instances of this class represent a parsed PowerPC machine
@@ -900,19 +907,19 @@ MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) {
RegNo = PPC::VRSAVE;
IntVal = 256;
return false;
- } else if (Name.substr(0, 1).equals_lower("r") &&
+ } else if (Name.startswith_lower("r") &&
!Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
return false;
- } else if (Name.substr(0, 1).equals_lower("f") &&
+ } else if (Name.startswith_lower("f") &&
!Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
RegNo = FRegs[IntVal];
return false;
- } else if (Name.substr(0, 1).equals_lower("v") &&
+ } else if (Name.startswith_lower("v") &&
!Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
RegNo = VRegs[IntVal];
return false;
- } else if (Name.substr(0, 2).equals_lower("cr") &&
+ } else if (Name.startswith_lower("cr") &&
!Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
RegNo = CRRegs[IntVal];
return false;
@@ -1353,6 +1360,8 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
switch (Kind) {
case MCK_0: ImmVal = 0; break;
case MCK_1: ImmVal = 1; break;
+ case MCK_2: ImmVal = 2; break;
+ case MCK_3: ImmVal = 3; break;
default: return Match_InvalidOperand;
}
@@ -1363,3 +1372,26 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
return Match_InvalidOperand;
}
+const MCExpr *
+PPCAsmParser::applyModifierToExpr(const MCExpr *E,
+ MCSymbolRefExpr::VariantKind Variant,
+ MCContext &Ctx) {
+ switch (Variant) {
+ case MCSymbolRefExpr::VK_PPC_LO:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HI:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HA:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGHER:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGHERA:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGHEST:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+ case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+ return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+ default:
+ return 0;
+ }
+}
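The equals_lower/startswith_lower change above is a drop-in simplification of the prefix test; the numeric suffix is still parsed and range-checked separately. A self-contained sketch of that matching logic, using plain std::string instead of the real StringRef API and an invented helper name:

#include <cassert>
#include <cctype>
#include <string>

// Case-insensitive prefix match followed by parsing the numeric suffix,
// mirroring the startswith_lower + getAsInteger pattern used above.
static bool parsePrefixedReg(const std::string &Name, const std::string &Prefix,
                             unsigned Limit, int64_t &IntVal) {
  if (Name.size() <= Prefix.size())
    return false;
  for (size_t i = 0; i != Prefix.size(); ++i)
    if (std::tolower(static_cast<unsigned char>(Name[i])) !=
        std::tolower(static_cast<unsigned char>(Prefix[i])))
      return false;
  const std::string Suffix = Name.substr(Prefix.size());
  for (char C : Suffix)
    if (!std::isdigit(static_cast<unsigned char>(C)))
      return false;
  IntVal = std::stoll(Suffix);
  return IntVal < Limit;
}

int main() {
  int64_t N;
  assert(parsePrefixedReg("R31", "r", 32, N) && N == 31);
  assert(parsePrefixedReg("cr7", "cr", 8, N) && N == 7);
  assert(!parsePrefixedReg("cr9", "cr", 8, N)); // out of range
  return 0;
}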
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 08d7665..8281b5c 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -18,9 +18,17 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
+// FIXME: Once the integrated assembler supports full register names, tie this
+// to the verbose-asm setting.
+static cl::opt<bool>
+FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
+ cl::desc("Use full register names when printing assembly"));
+
#include "PPCGenAsmWriter.inc"
void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
@@ -78,6 +86,17 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
}
+ // For fast-isel, a COPY_TO_REGCLASS may survive this long. This is
+ // used when converting a 32-bit float to a 64-bit float as part of
+ // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()),
+ // as otherwise we have problems with incorrect register classes
+  // in machine instruction verification. For now, just avoid printing it,
+  // since such an instruction has no effect (a 32-bit float
+ // in a register is already in 64-bit form, just with lower
+ // precision). FIXME: Is there a better solution?
+ if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+ return;
+
printInstruction(MI, O);
printAnnotation(O, Annot);
}
@@ -285,6 +304,9 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
/// stripRegisterPrefix - This method strips the character prefix from a
/// register name so that only the number is left. Used for Linux asm.
static const char *stripRegisterPrefix(const char *RegName) {
+ if (FullRegNames)
+ return RegName;
+
switch (RegName[0]) {
case 'r':
case 'f':
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index 45be471..3efa5ec 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMPowerPCDesc
PPCMCCodeEmitter.cpp
PPCMCExpr.cpp
PPCPredicates.cpp
+ PPCMachObjectWriter.cpp
PPCELFObjectWriter.cpp
)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index b2a8701..0d42081 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -16,9 +16,9 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -69,19 +69,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
}
namespace {
-class PPCMachObjectWriter : public MCMachObjectTargetWriter {
-public:
- PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
- uint32_t CPUSubtype)
- : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
-
- void RecordRelocation(MachObjectWriter *Writer,
- const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue) {
- llvm_unreachable("Relocation emission for MachO/PPC unimplemented!");
- }
-};
class PPCAsmBackend : public MCAsmBackend {
const Target &TheTarget;
@@ -145,14 +132,17 @@ public:
}
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- // Can't emit NOP with size not multiple of 32-bits
- if (Count % 4 != 0)
- return false;
-
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
OW->Write32(0x60000000);
+ switch (Count % 4) {
+ default: break; // No leftover bytes to write
+ case 1: OW->Write8(0); break;
+ case 2: OW->Write16(0); break;
+ case 3: OW->Write16(0); OW->Write8(0); break;
+ }
+
return true;
}
@@ -174,12 +164,11 @@ namespace {
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
bool is64 = getPointerSize() == 8;
- return createMachObjectWriter(new PPCMachObjectWriter(
- /*Is64Bit=*/is64,
- (is64 ? object::mach::CTM_PowerPC64 :
- object::mach::CTM_PowerPC),
- object::mach::CSPPC_ALL),
- OS, /*IsLittleEndian=*/false);
+ return createPPCMachObjectWriter(
+ OS,
+ /*Is64Bit=*/is64,
+ (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
+ MachO::CPU_SUBTYPE_POWERPC_ALL);
}
virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -206,10 +195,9 @@ namespace {
} // end anonymous namespace
-
-
-
-MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
if (Triple(TT).isOSDarwin())
return new DarwinPPCAsmBackend(T);
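For reference, a minimal self-contained sketch of the padding behaviour the writeNopData change above introduces: emit as many 4-byte PPC NOPs as fit, then pad the leftover 1-3 bytes with zeros instead of rejecting the request. ByteWriter is a hypothetical stand-in for MCObjectWriter, not an LLVM API.

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative byte sink; the real code writes through MCObjectWriter.
struct ByteWriter {
  std::vector<uint8_t> Bytes;
  void write8(uint8_t V) { Bytes.push_back(V); }
  void write16(uint16_t V) { write8(V >> 8); write8(V & 0xff); }
  void write32(uint32_t V) { write16(V >> 16); write16(V & 0xffff); }
};

// Mirrors the patched writeNopData: 0x60000000 is the PPC NOP encoding.
static bool writeNopData(uint64_t Count, ByteWriter &OW) {
  uint64_t NumNops = Count / 4;
  for (uint64_t i = 0; i != NumNops; ++i)
    OW.write32(0x60000000);

  switch (Count % 4) {
  default: break;               // no leftover bytes to write
  case 1: OW.write8(0); break;
  case 2: OW.write16(0); break;
  case 3: OW.write16(0); OW.write8(0); break;
  }
  return true;
}

int main() {
  ByteWriter W;
  writeNopData(7, W);           // one NOP plus three zero bytes
  assert(W.Bytes.size() == 7);
  return 0;
}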
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 6822507..f3dddce 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -22,7 +22,6 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
}
IsLittleEndian = false;
- PCSymbol = ".";
CommentString = ";";
ExceptionsType = ExceptionHandling::DwarfCFI;
@@ -47,15 +46,14 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
CommentString = "#";
GlobalPrefix = "";
PrivateGlobalPrefix = ".L";
- WeakRefDirective = "\t.weak\t";
-
+
// Uses '.section' before '.bss' directive
UsesELFSectionDirectiveForBSS = true;
// Debug Information
SupportsDebugInformation = true;
- PCSymbol = ".";
+ DollarIsPC = true;
// Set up DWARF directives
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 7b4ed9f..1530e77 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -15,6 +15,7 @@
#define PPCTARGETASMINFO_H
#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
@@ -24,7 +25,7 @@ namespace llvm {
explicit PPCMCAsmInfoDarwin(bool is64Bit);
};
- class PPCLinuxMCAsmInfo : public MCAsmInfo {
+ class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit PPCLinuxMCAsmInfo(bool is64Bit);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 59ba9c4..346a9be 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -23,6 +23,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
@@ -76,11 +77,17 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const;
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const {
+ // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
+ // It's just a nop to keep the register classes happy, so don't
+ // generate anything.
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == TargetOpcode::COPY_TO_REGCLASS)
+ return;
+
uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
// BL8_NOP etc. all have a size of 8 because of the following 'nop'.
unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value!
- unsigned Opcode = MI.getOpcode();
if (Opcode == PPC::BL8_NOP || Opcode == PPC::BLA8_NOP ||
Opcode == PPC::BL8_NOP_TLS)
Size = 8;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 9529267..d7e8402 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -54,7 +54,7 @@ PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout) const {
MCValue Value;
- if (!getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
+ if (!Layout || !getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
return false;
if (Value.isAbsolute()) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 5f7a39a..f18d095 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -14,13 +14,16 @@
#include "PPCMCTargetDesc.h"
#include "InstPrinter/PPCInstPrinter.h"
#include "PPCMCAsmInfo.h"
+#include "PPCTargetStreamer.h"
#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
@@ -34,6 +37,9 @@
using namespace llvm;
+// Pin the vtable to this file.
+PPCTargetStreamer::~PPCTargetStreamer() {}
+
static MCInstrInfo *createPPCMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitPPCMCInstrInfo(X);
@@ -101,6 +107,29 @@ static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
return X;
}
+namespace {
+class PPCTargetAsmStreamer : public PPCTargetStreamer {
+ formatted_raw_ostream &OS;
+
+public:
+ PPCTargetAsmStreamer(formatted_raw_ostream &OS) : OS(OS) {}
+ virtual void emitTCEntry(const MCSymbol &S) {
+ OS << "\t.tc ";
+ OS << S.getName();
+ OS << "[TC],";
+ OS << S.getName();
+ OS << '\n';
+ }
+};
+
+class PPCTargetELFStreamer : public PPCTargetStreamer {
+ virtual void emitTCEntry(const MCSymbol &S) {
+ // Creates a R_PPC64_TOC relocation
+ Streamer->EmitSymbolValue(&S, 8);
+ }
+};
+}
+
// This is duplicated code. Refactor this.
static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &MAB,
@@ -111,7 +140,20 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
if (Triple(TT).isOSDarwin())
return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
- return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ PPCTargetStreamer *S = new PPCTargetELFStreamer();
+ return createELFStreamer(Ctx, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useLoc, bool useCFI,
+ bool useDwarfDirectory, MCInstPrinter *InstPrint,
+ MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
+ PPCTargetStreamer *S = new PPCTargetAsmStreamer(OS);
+
+ return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+ useDwarfDirectory, InstPrint, CE, TAB,
+ ShowInst);
}
static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
@@ -171,6 +213,11 @@ extern "C" void LLVMInitializePowerPCTargetMC() {
TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
TargetRegistry::RegisterMCObjectStreamer(ThePPC64LETarget, createMCStreamer);
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmStreamer(ThePPC32Target, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(ThePPC64Target, createMCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(ThePPC64LETarget, createMCAsmStreamer);
+
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
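A small sketch of the text form that PPCTargetAsmStreamer::emitTCEntry above writes for a TOC entry; the symbol name and the plain std::string stream are illustrative only, not the streamer's actual plumbing.

#include <iostream>
#include <sstream>
#include <string>

// Builds the same directive the asm streamer above emits for a TOC entry:
//   \t.tc <name>[TC],<name>\n
static std::string buildTCEntry(const std::string &SymName) {
  std::ostringstream OS;
  OS << "\t.tc " << SymName << "[TC]," << SymName << '\n';
  return OS.str();
}

int main() {
  // For a hypothetical symbol "foo" this prints:  .tc foo[TC],foo
  std::cout << buildTCEntry("foo");
  return 0;
}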
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 9f29132..0b0ca24 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -40,12 +40,17 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
/// createPPCELFObjectWriter - Construct an PPC ELF object writer.
MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
bool Is64Bit,
uint8_t OSABI);
+/// createPPCMachObjectWriter - Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
} // End llvm namespace
// Generated files will use "namespace PPC". To avoid symbol clash,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
new file mode 100644
index 0000000..bbafe2e
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -0,0 +1,389 @@
+//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+ bool RecordScatteredRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ unsigned Log2Size, uint64_t &FixedValue);
+
+ void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue);
+
+public:
+ PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+ : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+ /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+ void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+ const MCAsmLayout &Layout, const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ if (Writer->is64Bit()) {
+ report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
+ } else
+ RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ FixedValue);
+ }
+};
+}
+
+/// computes the log2 of the size of the relocation,
+/// used for relocation_info::r_length.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+ switch (Kind) {
+ default:
+ report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
+ case FK_PCRel_1:
+ case FK_Data_1:
+ return 0;
+ case FK_PCRel_2:
+ case FK_Data_2:
+ return 1;
+ case FK_PCRel_4:
+ case PPC::fixup_ppc_brcond14:
+ case PPC::fixup_ppc_half16:
+ case PPC::fixup_ppc_br24:
+ case FK_Data_4:
+ return 2;
+ case FK_PCRel_8:
+ case FK_Data_8:
+ return 3;
+ }
+ return 0;
+}
+
+/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
+/// Outline based on PPCELFObjectWriter::getRelocTypeInner().
+static unsigned getRelocType(const MCValue &Target,
+ const MCFixupKind FixupKind, // from
+ // Fixup.getKind()
+ const bool IsPCRel) {
+ const MCSymbolRefExpr::VariantKind Modifier =
+ Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+ : Target.getSymA()->getKind();
+ // determine the type of the relocation
+ unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+ if (IsPCRel) { // relative to PC
+ switch ((unsigned)FixupKind) {
+ default:
+ report_fatal_error("Unimplemented fixup kind (relative)");
+ case PPC::fixup_ppc_br24:
+ Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
+ break;
+ case PPC::fixup_ppc_brcond14:
+ Type = MachO::PPC_RELOC_BR14;
+ break;
+ case PPC::fixup_ppc_half16:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unsupported modifier for half16 fixup");
+ case MCSymbolRefExpr::VK_PPC_HA:
+ Type = MachO::PPC_RELOC_HA16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = MachO::PPC_RELOC_LO16;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HI:
+ Type = MachO::PPC_RELOC_HI16;
+ break;
+ }
+ break;
+ }
+ } else {
+ switch ((unsigned)FixupKind) {
+ default:
+ report_fatal_error("Unimplemented fixup kind (absolute)!");
+ case PPC::fixup_ppc_half16:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unsupported modifier for half16 fixup");
+ case MCSymbolRefExpr::VK_PPC_HA:
+ Type = MachO::PPC_RELOC_HA16_SECTDIFF;
+ break;
+ case MCSymbolRefExpr::VK_PPC_LO:
+ Type = MachO::PPC_RELOC_LO16_SECTDIFF;
+ break;
+ case MCSymbolRefExpr::VK_PPC_HI:
+ Type = MachO::PPC_RELOC_HI16_SECTDIFF;
+ break;
+ }
+ break;
+ case FK_Data_4:
+ break;
+ case FK_Data_2:
+ break;
+ }
+ }
+ return Type;
+}
+
+static void makeRelocationInfo(MachO::any_relocation_info &MRE,
+ const uint32_t FixupOffset, const uint32_t Index,
+ const unsigned IsPCRel, const unsigned Log2Size,
+ const unsigned IsExtern, const unsigned Type) {
+ MRE.r_word0 = FixupOffset;
+ // The bitfield offsets that work (as determined by trial-and-error)
+ // are different than what is documented in the mach-o manuals.
+ // This appears to be an endianness issue; reversing the order of the
+ // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+ // breaks x86/ARM assembly).
+ MRE.r_word1 = ((Index << 8) | // was << 0
+ (IsPCRel << 7) | // was << 24
+ (Log2Size << 5) | // was << 25
+ (IsExtern << 4) | // was << 27
+ (Type << 0)); // was << 28
+}
+
+static void
+makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
+ const uint32_t Addr, const unsigned Type,
+ const unsigned Log2Size, const unsigned IsPCRel,
+ const uint32_t Value2) {
+ // For notes on bitfield positions and endianness, see:
+ // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
+ MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) |
+ (IsPCRel << 30) | MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
+}
+
+/// Compute fixup offset (address).
+static uint32_t getFixupOffset(const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup) {
+ uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  // On Mach-O, fixup_ppc_half16 relocations must refer to the start of the
+  // instruction, not to the second halfword as they do on ELF.
+ if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+ FixupOffset &= ~uint32_t(3);
+ return FixupOffset;
+}
+
+/// \return false if falling back to using non-scattered relocation,
+/// otherwise true for normal scattered relocation.
+/// based on X86MachObjectWriter::RecordScatteredRelocation
+/// and ARMMachObjectWriter::RecordScatteredRelocation
+bool PPCMachObjectWriter::RecordScatteredRelocation(
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ unsigned Log2Size, uint64_t &FixedValue) {
+ // caller already computes these, can we just pass and reuse?
+ const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+ const MCFixupKind FK = Fixup.getKind();
+ const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+ const unsigned Type = getRelocType(Target, FK, IsPCRel);
+
+ // Is this a local or SECTDIFF relocation entry?
+ // SECTDIFF relocation entries have symbol subtractions,
+ // and require two entries, the first for the add-symbol value,
+ // the second for the subtract-symbol value.
+
+ // See <reloc.h>.
+ const MCSymbol *A = &Target.getSymA()->getSymbol();
+ MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+ if (!A_SD->getFragment())
+ report_fatal_error("symbol '" + A->getName() +
+ "' can not be undefined in a subtraction expression");
+
+ uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+ uint64_t SecAddr =
+ Writer->getSectionAddress(A_SD->getFragment()->getParent());
+ FixedValue += SecAddr;
+ uint32_t Value2 = 0;
+
+ if (const MCSymbolRefExpr *B = Target.getSymB()) {
+ MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+ if (!B_SD->getFragment())
+ report_fatal_error("symbol '" + B->getSymbol().getName() +
+ "' can not be undefined in a subtraction expression");
+
+ // FIXME: is Type correct? see include/llvm/Support/MachO.h
+ Value2 = Writer->getSymbolAddress(B_SD, Layout);
+ FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+ }
+ // FIXME: does FixedValue get used??
+
+ // Relocations are written out in reverse order, so the PAIR comes first.
+ if (Type == MachO::PPC_RELOC_SECTDIFF ||
+ Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
+ Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
+ Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
+ Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
+ Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
+ // X86 had this piece, but ARM does not
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) {
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") +
+ Buffer + ") into 24 bits of scattered "
+ "relocation entry.");
+ llvm_unreachable("fatal error returned?!");
+ }
+
+ // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
+ // see PPCMCExpr::EvaluateAsRelocatableImpl()
+ uint32_t other_half = 0;
+ switch (Type) {
+ case MachO::PPC_RELOC_LO16_SECTDIFF:
+ other_half = (FixedValue >> 16) & 0xffff;
+      // applyFixupOffset no longer extracts the high part because it now assumes
+ // this was already done.
+ // It looks like this is not true for the FixedValue needed with Mach-O
+ // relocs.
+ // So we need to adjust FixedValue again here.
+ FixedValue &= 0xffff;
+ break;
+ case MachO::PPC_RELOC_HA16_SECTDIFF:
+ other_half = FixedValue & 0xffff;
+ FixedValue =
+ ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
+ break;
+ case MachO::PPC_RELOC_HI16_SECTDIFF:
+ other_half = FixedValue & 0xffff;
+ FixedValue = (FixedValue >> 16) & 0xffff;
+ break;
+ default:
+ llvm_unreachable("Invalid PPC scattered relocation type.");
+ break;
+ }
+
+ MachO::any_relocation_info MRE;
+ makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
+ Log2Size, IsPCRel, Value2);
+ Writer->addRelocation(Fragment->getParent(), MRE);
+ } else {
+ // If the offset is more than 24-bits, it won't fit in a scattered
+ // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff)
+ return false;
+ }
+ MachO::any_relocation_info MRE;
+ makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
+ Writer->addRelocation(Fragment->getParent(), MRE);
+ return true;
+}
+
+// see PPCELFObjectWriter for a general outline of cases
+void PPCMachObjectWriter::RecordPPCRelocation(
+ MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ const MCFixupKind FK = Fixup.getKind(); // unsigned
+ const unsigned Log2Size = getFixupKindLog2Size(FK);
+ const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+ const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
+
+ // If this is a difference or a defined symbol plus an offset, then we need a
+ // scattered relocation entry. Differences always require scattered
+ // relocations.
+ if (Target.getSymB() &&
+ // Q: are branch targets ever scattered?
+ RelocType != MachO::PPC_RELOC_BR24 &&
+ RelocType != MachO::PPC_RELOC_BR14) {
+ RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+ Log2Size, FixedValue);
+ return;
+ }
+
+ // this doesn't seem right for RIT_PPC_BR24
+ // Get the symbol data, if any.
+ MCSymbolData *SD = 0;
+ if (Target.getSymA())
+ SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+ // See <reloc.h>.
+ const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+ unsigned Index = 0;
+ unsigned IsExtern = 0;
+ unsigned Type = RelocType;
+
+ if (Target.isAbsolute()) { // constant
+ // SymbolNum of 0 indicates the absolute section.
+ //
+ // FIXME: Currently, these are never generated (see code below). I cannot
+ // find a case where they are actually emitted.
+ report_fatal_error("FIXME: relocations to absolute targets "
+ "not yet implemented");
+ // the above line stolen from ARM, not sure
+ } else {
+ // Resolve constant variables.
+ if (SD->getSymbol().isVariable()) {
+ int64_t Res;
+ if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+ Res, Layout, Writer->getSectionAddressMap())) {
+ FixedValue = Res;
+ return;
+ }
+ }
+
+ // Check whether we need an external or internal relocation.
+ if (Writer->doesSymbolRequireExternRelocation(SD)) {
+ IsExtern = 1;
+ Index = SD->getIndex();
+ // For external relocations, make sure to offset the fixup value to
+ // compensate for the addend of the symbol address, if it was
+ // undefined. This occurs with weak definitions, for example.
+ if (!SD->Symbol->isUndefined())
+ FixedValue -= Layout.getSymbolOffset(SD);
+ } else {
+ // The index is the section ordinal (1-based).
+ const MCSectionData &SymSD =
+ Asm.getSectionData(SD->getSymbol().getSection());
+ Index = SymSD.getOrdinal() + 1;
+ FixedValue += Writer->getSectionAddress(&SymSD);
+ }
+ if (IsPCRel)
+ FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+ }
+
+ // struct relocation_info (8 bytes)
+ MachO::any_relocation_info MRE;
+ makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern,
+ Type);
+ Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype) {
+ return createMachObjectWriter(
+ new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS,
+ /*IsLittleEndian=*/false);
+}
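The new PPCMachObjectWriter above encodes relocations two ways: plain entries packed by makeRelocationInfo, and scattered SECTDIFF entries whose half16 values are split into @ha/@l halves. A minimal standalone sketch of both computations follows; the sample values are invented and the struct is a stand-in for MachO::any_relocation_info.

#include <cassert>
#include <cstdint>

// Stand-in for MachO::any_relocation_info (two 32-bit words).
struct RelocInfo { uint32_t r_word0, r_word1; };

// Same packing as makeRelocationInfo above: index in the top 24 bits, then
// pcrel (1 bit), log2 size (2 bits), extern (1 bit), type (4 bits).
static RelocInfo packRelocationInfo(uint32_t FixupOffset, uint32_t Index,
                                    unsigned IsPCRel, unsigned Log2Size,
                                    unsigned IsExtern, unsigned Type) {
  RelocInfo MRE;
  MRE.r_word0 = FixupOffset;
  MRE.r_word1 = (Index << 8) | (IsPCRel << 7) | (Log2Size << 5) |
                (IsExtern << 4) | (Type << 0);
  return MRE;
}

// The lo16/ha16 split used by the SECTDIFF cases: @l is the low halfword,
// @ha is the high halfword plus one when bit 15 of the low half is set,
// so that (ha << 16) + (int16_t)lo reconstructs the original value.
static uint16_t lo16(uint32_t V) { return V & 0xffff; }
static uint16_t ha16(uint32_t V) {
  return ((V >> 16) + ((V & 0x8000) ? 1 : 0)) & 0xffff;
}

int main() {
  // Arbitrary example values, not taken from a real object file.
  RelocInfo R = packRelocationInfo(/*FixupOffset=*/0x40, /*Index=*/3,
                                   /*IsPCRel=*/1, /*Log2Size=*/2,
                                   /*IsExtern=*/1, /*Type=*/0);
  assert(R.r_word1 == ((3u << 8) | (1u << 7) | (2u << 5) | (1u << 4)));

  uint32_t Addr = 0x1234ABCD;
  uint32_t Rebuilt = ((uint32_t)ha16(Addr) << 16) + (int16_t)lo16(Addr);
  assert(Rebuilt == Addr);
  return 0;
}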
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index 806822c..54e3d40 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -57,6 +57,8 @@ def FeatureMFOCRF : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
"Enable the MFOCRF instruction">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
"Enable the fsqrt instruction">;
+def FeatureFCPSGN : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
+ "Enable the fcpsgn instruction">;
def FeatureFRE : SubtargetFeature<"fre", "HasFRE", "true",
"Enable the fre instruction">;
def FeatureFRES : SubtargetFeature<"fres", "HasFRES", "true",
@@ -85,6 +87,13 @@ def FeatureBookE : SubtargetFeature<"booke", "IsBookE", "true",
"Enable Book E instructions">;
def FeatureQPX : SubtargetFeature<"qpx","HasQPX", "true",
"Enable QPX instructions">;
+def FeatureVSX : SubtargetFeature<"vsx","HasVSX", "true",
+ "Enable VSX instructions">;
+
+def DeprecatedMFTB : SubtargetFeature<"", "DeprecatedMFTB", "true",
+ "Treat mftb as deprecated">;
+def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
+ "Treat vector data stream cache control instructions as deprecated">;
// Note: Future features to add when support is extended to more
// recent ISA levels:
@@ -146,10 +155,10 @@ include "PPCInstrInfo.td"
def : Processor<"generic", G3Itineraries, [Directive32]>;
def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
FeatureFRES, FeatureFRSQRTE,
- FeatureBookE]>;
+ FeatureBookE, DeprecatedMFTB]>;
def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
FeatureFRES, FeatureFRSQRTE,
- FeatureBookE]>;
+ FeatureBookE, DeprecatedMFTB]>;
def : Processor<"601", G3Itineraries, [Directive601]>;
def : Processor<"602", G3Itineraries, [Directive602]>;
def : Processor<"603", G3Itineraries, [Directive603,
@@ -185,29 +194,32 @@ def : ProcessorModel<"g5", G5Model,
[Directive970, FeatureAltivec,
FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
FeatureFRES, FeatureFRSQRTE,
- Feature64Bit /*, Feature64BitRegs */]>;
+ Feature64Bit /*, Feature64BitRegs */,
+ DeprecatedMFTB, DeprecatedDST]>;
def : ProcessorModel<"e500mc", PPCE500mcModel,
[DirectiveE500mc, FeatureMFOCRF,
- FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+ FeatureSTFIWX, FeatureBookE, FeatureISEL,
+ DeprecatedMFTB]>;
def : ProcessorModel<"e5500", PPCE5500Model,
[DirectiveE5500, FeatureMFOCRF, Feature64Bit,
- FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+ FeatureSTFIWX, FeatureBookE, FeatureISEL,
+ DeprecatedMFTB]>;
def : ProcessorModel<"a2", PPCA2Model,
[DirectiveA2, FeatureBookE, FeatureMFOCRF,
- FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
- /*, Feature64BitRegs */]>;
+ /*, Feature64BitRegs */, DeprecatedMFTB]>;
def : ProcessorModel<"a2q", PPCA2Model,
[DirectiveA2, FeatureBookE, FeatureMFOCRF,
- FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
- /*, Feature64BitRegs */, FeatureQPX]>;
+ /*, Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>;
def : ProcessorModel<"pwr3", G5Model,
[DirectivePwr3, FeatureAltivec,
FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
@@ -220,32 +232,37 @@ def : ProcessorModel<"pwr5", G5Model,
[DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES,
- FeatureSTFIWX, Feature64Bit]>;
+ FeatureSTFIWX, Feature64Bit,
+ DeprecatedMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr5x", G5Model,
[DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES,
- FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
+ FeatureSTFIWX, FeatureFPRND, Feature64Bit,
+ DeprecatedMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr6", G5Model,
[DirectivePwr6, FeatureAltivec,
- FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+ FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
- FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
+ FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
+ DeprecatedMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr6x", G5Model,
[DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
- FeatureFSqrt, FeatureFRE, FeatureFRES,
+ FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
FeatureSTFIWX, FeatureLFIWAX,
- FeatureFPRND, Feature64Bit]>;
+ FeatureFPRND, Feature64Bit,
+ DeprecatedMFTB, DeprecatedDST]>;
def : ProcessorModel<"pwr7", G5Model,
[DirectivePwr7, FeatureAltivec,
- FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+ FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
FeatureFPRND, FeatureFPCVT, FeatureISEL,
FeaturePOPCNTD, FeatureLDBRX,
- Feature64Bit /*, Feature64BitRegs */]>;
+ Feature64Bit /*, Feature64BitRegs */,
+ DeprecatedMFTB, DeprecatedDST]>;
def : Processor<"ppc", G3Itineraries, [Directive32]>;
def : ProcessorModel<"ppc64", G5Model,
[Directive64, FeatureAltivec,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index bbfad87..ada34ed 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -23,6 +23,7 @@
#include "MCTargetDesc/PPCMCExpr.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
+#include "PPCTargetStreamer.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -202,7 +203,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
.getGVStubEntry(SymToPrint);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
} else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
GV->hasAvailableExternallyLinkage()) {
SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
@@ -212,12 +213,12 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
getHiddenGVStubEntry(SymToPrint);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
} else {
- SymToPrint = Mang->getSymbol(GV);
+ SymToPrint = getSymbol(GV);
}
} else {
- SymToPrint = Mang->getSymbol(GV);
+ SymToPrint = getSymbol(GV);
}
O << *SymToPrint;
@@ -363,7 +364,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
MCSymbol *MOSymbol = 0;
if (MO.isGlobal())
- MOSymbol = Mang->getSymbol(MO.getGlobal());
+ MOSymbol = getSymbol(MO.getGlobal());
else if (MO.isCPI())
MOSymbol = GetCPISymbol(MO.getIndex());
else if (MO.isJTI())
@@ -402,7 +403,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
const GlobalValue *RealGValue = GAlias ?
GAlias->resolveAliasedGlobal(false) : GValue;
- MOSymbol = Mang->getSymbol(RealGValue);
+ MOSymbol = getSymbol(RealGValue);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
IsExternal = GVar && !GVar->hasInitializer();
IsCommon = GVar && RealGValue->hasCommonLinkage();
@@ -413,7 +414,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isJTI())
MOSymbol = GetJTISymbol(MO.getIndex());
- if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI())
+ if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI() ||
+ TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
const MCExpr *Exp =
@@ -438,18 +440,22 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MO.isJTI())
MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
- else if (MO.isCPI())
+ else if (MO.isCPI()) {
MOSymbol = GetCPISymbol(MO.getIndex());
+ if (TM.getCodeModel() == CodeModel::Large)
+ MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+ }
else if (MO.isGlobal()) {
const GlobalValue *GValue = MO.getGlobal();
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
const GlobalValue *RealGValue = GAlias ?
GAlias->resolveAliasedGlobal(false) : GValue;
- MOSymbol = Mang->getSymbol(RealGValue);
+ MOSymbol = getSymbol(RealGValue);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
- RealGValue->hasAvailableExternallyLinkage())
+ RealGValue->hasAvailableExternallyLinkage() ||
+ TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
}
@@ -479,14 +485,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
const GlobalValue *RealGValue = GAlias ?
GAlias->resolveAliasedGlobal(false) : GValue;
- MOSymbol = Mang->getSymbol(RealGValue);
+ MOSymbol = getSymbol(RealGValue);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
IsExternal = GVar && !GVar->hasInitializer();
IsFunction = !GVar;
} else if (MO.isCPI())
MOSymbol = GetCPISymbol(MO.getIndex());
- if (IsFunction || IsExternal)
+ if (IsFunction || IsExternal || TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
const MCExpr *Exp =
@@ -502,7 +508,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTprel =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
OutContext);
@@ -520,7 +526,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
TmpInst.setOpcode(PPC::LD);
const MachineOperand &MO = MI->getOperand(1);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *Exp =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
OutContext);
@@ -534,7 +540,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTlsGD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
OutContext);
@@ -550,7 +556,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTlsGD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO,
OutContext);
@@ -571,7 +577,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymVar =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
OutContext);
@@ -586,7 +592,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTlsLD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
OutContext);
@@ -602,7 +608,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymGotTlsLD =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO,
OutContext);
@@ -623,7 +629,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymVar =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
OutContext);
@@ -638,7 +644,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymDtprel =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
OutContext);
@@ -654,7 +660,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
const MachineOperand &MO = MI->getOperand(2);
const GlobalValue *GValue = MO.getGlobal();
- MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+ MCSymbol *MOSymbol = getSymbol(GValue);
const MCExpr *SymDtprel =
MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
OutContext);
@@ -704,6 +710,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
break;
case PPC::LD:
case PPC::STD:
+ case PPC::LWA_32:
case PPC::LWA: {
// Verify alignment is legal, so we don't create relocations
// that can't be supported.
@@ -765,6 +772,9 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
bool isPPC64 = TD->getPointerSizeInBits() == 64;
+ PPCTargetStreamer &TS =
+ static_cast<PPCTargetStreamer &>(OutStreamer.getTargetStreamer());
+
if (isPPC64 && !TOC.empty()) {
const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc",
ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
@@ -775,7 +785,7 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
E = TOC.end(); I != E; ++I) {
OutStreamer.EmitLabel(I->second);
MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
- OutStreamer.EmitTCEntry(*S);
+ TS.emitTCEntry(*S);
}
}
@@ -1051,7 +1061,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
MCSymbol *NLPSym = GetSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
MMIMacho.getGVStubEntry(NLPSym);
- StubSym = MachineModuleInfoImpl::StubValueTy(Mang->getSymbol(*I), true);
+ StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
}
}
}
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 4e30c537..4224ae2 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -253,12 +253,19 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
case Intrinsic::sin:
case Intrinsic::cos:
return true;
+ case Intrinsic::copysign:
+ if (CI->getArgOperand(0)->getType()->getScalarType()->
+ isPPC_FP128Ty())
+ return true;
+ else
+ continue; // ISD::FCOPYSIGN is never a library call.
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
}
}
@@ -283,8 +290,9 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
default: return true;
case LibFunc::copysign:
case LibFunc::copysignf:
- case LibFunc::copysignl:
continue; // ISD::FCOPYSIGN is never a library call.
+ case LibFunc::copysignl:
+ return true;
case LibFunc::fabs:
case LibFunc::fabsf:
case LibFunc::fabsl:
@@ -309,6 +317,10 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
case LibFunc::rintf:
case LibFunc::rintl:
Opcode = ISD::FRINT; break;
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ Opcode = ISD::FROUND; break;
case LibFunc::trunc:
case LibFunc::truncf:
case LibFunc::truncl:
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index a584188..e8e7f4c 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -37,6 +37,37 @@ def RetCC_PPC : CallingConv<[
]>;
+// Note that we don't currently have calling conventions for 64-bit
+// PowerPC, but handle all the complexities of the ABI in the lowering
+// logic. FIXME: See if the logic can be simplified with use of CCs.
+// This may require some extensions to current table generation.
+
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats. All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+ CCIfType<[i8], CCPromoteToType<i64>>,
+ CCIfType<[i16], CCPromoteToType<i64>>,
+ CCIfType<[i32], CCPromoteToType<i64>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+ CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
+// Simple return-value convention for 64-bit ELF PowerPC fast isel.
+// All small ints are promoted to i64. Vector types, quadword ints,
+// and multiple register returns are "supported" to avoid compile
+// errors, but none are handled by the fast selector.
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+ CCIfType<[i8], CCPromoteToType<i64>>,
+ CCIfType<[i16], CCPromoteToType<i64>>,
+ CCIfType<[i32], CCPromoteToType<i64>>,
+ CCIfType<[i64], CCAssignToReg<[X3, X4]>>,
+ CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+ CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
+ CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
+ CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
//===----------------------------------------------------------------------===//
// PowerPC System V Release 4 32-bit ABI
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index 8cbf1fb..09117e7 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -37,6 +37,25 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
+//===----------------------------------------------------------------------===//
+//
+// TBD:
+// FastLowerArguments: Handle simple cases.
+// PPCMaterializeGV: Handle TLS.
+// SelectCall: Handle function pointers.
+// SelectCall: Handle multi-register return values.
+// SelectCall: Optimize away nops for local calls.
+// processCallArgs: Handle bit-converted arguments.
+// finishCall: Handle multi-register return values.
+// PPCComputeAddress: Handle parameter references as FrameIndex's.
+// PPCEmitCmp: Handle immediate as operand 1.
+// SelectCall: Handle small byval arguments.
+// SelectIntrinsicCall: Implement.
+// SelectSelect: Implement.
+// Consider factoring isTypeLegal into the base class.
+// Implement switches and jump tables.
+//
+//===----------------------------------------------------------------------===//
using namespace llvm;
namespace {
@@ -52,7 +71,7 @@ typedef struct Address {
int FI;
} Base;
- int Offset;
+ long Offset;
// Innocuous defaults for our address.
Address()
@@ -89,15 +108,76 @@ class PPCFastISel : public FastISel {
virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI);
virtual bool FastLowerArguments();
+ virtual unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm);
+ virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ uint64_t Imm);
+ virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill);
+ virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+
+ // Instruction selection routines.
+ private:
+ bool SelectLoad(const Instruction *I);
+ bool SelectStore(const Instruction *I);
+ bool SelectBranch(const Instruction *I);
+ bool SelectIndirectBr(const Instruction *I);
+ bool SelectCmp(const Instruction *I);
+ bool SelectFPExt(const Instruction *I);
+ bool SelectFPTrunc(const Instruction *I);
+ bool SelectIToFP(const Instruction *I, bool IsSigned);
+ bool SelectFPToI(const Instruction *I, bool IsSigned);
+ bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+ bool SelectCall(const Instruction *I);
+ bool SelectRet(const Instruction *I);
+ bool SelectTrunc(const Instruction *I);
+ bool SelectIntExt(const Instruction *I);
// Utility routines.
private:
+ bool isTypeLegal(Type *Ty, MVT &VT);
+ bool isLoadTypeLegal(Type *Ty, MVT &VT);
+ bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
+ bool isZExt, unsigned DestReg);
+ bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ const TargetRegisterClass *RC, bool IsZExt = true,
+ unsigned FP64LoadOpc = PPC::LFD);
+ bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
+ bool PPCComputeAddress(const Value *Obj, Address &Addr);
+ void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+ unsigned &IndexReg);
+ bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ unsigned DestReg, bool IsZExt);
unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
unsigned PPCMaterializeInt(const Constant *C, MVT VT);
unsigned PPCMaterialize32BitInt(int64_t Imm,
const TargetRegisterClass *RC);
unsigned PPCMaterialize64BitInt(int64_t Imm,
const TargetRegisterClass *RC);
+ unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
+ unsigned SrcReg, bool IsSigned);
+ unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
+
+ // Call handling routines.
+ private:
+ bool processCallArgs(SmallVectorImpl<Value*> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs,
+ CallingConv::ID CC,
+ unsigned &NumBytes,
+ bool IsVarArg);
+ void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC,
+ unsigned &NumBytes, bool IsVarArg);
+ CCAssignFn *usePPC32CCs(unsigned Flag);
private:
#include "PPCGenFastISel.inc"
@@ -106,10 +186,1601 @@ class PPCFastISel : public FastISel {
} // end anonymous namespace
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+ if (Flag == 1)
+ return CC_PPC32_SVR4;
+ else if (Flag == 2)
+ return CC_PPC32_SVR4_ByVal;
+ else if (Flag == 3)
+ return CC_PPC32_SVR4_VarArg;
+ else
+ return RetCC_PPC;
+}
+
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ // These are not representable with any single compare.
+ case CmpInst::FCMP_FALSE:
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_UGT:
+ case CmpInst::FCMP_UGE:
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ case CmpInst::FCMP_UNE:
+ case CmpInst::FCMP_TRUE:
+ default:
+ return Optional<PPC::Predicate>();
+
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::ICMP_EQ:
+ return PPC::PRED_EQ;
+
+ case CmpInst::FCMP_OGT:
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_SGT:
+ return PPC::PRED_GT;
+
+ case CmpInst::FCMP_OGE:
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SGE:
+ return PPC::PRED_GE;
+
+ case CmpInst::FCMP_OLT:
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_SLT:
+ return PPC::PRED_LT;
+
+ case CmpInst::FCMP_OLE:
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SLE:
+ return PPC::PRED_LE;
+
+ case CmpInst::FCMP_ONE:
+ case CmpInst::ICMP_NE:
+ return PPC::PRED_NE;
+
+ case CmpInst::FCMP_ORD:
+ return PPC::PRED_NU;
+
+ case CmpInst::FCMP_UNO:
+ return PPC::PRED_UN;
+ }
+}
+
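For reference, an editor's illustration (not part of the patch) of why the unordered FP predicates above map to no single PPC::Predicate: a PowerPC fcmpu sets exactly one of four mutually exclusive CR bits (LT, GT, EQ, UN), and a conditional branch tests a single bit, so FCMP_OEQ is just the EQ bit while FCMP_UEQ would need "EQ or UN", i.e. two bits and two branches.

    #include <cmath>
    #include <cstdio>

    // Model of one CR field as written by fcmpu.
    struct CRField { bool LT, GT, EQ, UN; };

    static CRField fcmpu(double A, double B) {
      if (std::isnan(A) || std::isnan(B)) return {false, false, false, true};
      if (A < B) return {true, false, false, false};
      if (A > B) return {false, true, false, false};
      return {false, false, true, false};
    }

    int main() {
      CRField CR = fcmpu(1.0, NAN);
      bool OEQ = CR.EQ;            // one bit: representable, getComparePred gives PRED_EQ
      bool UEQ = CR.EQ || CR.UN;   // two bits: not representable, falls into the default case
      printf("OEQ=%d UEQ=%d\n", OEQ, UEQ);  // prints: OEQ=0 UEQ=1
      return 0;
    }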
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel, and return its equivalent machine type in VT.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+ EVT Evt = TLI.getValueType(Ty, true);
+
+ // Only handle simple types.
+ if (Evt == MVT::Other || !Evt.isSimple()) return false;
+ VT = Evt.getSimpleVT();
+
+ // Handle all legal types, i.e. a register that will directly hold this
+ // value.
+ return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+ if (isTypeLegal(Ty, VT)) return true;
+
+ // If this is a type that can be sign- or zero-extended to a basic operation,
+ // go ahead and accept it now.
+ if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+ return true;
+ }
+
+ return false;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address. Return false if we can't handle it.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+ const User *U = NULL;
+ unsigned Opcode = Instruction::UserOp1;
+ if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+ // Don't walk into other basic blocks unless the object is an alloca from
+ // another block, otherwise it may not have a virtual register assigned.
+ if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+ FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+ Opcode = I->getOpcode();
+ U = I;
+ }
+ } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default:
+ break;
+ case Instruction::BitCast:
+ // Look through bitcasts.
+ return PPCComputeAddress(U->getOperand(0), Addr);
+ case Instruction::IntToPtr:
+ // Look past no-op inttoptrs.
+ if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return PPCComputeAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::PtrToInt:
+ // Look past no-op ptrtoints.
+ if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return PPCComputeAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::GetElementPtr: {
+ Address SavedAddr = Addr;
+ long TmpOffset = Addr.Offset;
+
+ // Iterate through the GEP folding the constants into offsets where
+ // we can.
+ gep_type_iterator GTI = gep_type_begin(U);
+ for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+ II != IE; ++II, ++GTI) {
+ const Value *Op = *II;
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = TD.getStructLayout(STy);
+ unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+ TmpOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+ for (;;) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // Constant-offset addressing.
+ TmpOffset += CI->getSExtValue() * S;
+ break;
+ }
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
+ ConstantInt *CI =
+ cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+ TmpOffset += CI->getSExtValue() * S;
+ // Iterate on the other operand.
+ Op = cast<AddOperator>(Op)->getOperand(0);
+ continue;
+ }
+ // Unsupported
+ goto unsupported_gep;
+ }
+ }
+ }
+
+ // Try to grab the base operand now.
+ Addr.Offset = TmpOffset;
+ if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+ // We failed, restore everything and try the other options.
+ Addr = SavedAddr;
+
+ unsupported_gep:
+ break;
+ }
+ case Instruction::Alloca: {
+ const AllocaInst *AI = cast<AllocaInst>(Obj);
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ Addr.BaseType = Address::FrameIndexBase;
+ Addr.Base.FI = SI->second;
+ return true;
+ }
+ break;
+ }
+ }
+
+ // FIXME: References to parameters fall through to the behavior
+ // below. They should be able to reference a frame index since
+ // they are stored to the stack, so we can get "ld rx, offset(r1)"
+ // instead of "addi ry, r1, offset / ld rx, 0(ry)". Obj will
+ // just contain the parameter. Try to handle this with a FI.
+
+ // Try to get this in a register if nothing else has worked.
+ if (Addr.Base.Reg == 0)
+ Addr.Base.Reg = getRegForValue(Obj);
+
+ // Prevent assignment of base register to X0, which is inappropriate
+ // for loads and stores alike.
+ if (Addr.Base.Reg != 0)
+ MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+ return Addr.Base.Reg != 0;
+}
+
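As an aside, a minimal standalone sketch (not part of the patch; the field offset below is assumed for illustration, the real one comes from getStructLayout) of the constant-offset folding the GetElementPtr case performs, so a whole GEP chain can collapse into one displacement:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // gep %p, 0, 1, 3 on struct { i32 a; i64 b[8]; }:
      int64_t Offset = 0;
      Offset += 8;        // getElementOffset(1): field b (assumed layout)
      Offset += 3 * 8;    // constant array index 3 * getTypeAllocSize(i64)
      printf("folded displacement = %lld\n", (long long)Offset);  // prints: 32
      return 0;
    }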
+// Fix up some addresses that can't be used directly. For example, if
+// an offset won't fit in an instruction field, we may need to move it
+// into an index register.
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+ unsigned &IndexReg) {
+
+ // Check whether the offset fits in the instruction field.
+ if (!isInt<16>(Addr.Offset))
+ UseOffset = false;
+
+ // If this is a frame-index base and the offset needs to be simplified, then
+ // put the alloca address into a register, set the base type back to
+ // register, and continue. This should almost never happen.
+ if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
+ unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+ ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
+ Addr.Base.Reg = ResultReg;
+ Addr.BaseType = Address::RegBase;
+ }
+
+ if (!UseOffset) {
+ IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
+ : Type::getInt64Ty(*Context));
+ const ConstantInt *Offset =
+ ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
+ IndexReg = PPCMaterializeInt(Offset, MVT::i64);
+ assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
+ }
+}
+
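A small sketch (editor's illustration, not part of the patch) of the two displacement checks that drive this simplification and the load/store emission below: the offset must fit the signed 16-bit D field, and the DS-form instructions (ld, std, lwa) additionally require a 4-byte-aligned offset, otherwise the indexed (X-form) path is used.

    #include <cstdint>
    #include <cstdio>

    static bool fitsD(int64_t Off) {              // what isInt<16>(Addr.Offset) tests
      return Off >= -32768 && Off <= 32767;
    }
    static bool fitsDS(int64_t Off) {             // ld/std/lwa also need (Off & 3) == 0
      return fitsD(Off) && (Off & 3) == 0;
    }

    int main() {
      printf("%d %d %d\n", fitsD(32764), fitsDS(32766), fitsD(40000));
      // prints: 1 0 0 -- 32766 pushes LD onto the indexed form, 40000 pushes everything onto it
      return 0;
    }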
+// Emit a load instruction if possible, returning true if we succeeded,
+// otherwise false. See commentary below for how the register class of
+// the load is determined.
+bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+ const TargetRegisterClass *RC,
+ bool IsZExt, unsigned FP64LoadOpc) {
+ unsigned Opc;
+ bool UseOffset = true;
+
+ // If ResultReg is given, it determines the register class of the load.
+ // Otherwise, RC is the register class to use. If the result of the
+ // load isn't anticipated in this block, both may be zero, in which
+ // case we must make a conservative guess. In particular, don't assign
+ // R0 or X0 to the result register, as the result may be used in a load,
+ // store, add-immediate, or isel that won't permit this. (Though
+ // perhaps the spill and reload of live-exit values would handle this?)
+ const TargetRegisterClass *UseRC =
+ (ResultReg ? MRI.getRegClass(ResultReg) :
+ (RC ? RC :
+ (VT == MVT::f64 ? &PPC::F8RCRegClass :
+ (VT == MVT::f32 ? &PPC::F4RCRegClass :
+ (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+ &PPC::GPRC_and_GPRC_NOR0RegClass)))));
+
+ bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+ switch (VT.SimpleTy) {
+ default: // e.g., vector types not handled
+ return false;
+ case MVT::i8:
+ Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
+ break;
+ case MVT::i16:
+ Opc = (IsZExt ?
+ (Is32BitInt ? PPC::LHZ : PPC::LHZ8) :
+ (Is32BitInt ? PPC::LHA : PPC::LHA8));
+ break;
+ case MVT::i32:
+ Opc = (IsZExt ?
+ (Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
+ (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
+ if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
+ UseOffset = false;
+ break;
+ case MVT::i64:
+ Opc = PPC::LD;
+ assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) &&
+ "64-bit load with 32-bit target??");
+ UseOffset = ((Addr.Offset & 3) == 0);
+ break;
+ case MVT::f32:
+ Opc = PPC::LFS;
+ break;
+ case MVT::f64:
+ Opc = FP64LoadOpc;
+ break;
+ }
+
+ // If necessary, materialize the offset into a register and use
+ // the indexed form. Also handle stack pointers with special needs.
+ unsigned IndexReg = 0;
+ PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+ if (ResultReg == 0)
+ ResultReg = createResultReg(UseRC);
+
+ // Note: If we still have a frame index here, we know the offset is
+ // in range, as otherwise PPCSimplifyAddress would have converted it
+ // into a RegBase.
+ if (Addr.BaseType == Address::FrameIndexBase) {
+
+ MachineMemOperand *MMO =
+ FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+ MFI.getObjectAlignment(Addr.Base.FI));
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+ // Base reg with offset in range.
+ } else if (UseOffset) {
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+ // Indexed form.
+ } else {
+ // Get the RR opcode corresponding to the RI one. FIXME: It would be
+ // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+ // is hard to get at.
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case PPC::LBZ: Opc = PPC::LBZX; break;
+ case PPC::LBZ8: Opc = PPC::LBZX8; break;
+ case PPC::LHZ: Opc = PPC::LHZX; break;
+ case PPC::LHZ8: Opc = PPC::LHZX8; break;
+ case PPC::LHA: Opc = PPC::LHAX; break;
+ case PPC::LHA8: Opc = PPC::LHAX8; break;
+ case PPC::LWZ: Opc = PPC::LWZX; break;
+ case PPC::LWZ8: Opc = PPC::LWZX8; break;
+ case PPC::LWA: Opc = PPC::LWAX; break;
+ case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+ case PPC::LD: Opc = PPC::LDX; break;
+ case PPC::LFS: Opc = PPC::LFSX; break;
+ case PPC::LFD: Opc = PPC::LFDX; break;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ .addReg(Addr.Base.Reg).addReg(IndexReg);
+ }
+
+ return true;
+}
+
+// Attempt to fast-select a load instruction.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+ // FIXME: No atomic loads are supported.
+ if (cast<LoadInst>(I)->isAtomic())
+ return false;
+
+ // Verify we have a legal type before going any further.
+ MVT VT;
+ if (!isLoadTypeLegal(I->getType(), VT))
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!PPCComputeAddress(I->getOperand(0), Addr))
+ return false;
+
+ // Look at the currently assigned register for this instruction
+ // to determine the required register class. This is necessary
+ // to constrain RA from using R0/X0 when this is not legal.
+ unsigned AssignedReg = FuncInfo.ValueMap[I];
+ const TargetRegisterClass *RC =
+ AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+
+ unsigned ResultReg = 0;
+ if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+ return false;
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+ assert(SrcReg && "Nothing to store!");
+ unsigned Opc;
+ bool UseOffset = true;
+
+ const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+ bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+ switch (VT.SimpleTy) {
+ default: // e.g., vector types not handled
+ return false;
+ case MVT::i8:
+ Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+ break;
+ case MVT::i16:
+ Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+ break;
+ case MVT::i32:
+ assert(Is32BitInt && "Not GPRC for i32??");
+ Opc = PPC::STW;
+ break;
+ case MVT::i64:
+ Opc = PPC::STD;
+ UseOffset = ((Addr.Offset & 3) == 0);
+ break;
+ case MVT::f32:
+ Opc = PPC::STFS;
+ break;
+ case MVT::f64:
+ Opc = PPC::STFD;
+ break;
+ }
+
+ // If necessary, materialize the offset into a register and use
+ // the indexed form. Also handle stack pointers with special needs.
+ unsigned IndexReg = 0;
+ PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+ // Note: If we still have a frame index here, we know the offset is
+ // in range, as otherwise PPCSimplifyAddress would have converted it
+ // into a RegBase.
+ if (Addr.BaseType == Address::FrameIndexBase) {
+ MachineMemOperand *MMO =
+ FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+ MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+ MFI.getObjectAlignment(Addr.Base.FI));
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)).addReg(SrcReg)
+ .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+ // Base reg with offset in range.
+ } else if (UseOffset)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+ .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+ // Indexed form.
+ else {
+ // Get the RR opcode corresponding to the RI one. FIXME: It would be
+ // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+ // is hard to get at.
+ switch (Opc) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case PPC::STB: Opc = PPC::STBX; break;
+ case PPC::STH : Opc = PPC::STHX; break;
+ case PPC::STW : Opc = PPC::STWX; break;
+ case PPC::STB8: Opc = PPC::STBX8; break;
+ case PPC::STH8: Opc = PPC::STHX8; break;
+ case PPC::STW8: Opc = PPC::STWX8; break;
+ case PPC::STD: Opc = PPC::STDX; break;
+ case PPC::STFS: Opc = PPC::STFSX; break;
+ case PPC::STFD: Opc = PPC::STFDX; break;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+ .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
+ }
+
+ return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+ Value *Op0 = I->getOperand(0);
+ unsigned SrcReg = 0;
+
+ // FIXME: No atomic stores are supported.
+ if (cast<StoreInst>(I)->isAtomic())
+ return false;
+
+ // Verify we have a legal type before going any further.
+ MVT VT;
+ if (!isLoadTypeLegal(Op0->getType(), VT))
+ return false;
+
+ // Get the value to be stored into a register.
+ SrcReg = getRegForValue(Op0);
+ if (SrcReg == 0)
+ return false;
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!PPCComputeAddress(I->getOperand(1), Addr))
+ return false;
+
+ if (!PPCEmitStore(VT, SrcReg, Addr))
+ return false;
+
+ return true;
+}
+
+// Attempt to fast-select a branch instruction.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+ const BranchInst *BI = cast<BranchInst>(I);
+ MachineBasicBlock *BrBB = FuncInfo.MBB;
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // For now, just try the simplest case where it's fed by a compare.
+ if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+ Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
+ if (!OptPPCPred)
+ return false;
+
+ PPC::Predicate PPCPred = OptPPCPred.getValue();
+
+ // Take advantage of fall-through opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ PPCPred = PPC::InvertPredicate(PPCPred);
+ }
+
+ unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+
+ if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+ CondReg))
+ return false;
+
+ BuildMI(*BrBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCC))
+ .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+ FastEmitBranch(FBB, DL);
+ FuncInfo.MBB->addSuccessor(TBB);
+ return true;
+
+ } else if (const ConstantInt *CI =
+ dyn_cast<ConstantInt>(BI->getCondition())) {
+ uint64_t Imm = CI->getZExtValue();
+ MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+ FastEmitBranch(Target, DL);
+ return true;
+ }
+
+ // FIXME: ARM looks for a case where the block containing the compare
+ // has been split from the block containing the branch. If this happens,
+ // there is a vreg available containing the result of the compare. I'm
+ // not sure we can do much, as we've lost the predicate information with
+ // the compare instruction -- we have a 4-bit CR but don't know which bit
+ // to test here.
+ return false;
+}
+
+// Attempt to emit a compare of the two source values. Signed and unsigned
+// comparisons are supported. Return false if we can't handle it.
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
+ bool IsZExt, unsigned DestReg) {
+ Type *Ty = SrcValue1->getType();
+ EVT SrcEVT = TLI.getValueType(Ty, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ MVT SrcVT = SrcEVT.getSimpleVT();
+
+ // See if operand 2 is an immediate encodeable in the compare.
+ // FIXME: Operands are not in canonical order at -O0, so an immediate
+ // operand in position 1 is a lost opportunity for now. We are
+ // similar to ARM in this regard.
+ long Imm = 0;
+ bool UseImm = false;
+
+ // Only 16-bit integer constants can be represented in compares for
+ // PowerPC. Others will be materialized into a register.
+ if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
+ if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+ const APInt &CIVal = ConstInt->getValue();
+ Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue();
+ if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
+ UseImm = true;
+ }
+ }
+
+ unsigned CmpOpc;
+ bool NeedsExt = false;
+ switch (SrcVT.SimpleTy) {
+ default: return false;
+ case MVT::f32:
+ CmpOpc = PPC::FCMPUS;
+ break;
+ case MVT::f64:
+ CmpOpc = PPC::FCMPUD;
+ break;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ NeedsExt = true;
+ // Intentional fall-through.
+ case MVT::i32:
+ if (!UseImm)
+ CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
+ else
+ CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
+ break;
+ case MVT::i64:
+ if (!UseImm)
+ CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
+ else
+ CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
+ break;
+ }
+
+ unsigned SrcReg1 = getRegForValue(SrcValue1);
+ if (SrcReg1 == 0)
+ return false;
+
+ unsigned SrcReg2 = 0;
+ if (!UseImm) {
+ SrcReg2 = getRegForValue(SrcValue2);
+ if (SrcReg2 == 0)
+ return false;
+ }
+
+ if (NeedsExt) {
+ unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+ if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
+ return false;
+ SrcReg1 = ExtReg;
+
+ if (!UseImm) {
+ unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+ if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
+ return false;
+ SrcReg2 = ExtReg;
+ }
+ }
+
+ if (!UseImm)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+ .addReg(SrcReg1).addReg(SrcReg2);
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+ .addReg(SrcReg1).addImm(Imm);
+
+ return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool PPCFastISel::SelectFPExt(const Instruction *I) {
+ Value *Src = I->getOperand(0);
+ EVT SrcVT = TLI.getValueType(Src->getType(), true);
+ EVT DestVT = TLI.getValueType(I->getType(), true);
+
+ if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg)
+ return false;
+
+ // No code is generated for a FP extend.
+ UpdateValueMap(I, SrcReg);
+ return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
+ Value *Src = I->getOperand(0);
+ EVT SrcVT = TLI.getValueType(Src->getType(), true);
+ EVT DestVT = TLI.getValueType(I->getType(), true);
+
+ if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg)
+ return false;
+
+ // Round the result to single precision.
+ unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP), DestReg)
+ .addReg(SrcReg);
+
+ UpdateValueMap(I, DestReg);
+ return true;
+}
+
+// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+// FIXME: The code here is sloppy for the 4-byte case. Can use a 4-byte
+// stack slot and 4-byte store/load sequence. Or just sext the 4-byte
+// case to 8 bytes which produces tighter code but wastes stack space.
+unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
+ bool IsSigned) {
+
+ // If necessary, extend 32-bit int to 64-bit.
+ if (SrcVT == MVT::i32) {
+ unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+ if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
+ return 0;
+ SrcReg = TmpReg;
+ }
+
+ // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+ Address Addr;
+ Addr.BaseType = Address::FrameIndexBase;
+ Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+ // Store the value from the GPR.
+ if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
+ return 0;
+
+ // Load the integer value into an FPR. The kind of load used depends
+ // on a number of conditions.
+ unsigned LoadOpc = PPC::LFD;
+
+ if (SrcVT == MVT::i32) {
+ Addr.Offset = 4;
+ if (!IsSigned)
+ LoadOpc = PPC::LFIWZX;
+ else if (PPCSubTarget.hasLFIWAX())
+ LoadOpc = PPC::LFIWAX;
+ }
+
+ const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+ unsigned ResultReg = 0;
+ if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
+ return 0;
+
+ return ResultReg;
+}
+
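An editor's illustration (not part of the patch) of why the i32 case above reloads from Addr.Offset = 4: this code targets big-endian PPC64, so the 64-bit store places the 32 interesting bits in bytes 4..7 of the 8-byte slot, and the 4-byte LFIWZX/LFIWAX reload must start 4 bytes in.

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t Val = 0x12345678;           // an i32 that was extended to i64
      unsigned char Slot[8];
      for (int i = 0; i < 8; ++i)          // model a big-endian 8-byte store
        Slot[i] = (unsigned char)(Val >> (56 - 8 * i));

      uint32_t Lo = 0;
      for (int i = 4; i < 8; ++i)          // 4-byte reload at offset 4
        Lo = (Lo << 8) | Slot[i];
      printf("reloaded 0x%08x\n", Lo);     // prints: reloaded 0x12345678
      return 0;
    }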
+// Attempt to fast-select an integer-to-floating-point conversion.
+bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
+ MVT DstVT;
+ Type *DstTy = I->getType();
+ if (!isTypeLegal(DstTy, DstVT))
+ return false;
+
+ if (DstVT != MVT::f32 && DstVT != MVT::f64)
+ return false;
+
+ Value *Src = I->getOperand(0);
+ EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+ if (!SrcEVT.isSimple())
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+
+ if (SrcVT != MVT::i8 && SrcVT != MVT::i16 &&
+ SrcVT != MVT::i32 && SrcVT != MVT::i64)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Src);
+ if (SrcReg == 0)
+ return false;
+
+ // We can only lower an unsigned convert if we have the newer
+ // floating-point conversion operations.
+ if (!IsSigned && !PPCSubTarget.hasFPCVT())
+ return false;
+
+ // FIXME: For now we require the newer floating-point conversion operations
+ // (which are present only on P7 and A2 server models) when converting
+ // to single-precision float. Otherwise we have to generate a lot of
+ // fiddly code to avoid double rounding. If necessary, the fiddly code
+ // can be found in PPCTargetLowering::LowerINT_TO_FP().
+ if (DstVT == MVT::f32 && !PPCSubTarget.hasFPCVT())
+ return false;
+
+ // Extend the input if necessary.
+ if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
+ unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+ if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
+ return false;
+ SrcVT = MVT::i64;
+ SrcReg = TmpReg;
+ }
+
+ // Move the integer value to an FPR.
+ unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+ if (FPReg == 0)
+ return false;
+
+ // Determine the opcode for the conversion.
+ const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+ unsigned DestReg = createResultReg(RC);
+ unsigned Opc;
+
+ if (DstVT == MVT::f32)
+ Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
+ else
+ Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
+
+ // Generate the convert.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ .addReg(FPReg);
+
+ UpdateValueMap(I, DestReg);
+ return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+ unsigned SrcReg, bool IsSigned) {
+ // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+ // Note that if we have STFIWX available, we could use a 4-byte stack
+ // slot for i32, but this being fast-isel we'll just go with the
+ // easiest code gen possible.
+ Address Addr;
+ Addr.BaseType = Address::FrameIndexBase;
+ Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+ // Store the value from the FPR.
+ if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
+ return 0;
+
+ // Reload it into a GPR. If we want an i32, modify the address
+ // to have a 4-byte offset so we load from the right place.
+ if (VT == MVT::i32)
+ Addr.Offset = 4;
+
+ // Look at the currently assigned register for this instruction
+ // to determine the required register class.
+ unsigned AssignedReg = FuncInfo.ValueMap[I];
+ const TargetRegisterClass *RC =
+ AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+
+ unsigned ResultReg = 0;
+ if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
+ return 0;
+
+ return ResultReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+ MVT DstVT, SrcVT;
+ Type *DstTy = I->getType();
+ if (!isTypeLegal(DstTy, DstVT))
+ return false;
+
+ if (DstVT != MVT::i32 && DstVT != MVT::i64)
+ return false;
+
+ Value *Src = I->getOperand(0);
+ Type *SrcTy = Src->getType();
+ if (!isTypeLegal(SrcTy, SrcVT))
+ return false;
+
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Src);
+ if (SrcReg == 0)
+ return false;
+
+ // Convert f32 to f64 if necessary. This is just a meaningless copy
+ // to get the register class right. COPY_TO_REGCLASS is needed since
+ // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
+ const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
+ if (InRC == &PPC::F4RCRegClass) {
+ unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
+ .addReg(SrcReg).addImm(PPC::F8RCRegClassID);
+ SrcReg = TmpReg;
+ }
+
+ // Determine the opcode for the conversion, which takes place
+ // entirely within FPRs.
+ unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+ unsigned Opc;
+
+ if (DstVT == MVT::i32)
+ if (IsSigned)
+ Opc = PPC::FCTIWZ;
+ else
+ Opc = PPCSubTarget.hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+ else
+ Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+
+ // Generate the convert.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ .addReg(SrcReg);
+
+ // Now move the integer value from a float register to an integer register.
+ unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+ if (IntReg == 0)
+ return false;
+
+ UpdateValueMap(I, IntReg);
+ return true;
+}
+
+// Attempt to fast-select a binary integer operation that isn't already
+// handled automatically.
+bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+ EVT DestVT = TLI.getValueType(I->getType(), true);
+
+ // We can get here in the case when we have a binary operation on a non-legal
+ // type and the target independent selector doesn't know how to handle it.
+ if (DestVT != MVT::i16 && DestVT != MVT::i8)
+ return false;
+
+ // Look at the currently assigned register for this instruction
+ // to determine the required register class. If there is no register,
+ // make a conservative choice (don't assign R0).
+ unsigned AssignedReg = FuncInfo.ValueMap[I];
+ const TargetRegisterClass *RC =
+ (AssignedReg ? MRI.getRegClass(AssignedReg) :
+ &PPC::GPRC_and_GPRC_NOR0RegClass);
+ bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+ unsigned Opc;
+ switch (ISDOpcode) {
+ default: return false;
+ case ISD::ADD:
+ Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
+ break;
+ case ISD::OR:
+ Opc = IsGPRC ? PPC::OR : PPC::OR8;
+ break;
+ case ISD::SUB:
+ Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
+ break;
+ }
+
+ unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
+ unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+ if (SrcReg1 == 0) return false;
+
+ // Handle case of small immediate operand.
+ if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ const APInt &CIVal = ConstInt->getValue();
+ int Imm = (int)CIVal.getSExtValue();
+ bool UseImm = true;
+ if (isInt<16>(Imm)) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Missing case!");
+ case PPC::ADD4:
+ Opc = PPC::ADDI;
+ MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+ break;
+ case PPC::ADD8:
+ Opc = PPC::ADDI8;
+ MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+ break;
+ case PPC::OR:
+ Opc = PPC::ORI;
+ break;
+ case PPC::OR8:
+ Opc = PPC::ORI8;
+ break;
+ case PPC::SUBF:
+ if (Imm == -32768)
+ UseImm = false;
+ else {
+ Opc = PPC::ADDI;
+ MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+ Imm = -Imm;
+ }
+ break;
+ case PPC::SUBF8:
+ if (Imm == -32768)
+ UseImm = false;
+ else {
+ Opc = PPC::ADDI8;
+ MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+ Imm = -Imm;
+ }
+ break;
+ }
+
+ if (UseImm) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ .addReg(SrcReg1).addImm(Imm);
+ UpdateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+ }
+
+ // Reg-reg case.
+ unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+ if (SrcReg2 == 0) return false;
+
+ // Reverse operands for subtract-from.
+ if (ISDOpcode == ISD::SUB)
+ std::swap(SrcReg1, SrcReg2);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+ .addReg(SrcReg1).addReg(SrcReg2);
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
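For illustration only (not part of the patch): the immediate path above rewrites "sub x, C" as "addi x, -C" (SUBF itself computes RB - RA, hence the operand swap in the reg-reg path), and the one value it refuses to fold is C == -32768, because -C no longer fits the signed 16-bit ADDI field.

    #include <cstdint>
    #include <cstdio>

    static bool fitsSImm16(int32_t V) { return V >= -32768 && V <= 32767; }

    int main() {
      printf("sub of -32767: -C = %d, fits ADDI imm = %d\n", 32767, fitsSImm16(32767));
      printf("sub of -32768: -C = %d, fits ADDI imm = %d\n", 32768, fitsSImm16(32768));
      // prints: ... fits ADDI imm = 1   /   ... fits ADDI imm = 0
      return 0;
    }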
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+ SmallVectorImpl<unsigned> &ArgRegs,
+ SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+ SmallVectorImpl<unsigned> &RegArgs,
+ CallingConv::ID CC,
+ unsigned &NumBytes,
+ bool IsVarArg) {
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+ // Bail out if we can't handle any of the arguments.
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Skip vector arguments for now, as well as long double and
+ // uint128_t, and anything that isn't passed in a register.
+ if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 ||
+ !VA.isRegLoc() || VA.needsCustom())
+ return false;
+
+ // Skip bit-converted arguments for now.
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ return false;
+ }
+
+ // Get a count of how many bytes are to be pushed onto the stack.
+ NumBytes = CCInfo.getNextStackOffset();
+
+ // Issue CALLSEQ_START.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TII.getCallFrameSetupOpcode()))
+ .addImm(NumBytes);
+
+ // Prepare to assign register arguments. Every argument uses up a
+ // GPR protocol register even if it's passed in a floating-point
+ // register.
+ unsigned NextGPR = PPC::X3;
+ unsigned NextFPR = PPC::F1;
+
+ // Process arguments.
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ unsigned Arg = ArgRegs[VA.getValNo()];
+ MVT ArgVT = ArgVTs[VA.getValNo()];
+
+ // Handle argument promotion and bitcasts.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt: {
+ MVT DestVT = VA.getLocVT();
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+ unsigned TmpReg = createResultReg(RC);
+ if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+ llvm_unreachable("Failed to emit a sext!");
+ ArgVT = DestVT;
+ Arg = TmpReg;
+ break;
+ }
+ case CCValAssign::AExt:
+ case CCValAssign::ZExt: {
+ MVT DestVT = VA.getLocVT();
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+ unsigned TmpReg = createResultReg(RC);
+ if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+ llvm_unreachable("Failed to emit a zext!");
+ ArgVT = DestVT;
+ Arg = TmpReg;
+ break;
+ }
+ case CCValAssign::BCvt: {
+ // FIXME: Not yet handled.
+ llvm_unreachable("Should have bailed before getting here!");
+ break;
+ }
+ }
+
+ // Copy this argument to the appropriate register.
+ unsigned ArgReg;
+ if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+ ArgReg = NextFPR++;
+ ++NextGPR;
+ } else
+ ArgReg = NextGPR++;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ ArgReg).addReg(Arg);
+ RegArgs.push_back(ArgReg);
+ }
+
+ return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+ const Instruction *I, CallingConv::ID CC,
+ unsigned &NumBytes, bool IsVarArg) {
+ // Issue CALLSEQ_END.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TII.getCallFrameDestroyOpcode()))
+ .addImm(NumBytes).addImm(0);
+
+ // Next, generate a copy to obtain the return value.
+ // FIXME: No multi-register return values yet, though I don't foresee
+ // any real difficulties there.
+ if (RetVT != MVT::isVoid) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+ CCValAssign &VA = RVLocs[0];
+ assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+ assert(VA.isRegLoc() && "Can only return in registers!");
+
+ MVT DestVT = VA.getValVT();
+ MVT CopyVT = DestVT;
+
+ // Ints smaller than a register still arrive in a full 64-bit
+ // register, so make sure we recognize this.
+ if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+ CopyVT = MVT::i64;
+
+ unsigned SourcePhysReg = VA.getLocReg();
+ unsigned ResultReg = 0;
+
+ if (RetVT == CopyVT) {
+ const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+ ResultReg = createResultReg(CpyRC);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(SourcePhysReg);
+
+ // If necessary, round the floating result to single precision.
+ } else if (CopyVT == MVT::f64) {
+ ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP),
+ ResultReg).addReg(SourcePhysReg);
+
+ // If only the low half of a general register is needed, generate
+ // a GPRC copy instead of a G8RC copy. (EXTRACT_SUBREG can't be
+ // used along the fast-isel path (not lowered), and downstream logic
+ // also doesn't like a direct subreg copy on a physical reg.)
+ } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+ ResultReg = createResultReg(&PPC::GPRCRegClass);
+ // Convert physical register from G8RC to GPRC.
+ SourcePhysReg -= PPC::X0 - PPC::R0;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(SourcePhysReg);
+ }
+
+ assert(ResultReg && "ResultReg unset!");
+ UsedRegs.push_back(SourcePhysReg);
+ UpdateValueMap(I, ResultReg);
+ }
+}
+
+// Attempt to fast-select a call instruction.
+bool PPCFastISel::SelectCall(const Instruction *I) {
+ const CallInst *CI = cast<CallInst>(I);
+ const Value *Callee = CI->getCalledValue();
+
+ // Can't handle inline asm.
+ if (isa<InlineAsm>(Callee))
+ return false;
+
+ // Allow SelectionDAG isel to handle tail calls.
+ if (CI->isTailCall())
+ return false;
+
+ // Obtain calling convention.
+ ImmutableCallSite CS(CI);
+ CallingConv::ID CC = CS.getCallingConv();
+
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ bool IsVarArg = FTy->isVarArg();
+
+ // Not ready for varargs yet.
+ if (IsVarArg)
+ return false;
+
+ // Handle simple calls for now, with legal return types and
+ // those that can be extended.
+ Type *RetTy = I->getType();
+ MVT RetVT;
+ if (RetTy->isVoidTy())
+ RetVT = MVT::isVoid;
+ else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+ RetVT != MVT::i8)
+ return false;
+
+ // FIXME: No multi-register return values yet.
+ if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+ RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+ RetVT != MVT::f64) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+ if (RVLocs.size() > 1)
+ return false;
+ }
+
+ // Bail early if more than 8 arguments, as we only currently
+ // handle arguments passed in registers.
+ unsigned NumArgs = CS.arg_size();
+ if (NumArgs > 8)
+ return false;
+
+ // Set up the argument vectors.
+ SmallVector<Value*, 8> Args;
+ SmallVector<unsigned, 8> ArgRegs;
+ SmallVector<MVT, 8> ArgVTs;
+ SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+ Args.reserve(NumArgs);
+ ArgRegs.reserve(NumArgs);
+ ArgVTs.reserve(NumArgs);
+ ArgFlags.reserve(NumArgs);
+
+ for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
+ II != IE; ++II) {
+ // FIXME: ARM does something for intrinsic calls here, check into that.
+
+ unsigned AttrIdx = II - CS.arg_begin() + 1;
+
+ // Only handle easy calls for now. It would be reasonably easy
+ // to handle <= 8-byte structures passed ByVal in registers, but we
+ // have to ensure they are right-justified in the register.
+ if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
+ CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
+ CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
+ CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+ return false;
+
+ ISD::ArgFlagsTy Flags;
+ if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
+ Flags.setSExt();
+ if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
+ Flags.setZExt();
+
+ Type *ArgTy = (*II)->getType();
+ MVT ArgVT;
+ if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+ return false;
+
+ if (ArgVT.isVector())
+ return false;
+
+ unsigned Arg = getRegForValue(*II);
+ if (Arg == 0)
+ return false;
+
+ unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ Args.push_back(*II);
+ ArgRegs.push_back(Arg);
+ ArgVTs.push_back(ArgVT);
+ ArgFlags.push_back(Flags);
+ }
+
+ // Process the arguments.
+ SmallVector<unsigned, 8> RegArgs;
+ unsigned NumBytes;
+
+ if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+ RegArgs, CC, NumBytes, IsVarArg))
+ return false;
+
+ // FIXME: No handling for function pointers yet. This requires
+ // implementing the function descriptor (OPD) setup.
+ const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+ if (!GV)
+ return false;
+
+ // Build direct call with NOP for TOC restore.
+ // FIXME: We can and should optimize away the NOP for local calls.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(PPC::BL8_NOP));
+ // Add callee.
+ MIB.addGlobalAddress(GV);
+
+ // Add implicit physical register uses to the call.
+ for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+ MIB.addReg(RegArgs[II], RegState::Implicit);
+
+ // Add a register mask with the call-preserved registers. Proper
+ // defs for return values will be added by setPhysRegsDeadExcept().
+ MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+ // Finish off the call including any return values.
+ SmallVector<unsigned, 4> UsedRegs;
+ finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
+
+ // Set all unused physreg defs as dead.
+ static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+ return true;
+}
+
+// Attempt to fast-select a return instruction.
+bool PPCFastISel::SelectRet(const Instruction *I) {
+
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const ReturnInst *Ret = cast<ReturnInst>(I);
+ const Function &F = *I->getParent()->getParent();
+
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+ CallingConv::ID CC = F.getCallingConv();
+
+ if (Ret->getNumOperands() > 0) {
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ValLocs;
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, *Context);
+ CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
+ const Value *RV = Ret->getOperand(0);
+
+ // FIXME: Only one output register for now.
+ if (ValLocs.size() > 1)
+ return false;
+
+ // Special case for returning a constant integer of any size.
+ // Materialize the constant as an i64 and copy it to the return
+ // register. This avoids an unnecessary extend or truncate.
+ if (isa<ConstantInt>(*RV)) {
+ const Constant *C = cast<Constant>(RV);
+ unsigned SrcReg = PPCMaterializeInt(C, MVT::i64);
+ unsigned RetReg = ValLocs[0].getLocReg();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ RetReg).addReg(SrcReg);
+ RetRegs.push_back(RetReg);
+
+ } else {
+ unsigned Reg = getRegForValue(RV);
+
+ if (Reg == 0)
+ return false;
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i < ValLocs.size(); ++i) {
+
+ CCValAssign &VA = ValLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ RetRegs.push_back(VA.getLocReg());
+ unsigned SrcReg = Reg + VA.getValNo();
+
+ EVT RVEVT = TLI.getValueType(RV->getType());
+ if (!RVEVT.isSimple())
+ return false;
+ MVT RVVT = RVEVT.getSimpleVT();
+ MVT DestVT = VA.getLocVT();
+
+ if (RVVT != DestVT && RVVT != MVT::i8 &&
+ RVVT != MVT::i16 && RVVT != MVT::i32)
+ return false;
+
+ if (RVVT != DestVT) {
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full:
+ llvm_unreachable("Full value assign but types don't match?");
+ case CCValAssign::AExt:
+ case CCValAssign::ZExt: {
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+ unsigned TmpReg = createResultReg(RC);
+ if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
+ return false;
+ SrcReg = TmpReg;
+ break;
+ }
+ case CCValAssign::SExt: {
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+ unsigned TmpReg = createResultReg(RC);
+ if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
+ return false;
+ SrcReg = TmpReg;
+ break;
+ }
+ }
+ }
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(TargetOpcode::COPY), RetRegs[i])
+ .addReg(SrcReg);
+ }
+ }
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(PPC::BLR));
+
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
+
+ return true;
+}
+
+// Attempt to emit an integer extend of SrcReg into DestReg. Both
+// signed and zero extensions are supported. Return false if we
+// can't handle it.
+bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ unsigned DestReg, bool IsZExt) {
+ if (DestVT != MVT::i32 && DestVT != MVT::i64)
+ return false;
+ if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32)
+ return false;
+
+ // Signed extensions use EXTSB, EXTSH, EXTSW.
+ if (!IsZExt) {
+ unsigned Opc;
+ if (SrcVT == MVT::i8)
+ Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64;
+ else if (SrcVT == MVT::i16)
+ Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64;
+ else {
+ assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
+ Opc = PPC::EXTSW_32_64;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ .addReg(SrcReg);
+
+ // Unsigned 32-bit extensions use RLWINM.
+ } else if (DestVT == MVT::i32) {
+ unsigned MB;
+ if (SrcVT == MVT::i8)
+ MB = 24;
+ else {
+ assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
+ MB = 16;
+ }
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::RLWINM),
+ DestReg)
+ .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
+
+ // Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
+ } else {
+ unsigned MB;
+ if (SrcVT == MVT::i8)
+ MB = 56;
+ else if (SrcVT == MVT::i16)
+ MB = 48;
+ else
+ MB = 32;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(PPC::RLDICL_32_64), DestReg)
+ .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
+ }
+
+ return true;
+}
+
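An editor's illustration (not part of the patch) of the masks selected by the MB immediates above: with SH = 0, RLWINM keeps bits MB..31 in IBM (MSB = bit 0) numbering, i.e. the low 32 - MB bits, and RLDICL keeps the low 64 - MB bits, which is exactly a zero extension from i8/i16/i32.

    #include <cstdint>
    #include <cstdio>

    static uint32_t rlwinmMask(unsigned MB) { return 0xFFFFFFFFu >> MB; }  // SH = 0, ME = 31
    static uint64_t rldiclMask(unsigned MB) { return ~0ULL >> MB; }        // SH = 0

    int main() {
      printf("RLWINM MB=24 -> 0x%08x (zext i8 to i32)\n", rlwinmMask(24));   // 0x000000ff
      printf("RLWINM MB=16 -> 0x%08x (zext i16 to i32)\n", rlwinmMask(16));  // 0x0000ffff
      printf("RLDICL MB=56 -> 0x%016llx (zext i8 to i64)\n",
             (unsigned long long)rldiclMask(56));                            // 0x00000000000000ff
      printf("RLDICL MB=32 -> 0x%016llx (zext i32 to i64)\n",
             (unsigned long long)rldiclMask(32));                            // 0x00000000ffffffff
      return 0;
    }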
+// Attempt to fast-select an indirect branch instruction.
+bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
+ unsigned AddrReg = getRegForValue(I->getOperand(0));
+ if (AddrReg == 0)
+ return false;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::MTCTR8))
+ .addReg(AddrReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCTR8));
+
+ const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+ for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
+ return true;
+}
+
+// Attempt to fast-select an integer truncate instruction.
+bool PPCFastISel::SelectTrunc(const Instruction *I) {
+ Value *Src = I->getOperand(0);
+ EVT SrcVT = TLI.getValueType(Src->getType(), true);
+ EVT DestVT = TLI.getValueType(I->getType(), true);
+
+ if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
+ return false;
+
+ if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+ return false;
+
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg)
+ return false;
+
+ // The only interesting case is when we need to switch register classes.
+ if (SrcVT == MVT::i64) {
+ unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(SrcReg, 0, PPC::sub_32);
+ SrcReg = ResultReg;
+ }
+
+ UpdateValueMap(I, SrcReg);
+ return true;
+}
+
+// Attempt to fast-select an integer extend instruction.
+bool PPCFastISel::SelectIntExt(const Instruction *I) {
+ Type *DestTy = I->getType();
+ Value *Src = I->getOperand(0);
+ Type *SrcTy = Src->getType();
+
+ bool IsZExt = isa<ZExtInst>(I);
+ unsigned SrcReg = getRegForValue(Src);
+ if (!SrcReg) return false;
+
+ EVT SrcEVT, DestEVT;
+ SrcEVT = TLI.getValueType(SrcTy, true);
+ DestEVT = TLI.getValueType(DestTy, true);
+ if (!SrcEVT.isSimple())
+ return false;
+ if (!DestEVT.isSimple())
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DestVT = DestEVT.getSimpleVT();
+
+ // If we know the register class needed for the result of this
+ // instruction, use it. Otherwise pick the register class of the
+ // correct size that does not contain X0/R0, since we don't know
+ // whether downstream uses permit that assignment.
+ unsigned AssignedReg = FuncInfo.ValueMap[I];
+ const TargetRegisterClass *RC =
+ (AssignedReg ? MRI.getRegClass(AssignedReg) :
+ (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+ &PPC::GPRC_and_GPRC_NOR0RegClass));
+ unsigned ResultReg = createResultReg(RC);
+
+ if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
+ return false;
+
+ UpdateValueMap(I, ResultReg);
+ return true;
+}
+
// Attempt to fast-select an instruction that wasn't handled by
-// the table-generated machinery. TBD.
+// the table-generated machinery.
bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
- return I && false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ return SelectLoad(I);
+ case Instruction::Store:
+ return SelectStore(I);
+ case Instruction::Br:
+ return SelectBranch(I);
+ case Instruction::IndirectBr:
+ return SelectIndirectBr(I);
+ case Instruction::FPExt:
+ return SelectFPExt(I);
+ case Instruction::FPTrunc:
+ return SelectFPTrunc(I);
+ case Instruction::SIToFP:
+ return SelectIToFP(I, /*IsSigned*/ true);
+ case Instruction::UIToFP:
+ return SelectIToFP(I, /*IsSigned*/ false);
+ case Instruction::FPToSI:
+ return SelectFPToI(I, /*IsSigned*/ true);
+ case Instruction::FPToUI:
+ return SelectFPToI(I, /*IsSigned*/ false);
+ case Instruction::Add:
+ return SelectBinaryIntOp(I, ISD::ADD);
+ case Instruction::Or:
+ return SelectBinaryIntOp(I, ISD::OR);
+ case Instruction::Sub:
+ return SelectBinaryIntOp(I, ISD::SUB);
+ case Instruction::Call:
+ if (dyn_cast<IntrinsicInst>(I))
+ return false;
+ return SelectCall(I);
+ case Instruction::Ret:
+ return SelectRet(I);
+ case Instruction::Trunc:
+ return SelectTrunc(I);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return SelectIntExt(I);
+ // Here add other flavors of Instruction::XXX that automated
+ // cases don't catch. For example, switches are terminators
+ // that aren't yet handled.
+ default:
+ break;
+ }
+ return false;
}
// Materialize a floating-point constant into a register, and return
@@ -131,21 +1802,94 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
(VT == MVT::f32) ? 4 : 8, Align);
- // For small code model, generate a LDtocCPT.
- if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+ unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+ unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+
+ // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
+ if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocCPT),
- DestReg)
- .addConstantPoolIndex(Idx).addReg(PPC::X2).addMemOperand(MMO);
- else {
+ TmpReg)
+ .addConstantPoolIndex(Idx).addReg(PPC::X2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ .addImm(0).addReg(TmpReg).addMemOperand(MMO);
+ } else {
// Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
- unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
- unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
- .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
- .addReg(TmpReg)
- .addMemOperand(MMO);
+ // But for large code model, we must generate a LDtocL followed
+ // by the LF[SD].
+ if (CModel == CodeModel::Large) {
+ unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+ TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ .addImm(0).addReg(TmpReg2);
+ } else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+ .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
+ .addReg(TmpReg)
+ .addMemOperand(MMO);
+ }
+
+ return DestReg;
+}
+
+// Materialize the address of a global value into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+ assert(VT == MVT::i64 && "Non-address!");
+ const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+ unsigned DestReg = createResultReg(RC);
+
+ // Global values may be plain old object addresses, TLS object
+ // addresses, constant pool entries, or jump tables. How we generate
+ // code for these may depend on small, medium, or large code model.
+ CodeModel::Model CModel = TM.getCodeModel();
+
+ // FIXME: Jump tables are not yet required because fast-isel doesn't
+ // handle switches; if that changes, we need them as well. For now,
+ // what follows assumes everything's a generic (or TLS) global address.
+ const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar) {
+ // If GV is an alias, use the aliasee for determining thread-locality.
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+ }
+
+ // FIXME: We don't yet handle the complexity of TLS.
+ bool IsTLS = GVar && GVar->isThreadLocal();
+ if (IsTLS)
+ return 0;
+
+ // For small code model, generate a simple TOC load.
+ if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtoc), DestReg)
+ .addGlobalAddress(GV).addReg(PPC::X2);
+ else {
+ // If the address is an externally defined symbol, a symbol with
+ // common or externally available linkage, a function address, or a
+ // jump table address (not yet needed), or if we are generating code
+ // for large code model, we generate:
+ // LDtocL(GV, ADDIStocHA(%X2, GV))
+ // Otherwise we generate:
+ // ADDItocL(ADDIStocHA(%X2, GV), GV)
+ // Either way, start with the ADDIStocHA:
+ unsigned HighPartReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
+ HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
+
+ // !GVar implies a function address. An external variable is one
+ // without an initializer.
+ // If/when switches are implemented, jump tables should be handled
+ // on the "if" path here.
+ if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() ||
+ GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage())
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+ DestReg).addGlobalAddress(GV).addReg(HighPartReg);
+ else
+ // Otherwise generate the ADDItocL.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDItocL),
+ DestReg).addReg(HighPartReg).addGlobalAddress(GV);
}
return DestReg;
@@ -283,23 +2027,112 @@ unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) {
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return PPCMaterializeFP(CFP, VT);
+ else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return PPCMaterializeGV(GV, VT);
else if (isa<ConstantInt>(C))
return PPCMaterializeInt(C, VT);
- // TBD: Global values.
return 0;
}
// Materialize the address created by an alloca into a register, and
-// return the register number (or zero if we failed to handle it). TBD.
+// return the register number (or zero if we failed to handle it).
unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
- return AI && 0;
+ // Don't handle dynamic allocas.
+ if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+ MVT VT;
+ if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+ ResultReg).addFrameIndex(SI->second).addImm(0);
+ return ResultReg;
+ }
+
+ return 0;
}
-// Fold loads into extends when possible. TBD.
+// Fold loads into extends when possible.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load. The folding only picks up one. Extend this
+// to check subsequent instructions for the same pattern and remove
+// them. Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent. Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) {
- return MI && OpNo && LI && false;
+ // Verify we have a legal type before going any further.
+ MVT VT;
+ if (!isLoadTypeLegal(LI->getType(), VT))
+ return false;
+
+ // Combine load followed by zero- or sign-extend.
+ bool IsZExt = false;
+ switch(MI->getOpcode()) {
+ default:
+ return false;
+
+ case PPC::RLDICL:
+ case PPC::RLDICL_32_64: {
+ IsZExt = true;
+ unsigned MB = MI->getOperand(3).getImm();
+ if ((VT == MVT::i8 && MB <= 56) ||
+ (VT == MVT::i16 && MB <= 48) ||
+ (VT == MVT::i32 && MB <= 32))
+ break;
+ return false;
+ }
+
+ case PPC::RLWINM:
+ case PPC::RLWINM8: {
+ IsZExt = true;
+ unsigned MB = MI->getOperand(3).getImm();
+ if ((VT == MVT::i8 && MB <= 24) ||
+ (VT == MVT::i16 && MB <= 16))
+ break;
+ return false;
+ }
+
+ case PPC::EXTSB:
+ case PPC::EXTSB8:
+ case PPC::EXTSB8_32_64:
+ /* There is no sign-extending load-byte instruction. */
+ return false;
+
+ case PPC::EXTSH:
+ case PPC::EXTSH8:
+ case PPC::EXTSH8_32_64: {
+ if (VT != MVT::i16 && VT != MVT::i8)
+ return false;
+ break;
+ }
+
+ case PPC::EXTSW:
+ case PPC::EXTSW_32_64: {
+ if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
+ return false;
+ break;
+ }
+ }
+
+ // See if we can handle this address.
+ Address Addr;
+ if (!PPCComputeAddress(LI->getOperand(0), Addr))
+ return false;
+
+ unsigned ResultReg = MI->getOperand(0).getReg();
+
+ if (!PPCEmitLoad(VT, ResultReg, Addr, 0, IsZExt))
+ return false;
+
+ MI->eraseFromParent();
+ return true;
}
// Attempt to lower call arguments in a faster way than done by
@@ -312,6 +2145,81 @@ bool PPCFastISel::FastLowerArguments() {
return false;
}
+// Handle materializing integer constants into a register. This is not
+// automatically generated for PowerPC, so must be explicitly created here.
+unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+
+ if (Opc != ISD::Constant)
+ return 0;
+
+ if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
+ VT != MVT::i8 && VT != MVT::i1)
+ return 0;
+
+ const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+ &PPC::GPRCRegClass);
+ if (VT == MVT::i64)
+ return PPCMaterialize64BitInt(Imm, RC);
+ else
+ return PPCMaterialize32BitInt(Imm, RC);
+}
+
+// Override for ADDI and ADDI8 to set the correct register class
+// on RHS operand 0. The automatic infrastructure naively assumes
+// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
+// for these cases. At the moment, none of the other automatically
+// generated RI instructions require special treatment. However, once
+// SelectSelect is implemented, "isel" requires similar handling.
+//
+// Also be conservative about the output register class. Avoid
+// assigning R0 or X0 to the output register for GPRC and G8RC
+// register classes, as any such result could be used in ADDI, etc.,
+// where those regs have another meaning.
+unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ uint64_t Imm) {
+ if (MachineInstOpcode == PPC::ADDI)
+ MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
+ else if (MachineInstOpcode == PPC::ADDI8)
+ MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+ const TargetRegisterClass *UseRC =
+ (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+ (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+ return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC,
+ Op0, Op0IsKill, Imm);
+}
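
[Editor's sketch] The reason ADDI/ADDI8 need the NOR0/NOX0 register classes is the PowerPC convention that an RA field of r0 in these instructions reads as the literal value 0 rather than the register contents. A small illustrative model of that quirk (hypothetical code, not part of the patch):

#include <cassert>
#include <cstdint>

// Hypothetical model of "addi rD, rA, imm": when the RA field encodes r0,
// the hardware substitutes the constant 0 for the register value.
static int64_t addiResult(unsigned ra, int64_t raValue, int64_t imm) {
  return (ra == 0 ? 0 : raValue) + imm;
}

int main() {
  assert(addiResult(5, 100, 8) == 108); // addi r3, r5, 8 with r5 = 100
  // With r0 = 100 the result is still just the immediate, which is why
  // fast-isel must never hand r0/x0 to this operand.
  assert(addiResult(0, 100, 8) == 8);
  return 0;
}
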
+
+// Override for instructions with one register operand to avoid use of
+// R0/X0. The automatic infrastructure isn't aware of the context so
+// we must be conservative.
+unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass* RC,
+ unsigned Op0, bool Op0IsKill) {
+ const TargetRegisterClass *UseRC =
+ (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+ (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+ return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+}
+
+// Override for instructions with two register operands to avoid use
+// of R0/X0. The automatic infrastructure isn't aware of the context
+// so we must be conservative.
+unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass* RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ const TargetRegisterClass *UseRC =
+ (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+ (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+ return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
+ Op1, Op1IsKill);
+}
+
namespace llvm {
// Create the fast instruction selector for PowerPC64 ELF.
FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 24d3a0b..0ac2ced 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -204,10 +204,9 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
unsigned FrameSize =
UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
- // Get the alignments provided by the target, and the maximum alignment
- // (if any) of the fixed frame objects.
- unsigned TargetAlign = getStackAlignment();
- unsigned MaxAlign = MFI->getMaxAlignment();
+ // Get stack alignments. The frame must be aligned to the greatest of these:
+ unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+ unsigned MaxAlign = MFI->getMaxAlignment(); // max alignment of frame data
unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
const PPCRegisterInfo *RegInfo =
@@ -346,12 +345,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
bool needsFrameMoves = MMI.hasDebugInfo() ||
MF.getFunction()->needsUnwindTableEntry();
+ // Get processor type.
+ bool isPPC64 = Subtarget.isPPC64();
+ // Get the ABI.
+ bool isDarwinABI = Subtarget.isDarwinABI();
+ bool isSVR4ABI = Subtarget.isSVR4ABI();
+ assert((isDarwinABI || isSVR4ABI) &&
+ "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+
// Prepare for frame info.
MCSymbol *FrameLabel = 0;
// Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
// process it.
- if (!Subtarget.isSVR4ABI())
+ if (!isSVR4ABI)
for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
HandleVRSaveUpdate(MBBI, TII);
@@ -371,23 +378,52 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
if (MFI->isFrameAddressTaken())
replaceFPWithRealFP(MF);
- // Get processor type.
- bool isPPC64 = Subtarget.isPPC64();
- // Get operating system
- bool isDarwinABI = Subtarget.isDarwinABI();
// Check if the link register (LR) must be saved.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
- // Do we have a frame pointer for this function?
+ // Do we have a frame pointer and/or base pointer for this function?
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
+ unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ unsigned BPReg = isPPC64 ? PPC::X30 : PPC::R30;
+ unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+ unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
+ unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0;
+ unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+ // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
+ const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
+ : PPC::MFLR );
+ const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD
+ : PPC::STW );
+ const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU
+ : PPC::STWU );
+ const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX
+ : PPC::STWUX);
+ const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8
+ : PPC::LIS );
+ const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8
+ : PPC::ORI );
+ const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+ : PPC::OR );
+ const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8
+ : PPC::SUBFC);
+ const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
+ : PPC::SUBFIC);
+
+ // Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
+ // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
+ // Red Zone, an asynchronous event (a form of "callee") could claim a frame &
+ // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR.
+ assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
+ "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
+
int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
int FPOffset = 0;
if (HasFP) {
- if (Subtarget.isSVR4ABI()) {
+ if (isSVR4ABI) {
MachineFrameInfo *FFI = MF.getFrameInfo();
int FPIndex = FI->getFramePointerSaveIndex();
assert(FPIndex && "No Frame Pointer Save Slot!");
@@ -399,7 +435,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
int BPOffset = 0;
if (HasBP) {
- if (Subtarget.isSVR4ABI()) {
+ if (isSVR4ABI) {
MachineFrameInfo *FFI = MF.getFrameInfo();
int BPIndex = FI->getBasePointerSaveIndex();
assert(BPIndex && "No Base Pointer Save Slot!");
@@ -410,181 +446,116 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
}
}
- if (isPPC64) {
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR8), PPC::X0);
+ // Get stack alignments.
+ unsigned MaxAlign = MFI->getMaxAlignment();
+ if (HasBP && MaxAlign > 1)
+ assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+ "Invalid alignment!");
+
+ // Frames larger than 32KB require special handling because they cannot be
+ // indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
+ bool isLargeFrame = !isInt<16>(NegFrameSize);
- if (!MustSaveCRs.empty()) {
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), PPC::X12);
- for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
- }
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
- if (HasFP)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
- .addReg(PPC::X31)
- .addImm(FPOffset)
- .addReg(PPC::X1);
-
- if (HasBP)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
- .addReg(PPC::X30)
- .addImm(BPOffset)
- .addReg(PPC::X1);
-
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
- .addReg(PPC::X0)
- .addImm(LROffset)
- .addReg(PPC::X1);
-
- if (!MustSaveCRs.empty())
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
- .addReg(PPC::X12, getKillRegState(true))
- .addImm(8)
- .addReg(PPC::X1);
- } else {
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
-
- if (HasFP)
- // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
- // offsets of R1 is not allowed.
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
- .addReg(PPC::R31)
- .addImm(FPOffset)
- .addReg(PPC::R1);
-
- if (HasBP)
- // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
- // offsets of R1 is not allowed.
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
- .addReg(PPC::R30)
- .addImm(BPOffset)
- .addReg(PPC::R1);
-
- assert(MustSaveCRs.empty() &&
- "Prologue CR saving supported only in 64-bit mode");
-
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
- .addReg(PPC::R0)
- .addImm(LROffset)
- .addReg(PPC::R1);
+ assert((isPPC64 || MustSaveCRs.empty()) &&
+ "Prologue CR saving supported only in 64-bit mode");
+
+ if (!MustSaveCRs.empty()) { // will only occur for PPC64
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
}
- // Skip if a leaf routine.
+ if (HasFP)
+ // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(FPReg)
+ .addImm(FPOffset)
+ .addReg(SPReg);
+
+ if (HasBP)
+ // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(BPReg)
+ .addImm(BPOffset)
+ .addReg(SPReg);
+
+ if (MustSaveLR)
+ // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+ BuildMI(MBB, MBBI, dl, StoreInst)
+ .addReg(ScratchReg)
+ .addImm(LROffset)
+ .addReg(SPReg);
+
+ if (!MustSaveCRs.empty()) // will only occur for PPC64
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+ .addReg(TempReg, getKillRegState(true))
+ .addImm(8)
+ .addReg(SPReg);
+
+ // Skip the rest if this is a leaf function & all spills fit in the Red Zone.
if (!FrameSize) return;
- // Get stack alignments.
- unsigned MaxAlign = MFI->getMaxAlignment();
-
// Adjust stack pointer: r1 += NegFrameSize.
// If there is a preferred stack alignment, align R1 now
- if (!isPPC64) {
- // PPC32.
-
- if (HasBP) {
- // Save a copy of r1 as the base pointer.
- BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R30)
- .addReg(PPC::R1)
- .addReg(PPC::R1);
- }
- if (HasBP && MaxAlign > 1) {
- assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
- "Invalid alignment!");
+ if (HasBP) {
+ // Save a copy of r1 as the base pointer.
+ BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+ .addReg(SPReg)
+ .addReg(SPReg);
+ }
- BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0)
- .addReg(PPC::R1)
+ if (HasBP && MaxAlign > 1) {
+ if (isPPC64)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(64 - Log2_32(MaxAlign));
+ else // PPC32...
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+ .addReg(SPReg)
.addImm(0)
.addImm(32 - Log2_32(MaxAlign))
.addImm(31);
- if (isInt<16>(NegFrameSize)) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC), PPC::R0)
- .addReg(PPC::R0, RegState::Kill)
- .addImm(NegFrameSize);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R12)
- .addImm(NegFrameSize >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R12)
- .addReg(PPC::R12, RegState::Kill)
- .addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFC), PPC::R0)
- .addReg(PPC::R0, RegState::Kill)
- .addReg(PPC::R12, RegState::Kill);
- }
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
- .addReg(PPC::R1, RegState::Kill)
- .addReg(PPC::R1)
- .addReg(PPC::R0);
- } else if (isInt<16>(NegFrameSize)) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
- .addReg(PPC::R1)
- .addImm(NegFrameSize)
- .addReg(PPC::R1);
+ if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(NegFrameSize);
} else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
.addImm(NegFrameSize >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
- .addReg(PPC::R0, RegState::Kill)
+ BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+ .addReg(TempReg, RegState::Kill)
.addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
- .addReg(PPC::R1, RegState::Kill)
- .addReg(PPC::R1)
- .addReg(PPC::R0);
- }
- } else { // PPC64.
- if (HasBP) {
- // Save a copy of r1 as the base pointer.
- BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X30)
- .addReg(PPC::X1)
- .addReg(PPC::X1);
+ BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(TempReg, RegState::Kill);
}
+ BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+ .addReg(SPReg, RegState::Kill)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
- if (HasBP && MaxAlign > 1) {
- assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
- "Invalid alignment!");
+ } else if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+ .addReg(SPReg)
+ .addImm(NegFrameSize)
+ .addReg(SPReg);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0)
- .addReg(PPC::X1)
- .addImm(0)
- .addImm(64 - Log2_32(MaxAlign));
- if (isInt<16>(NegFrameSize)) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
- .addReg(PPC::X0, RegState::Kill)
- .addImm(NegFrameSize);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X12)
- .addImm(NegFrameSize >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X12)
- .addReg(PPC::X12, RegState::Kill)
- .addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFC8), PPC::X0)
- .addReg(PPC::X0, RegState::Kill)
- .addReg(PPC::X12, RegState::Kill);
- }
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
- .addReg(PPC::X1, RegState::Kill)
- .addReg(PPC::X1)
- .addReg(PPC::X0);
- } else if (isInt<16>(NegFrameSize)) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
- .addReg(PPC::X1)
- .addImm(NegFrameSize)
- .addReg(PPC::X1);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
- .addImm(NegFrameSize >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
- .addReg(PPC::X0, RegState::Kill)
- .addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
- .addReg(PPC::X1, RegState::Kill)
- .addReg(PPC::X1)
- .addReg(PPC::X0);
- }
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+ .addReg(SPReg, RegState::Kill)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
}
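
[Editor's sketch] The isLargeFrame paths above split the (negative) frame size into a LIS/ORI pair because the store-with-update instructions only accept a small signed immediate displacement (the isInt<16> test). The split and the range test can be checked in isolation (hypothetical helper names, not part of the patch; arithmetic right shift of a negative value assumed, as on typical targets):

#include <cassert>
#include <cstdint>

// Signed 16-bit range test, as isInt<16>() performs it.
static bool fitsInt16(int32_t v) { return v >= -32768 && v <= 32767; }

int main() {
  int32_t NegFrameSize = -40000;          // frame too large for a plain STDU/STWU
  assert(!fitsInt16(NegFrameSize));

  // LIS loads (hi << 16); ORI ors in the low 16 bits; together they rebuild
  // the original 32-bit value in the scratch register.
  int32_t hi = NegFrameSize >> 16;
  uint32_t lo = NegFrameSize & 0xFFFF;
  uint32_t rebuilt = (static_cast<uint32_t>(hi) << 16) | lo;
  assert(rebuilt == static_cast<uint32_t>(NegFrameSize));
  return 0;
}
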
// Add the "machine moves" for the instructions we generated above, but in
@@ -600,22 +571,19 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
MCCFIInstruction::createDefCfaOffset(FrameLabel, NegFrameSize));
if (HasFP) {
- unsigned Reg = isPPC64 ? PPC::X31 : PPC::R31;
- Reg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
MMI.addFrameInst(
MCCFIInstruction::createOffset(FrameLabel, Reg, FPOffset));
}
if (HasBP) {
- unsigned Reg = isPPC64 ? PPC::X30 : PPC::R30;
- Reg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
MMI.addFrameInst(
MCCFIInstruction::createOffset(FrameLabel, Reg, BPOffset));
}
if (MustSaveLR) {
- unsigned Reg = isPPC64 ? PPC::LR8 : PPC::LR;
- Reg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
MMI.addFrameInst(
MCCFIInstruction::createOffset(FrameLabel, Reg, LROffset));
}
@@ -625,15 +593,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// If there is a frame pointer, copy R1 into R31
if (HasFP) {
- if (!isPPC64) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R31)
- .addReg(PPC::R1)
- .addReg(PPC::R1);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X31)
- .addReg(PPC::X1)
- .addReg(PPC::X1);
- }
+ BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+ .addReg(SPReg)
+ .addReg(SPReg);
if (needsFrameMoves) {
ReadyLabel = MMI.getContext().CreateTempSymbol();
@@ -641,9 +603,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// Mark effective beginning of when frame pointer is ready.
BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel);
- unsigned Reg = HasFP ? (isPPC64 ? PPC::X31 : PPC::R31)
- : (isPPC64 ? PPC::X1 : PPC::R1);
- Reg = MRI->getDwarfRegNum(Reg, true);
+ unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(ReadyLabel, Reg));
}
}
@@ -664,19 +624,16 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// For SVR4, don't emit a move for the CR spill slot if we haven't
// spilled CRs.
- if (Subtarget.isSVR4ABI()
- && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
- && MustSaveCRs.empty())
- continue;
+ if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+ && MustSaveCRs.empty())
+ continue;
// For 64-bit SVR4 when we have spilled CRs, the spill location
// is SP+8, not a frame-relative slot.
- if (Subtarget.isSVR4ABI()
- && Subtarget.isPPC64()
- && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
MMI.addFrameInst(MCCFIInstruction::createOffset(
Label, MRI->getDwarfRegNum(PPC::CR2, true), 8));
- continue;
+ continue;
}
int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
@@ -707,7 +664,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
RetOpcode == PPC::TCRETURNai8) &&
"Can only insert epilog into returning blocks");
- // Get alignment info so we know how to restore r1
+ // Get alignment info so we know how to restore the SP.
const MachineFrameInfo *MFI = MF.getFrameInfo();
// Get the number of bytes allocated from the FrameInfo.
@@ -715,21 +672,41 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// Get processor type.
bool isPPC64 = Subtarget.isPPC64();
- // Get operating system
+ // Get the ABI.
bool isDarwinABI = Subtarget.isDarwinABI();
+ bool isSVR4ABI = Subtarget.isSVR4ABI();
+
// Check if the link register (LR) has been saved.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
- // Do we have a frame pointer for this function?
+ // Do we have a frame pointer and/or base pointer for this function?
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
+ unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ unsigned BPReg = isPPC64 ? PPC::X30 : PPC::R30;
+ unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+ unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0;
+ unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+ const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
+ : PPC::MTLR );
+ const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
+ : PPC::LWZ );
+ const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8
+ : PPC::LIS );
+ const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8
+ : PPC::ORI );
+ const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8
+ : PPC::ADDI );
+ const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
+ : PPC::ADD4 );
+
int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
int FPOffset = 0;
if (HasFP) {
- if (Subtarget.isSVR4ABI()) {
+ if (isSVR4ABI) {
MachineFrameInfo *FFI = MF.getFrameInfo();
int FPIndex = FI->getFramePointerSaveIndex();
assert(FPIndex && "No Frame Pointer Save Slot!");
@@ -741,7 +718,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
int BPOffset = 0;
if (HasBP) {
- if (Subtarget.isSVR4ABI()) {
+ if (isSVR4ABI) {
MachineFrameInfo *FFI = MF.getFrameInfo();
int BPIndex = FI->getBasePointerSaveIndex();
assert(BPIndex && "No Base Pointer Save Slot!");
@@ -773,106 +750,76 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
FrameSize += StackAdj;
}
+ // Frames of 32KB & larger require special handling because they cannot be
+ // indexed into with a simple LD/LWZ immediate offset operand.
+ bool isLargeFrame = !isInt<16>(FrameSize);
+
if (FrameSize) {
- // The loaded (or persistent) stack pointer value is offset by the 'stwu'
- // on entry to the function. Add this offset back now.
- if (!isPPC64) {
- // If this function contained a fastcc call and GuaranteedTailCallOpt is
- // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
- // call which invalidates the stack pointer value in SP(0). So we use the
- // value of R31 in this case.
- if (FI->hasFastCall() && isInt<16>(FrameSize)) {
- assert(hasFP(MF) && "Expecting a valid the frame pointer.");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
- .addReg(PPC::R31).addImm(FrameSize);
- } else if(FI->hasFastCall()) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
- .addImm(FrameSize >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
- .addReg(PPC::R0, RegState::Kill)
- .addImm(FrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD4))
- .addReg(PPC::R1)
- .addReg(PPC::R31)
- .addReg(PPC::R0);
- } else if (isInt<16>(FrameSize) &&
- !HasBP &&
- !MFI->hasVarSizedObjects()) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
- .addReg(PPC::R1).addImm(FrameSize);
+ // In the prologue, the loaded (or persistent) stack pointer value is offset
+ // by the STDU/STDUX/STWU/STWUX instruction. Add this offset back now.
+
+ // If this function contained a fastcc call and GuaranteedTailCallOpt is
+ // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+ // call which invalidates the stack pointer value in SP(0). So we use the
+ // value of R31 in this case.
+ if (FI->hasFastCall()) {
+ assert(HasFP && "Expecting a valid frame pointer.");
+ if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(FPReg).addImm(FrameSize);
} else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ),PPC::R1)
- .addImm(0).addReg(PPC::R1);
- }
- } else {
- if (FI->hasFastCall() && isInt<16>(FrameSize)) {
- assert(hasFP(MF) && "Expecting a valid the frame pointer.");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
- .addReg(PPC::X31).addImm(FrameSize);
- } else if(FI->hasFastCall()) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
.addImm(FrameSize >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
- .addReg(PPC::X0, RegState::Kill)
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
.addImm(FrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD8))
- .addReg(PPC::X1)
- .addReg(PPC::X31)
- .addReg(PPC::X0);
- } else if (isInt<16>(FrameSize) && !HasBP &&
- !MFI->hasVarSizedObjects()) {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
- .addReg(PPC::X1).addImm(FrameSize);
- } else {
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X1)
- .addImm(0).addReg(PPC::X1);
+ BuildMI(MBB, MBBI, dl, AddInst)
+ .addReg(SPReg)
+ .addReg(FPReg)
+ .addReg(ScratchReg);
}
+ } else if (!isLargeFrame && !HasBP && !MFI->hasVarSizedObjects()) {
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(SPReg)
+ .addImm(FrameSize);
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadInst, SPReg)
+ .addImm(0)
+ .addReg(SPReg);
}
- }
- if (isPPC64) {
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
- .addImm(LROffset).addReg(PPC::X1);
-
- if (!MustSaveCRs.empty())
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), PPC::X12)
- .addImm(8).addReg(PPC::X1);
+ }
- if (HasFP)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
- .addImm(FPOffset).addReg(PPC::X1);
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+ .addImm(LROffset)
+ .addReg(SPReg);
- if (HasBP)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X30)
- .addImm(BPOffset).addReg(PPC::X1);
+ assert((isPPC64 || MustSaveCRs.empty()) &&
+ "Epilogue CR restoring supported only in 64-bit mode");
- if (!MustSaveCRs.empty())
- for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
- .addReg(PPC::X12, getKillRegState(i == e-1));
+ if (!MustSaveCRs.empty()) // will only occur for PPC64
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+ .addImm(8)
+ .addReg(SPReg);
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR8)).addReg(PPC::X0);
- } else {
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R0)
- .addImm(LROffset).addReg(PPC::R1);
+ if (HasFP)
+ BuildMI(MBB, MBBI, dl, LoadInst, FPReg)
+ .addImm(FPOffset)
+ .addReg(SPReg);
- assert(MustSaveCRs.empty() &&
- "Epilogue CR restoring supported only in 64-bit mode");
+ if (HasBP)
+ BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
+ .addImm(BPOffset)
+ .addReg(SPReg);
- if (HasFP)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R31)
- .addImm(FPOffset).addReg(PPC::R1);
+ if (!MustSaveCRs.empty()) // will only occur for PPC64
+ for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+ .addReg(TempReg, getKillRegState(i == e-1));
- if (HasBP)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R30)
- .addImm(FPOffset).addReg(PPC::R1);
-
- if (MustSaveLR)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR)).addReg(PPC::R0);
- }
+ if (MustSaveLR)
+ BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
// Callee pop calling convention. Pop parameter/linkage area. Used for tail
// call optimization
@@ -880,27 +827,20 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
MF.getFunction()->getCallingConv() == CallingConv::Fast) {
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
unsigned CallerAllocatedAmt = FI->getMinReservedArea();
- unsigned StackReg = isPPC64 ? PPC::X1 : PPC::R1;
- unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
- unsigned TmpReg = isPPC64 ? PPC::X0 : PPC::R0;
- unsigned ADDIInstr = isPPC64 ? PPC::ADDI8 : PPC::ADDI;
- unsigned ADDInstr = isPPC64 ? PPC::ADD8 : PPC::ADD4;
- unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS;
- unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI;
if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
- BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg)
- .addReg(StackReg).addImm(CallerAllocatedAmt);
+ BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+ .addReg(SPReg).addImm(CallerAllocatedAmt);
} else {
- BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
.addImm(CallerAllocatedAmt >> 16);
- BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
- .addReg(TmpReg, RegState::Kill)
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
.addImm(CallerAllocatedAmt & 0xFFFF);
- BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
- .addReg(StackReg)
+ BuildMI(MBB, MBBI, dl, AddInst)
+ .addReg(SPReg)
.addReg(FPReg)
- .addReg(TmpReg);
+ .addReg(ScratchReg);
}
} else if (RetOpcode == PPC::TCRETURNdi) {
MBBI = MBB.getLastNonDebugInstr();
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 475bde1..6ba6af6 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -876,8 +876,10 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
// target-specific node if it hasn't already been changed.
SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
- if (N->isMachineOpcode())
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
return NULL; // Already selected.
+ }
switch (N->getOpcode()) {
default: break;
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 664dd12..8da5f05 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -149,28 +149,24 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
setOperationAction(ISD::FSQRT, MVT::f32, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ if (Subtarget->hasFCPSGN()) {
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+ } else {
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ }
if (Subtarget->hasFPRND()) {
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::FROUND, MVT::f64, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-
- // frin does not implement "ties to even." Thus, this is safe only in
- // fast-math mode.
- if (TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-
- // These need to set FE_INEXACT, and use a custom inserter.
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- }
+ setOperationAction(ISD::FROUND, MVT::f32, Legal);
}
// PowerPC does not have BSWAP, CTPOP or CTTZ
@@ -560,7 +556,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setInsertFencesForAtomic(true);
- setSchedulingPreference(Sched::Hybrid);
+ if (Subtarget->enableMachineScheduler())
+ setSchedulingPreference(Sched::Source);
+ else
+ setSchedulingPreference(Sched::Hybrid);
computeRegisterProperties();
@@ -579,24 +578,47 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
}
}
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+ unsigned MaxMaxAlign) {
+ if (MaxAlign == MaxMaxAlign)
+ return;
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
+ MaxAlign = 32;
+ else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
+ MaxAlign = 16;
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ unsigned EltAlign = 0;
+ getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
+ if (EltAlign > MaxAlign)
+ MaxAlign = EltAlign;
+ if (MaxAlign == MaxMaxAlign)
+ break;
+ }
+ }
+}
+
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
- const TargetMachine &TM = getTargetMachine();
// Darwin passes everything on 4 byte boundary.
- if (TM.getSubtarget<PPCSubtarget>().isDarwin())
+ if (PPCSubTarget.isDarwin())
return 4;
// 16byte and wider vectors are passed on 16byte boundary.
- if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- if (VTy->getBitWidth() >= 128)
- return 16;
-
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
- if (PPCSubTarget.isPPC64())
- return 8;
-
- return 4;
+ unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4;
+ if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX())
+ getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16);
+ return Align;
}
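
[Editor's sketch] getMaxByValAlign walks vector, array and struct types recursively; the net effect for the common cases can be summarized with a much smaller stand-alone sketch (hypothetical helper, ignoring QPX and nested aggregates, not part of the patch):

#include <algorithm>
#include <cassert>

// Simplified restatement of the rule getByValTypeAlignment/getMaxByValAlign
// encode: a byval aggregate is aligned to the widest vector it contains,
// capped at 16 bytes with Altivec (32 with QPX); otherwise it keeps the
// pointer-size default.
static unsigned byValAlign(unsigned widestVectorBits, bool hasAltivec,
                           bool isPPC64) {
  unsigned Align = isPPC64 ? 8 : 4;
  if (hasAltivec && widestVectorBits >= 128)
    Align = std::max(Align, 16u);
  return Align;
}

int main() {
  assert(byValAlign(128, true, true) == 16);   // struct containing a v4f32
  assert(byValAlign(0, true, true) == 8);      // scalar-only struct on ppc64
  assert(byValAlign(128, false, false) == 4);  // no Altivec: 4-byte default
  return 0;
}
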
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -1386,6 +1408,10 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
+ // FIXME: TLS addresses currently use medium model code sequences,
+ // which is the most useful form. Eventually support for small and
+ // large models could be added if users need it, at the cost of
+ // additional complexity.
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
@@ -1814,6 +1840,12 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
#include "PPCGenCallingConv.inc"
+// Function whose sole purpose is to kill compiler warnings
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+ return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+}
+
bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
@@ -2276,6 +2308,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
InVals.push_back(FIN);
continue;
}
+
+ unsigned BVAlign = Flags.getByValAlign();
+ if (BVAlign > 8) {
+ ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
+ CurArgOffset = ArgOffset;
+ }
+
// All aggregates smaller than 8 bytes must be passed right-justified.
if (ObjSize < PtrByteSize)
CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
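
[Editor's sketch] The ArgOffset adjustment added above for over-aligned byval arguments is the usual round-up-to-a-multiple computation. A quick stand-alone check of the arithmetic (hypothetical values, not part of the patch):

#include <cassert>

// Round Offset up to the next multiple of Align: the ((x + a - 1) / a) * a idiom.
static unsigned roundUp(unsigned Offset, unsigned Align) {
  return ((Offset + Align - 1) / Align) * Align;
}

int main() {
  assert(roundUp(48, 16) == 48);  // already aligned: unchanged
  assert(roundUp(52, 16) == 64);  // bumped to the next 16-byte boundary
  assert(roundUp(56, 32) == 64);  // 32-byte-aligned byval aggregate
  return 0;
}
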
@@ -3448,7 +3487,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
// from allocating it), resulting in an additional register being
// allocated and an unnecessary move instruction being generated.
needsTOCRestore = true;
- } else if ((CallOpc == PPCISD::CALL) && !isLocalCall(Callee)) {
+ } else if ((CallOpc == PPCISD::CALL) &&
+ (!isLocalCall(Callee) ||
+ DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
// Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP;
}
@@ -3865,6 +3906,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
if (Size == 0)
continue;
+ unsigned BVAlign = Flags.getByValAlign();
+ if (BVAlign > 8) {
+ if (BVAlign % PtrByteSize != 0)
+ llvm_unreachable(
+ "ByVal alignment is not a multiple of the pointer size");
+
+ ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
+ }
+
// All aggregates smaller than 8 bytes must be passed right-justified.
if (Size==1 || Size==2 || Size==4) {
EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
@@ -3956,7 +4006,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
continue;
}
- switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+ switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i32:
case MVT::i64:
@@ -3979,7 +4029,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
// must be passed right-justified in the stack doubleword, and
// in the GPR, if one is available.
SDValue StoreOff;
- if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+ if (Arg.getSimpleValueType().SimpleTy == MVT::f32) {
SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
} else
@@ -4287,7 +4337,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
continue;
}
- switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+ switch (Arg.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unexpected ValueType for argument!");
case MVT::i32:
case MVT::i64:
@@ -4752,7 +4802,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
SDValue Tmp;
- switch (Op.getValueType().getSimpleVT().SimpleTy) {
+ switch (Op.getSimpleValueType().SimpleTy) {
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
case MVT::i32:
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
@@ -6676,51 +6726,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
// Restore FPSCR value.
BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
- } else if (MI->getOpcode() == PPC::FRINDrint ||
- MI->getOpcode() == PPC::FRINSrint) {
- bool isf32 = MI->getOpcode() == PPC::FRINSrint;
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
- DebugLoc dl = MI->getDebugLoc();
-
- MachineRegisterInfo &RegInfo = F->getRegInfo();
- unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
-
- // Perform the rounding.
- BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FRINS : PPC::FRIND), Dest)
- .addReg(Src);
-
- // Compare the results.
- BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FCMPUS : PPC::FCMPUD), CRReg)
- .addReg(Dest).addReg(Src);
-
- // If the results were not equal, then set the FPSCR XX bit.
- MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(It, midMBB);
- F->insert(It, exitMBB);
- exitMBB->splice(exitMBB->begin(), BB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- BuildMI(*BB, MI, dl, TII->get(PPC::BCC))
- .addImm(PPC::PRED_EQ).addReg(CRReg).addMBB(exitMBB);
-
- BB->addSuccessor(midMBB);
- BB->addSuccessor(exitMBB);
-
- BB = midMBB;
-
- // Set the FPSCR XX bit (FE_INEXACT). Note that we cannot just set
- // the FI bit here because that will not automatically set XX also,
- // and XX is what libm interprets as the FE_INEXACT flag.
- BuildMI(BB, dl, TII->get(PPC::MTFSB1)).addImm(/* 38 - 32 = */ 6);
- BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
-
- BB->addSuccessor(exitMBB);
-
- BB = exitMBB;
} else {
llvm_unreachable("Unexpected instr type to insert");
}
@@ -7061,8 +7066,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (RV.getNode() != 0) {
DCI.AddToWorklist(RV.getNode());
RV = DAGCombineFastRecip(RV, DCI);
- if (RV.getNode() != 0)
+ if (RV.getNode() != 0) {
+ // Unfortunately, RV is now NaN if the input was exactly 0. Select out
+ // this case and force the answer to 0.
+
+ EVT VT = RV.getValueType();
+
+ SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
+ if (VT.isVector()) {
+ assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
+ Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
+ }
+
+ SDValue ZeroCmp =
+ DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
+ N->getOperand(0), Zero, ISD::SETEQ);
+ DCI.AddToWorklist(ZeroCmp.getNode());
+ DCI.AddToWorklist(RV.getNode());
+
+ RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
+ ZeroCmp, Zero, RV);
return RV;
+ }
}
}
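
[Editor's sketch] The NaN the comment above refers to comes from the Newton-Raphson refinement itself: once the reciprocal-square-root estimate is +inf (input exactly 0), the refinement multiplies inf by 0. A small floating-point demonstration of the failure mode (hypothetical scalar code in place of the DAG nodes, not part of the patch):

#include <cmath>
#include <cstdio>

// One Newton-Raphson refinement step for a reciprocal estimate e of 1/d.
static double refineRecip(double d, double e) { return e * (2.0 - d * e); }

int main() {
  double x = 0.0;
  double rsqrt = 1.0 / std::sqrt(x);        // +inf when x == 0
  double est = 1.0 / rsqrt;                 // 0: initial estimate of 1/rsqrt
  double sqrtEst = refineRecip(rsqrt, est); // 0 * (2 - inf*0) -> NaN
  std::printf("%f\n", sqrtEst);             // prints nan, not sqrt(0) == 0,
                                            // hence the explicit select to 0
  return 0;
}
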
@@ -7158,7 +7183,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
- DCI.getDAGCombineLevel() == AfterLegalizeTypes &&
+ (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v4f32) &&
LD->getAlignment() < ABIAlignment) {
// This is a type-legal unaligned Altivec load.
SDValue Chain = LD->getChain();
@@ -7302,6 +7328,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
}
+
+ break;
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
@@ -7645,7 +7673,7 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
PPC::GPRCRegClass.contains(R.first)) {
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
return std::make_pair(TRI->getMatchingSuperReg(R.first,
- PPC::sub_32, &PPC::GPRCRegClass),
+ PPC::sub_32, &PPC::G8RCRegClass),
&PPC::G8RCRegClass);
}
@@ -7896,7 +7924,7 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
}
Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
- if (DisableILPPref)
+ if (DisableILPPref || PPCSubTarget.enableMachineScheduler())
return TargetLowering::getSchedulingPreference(N);
return Sched::ILP;
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index aa5e821..df3af35 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -627,6 +627,8 @@ namespace llvm {
SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
+
+ CCAssignFn *useFastISelCCs(unsigned Flag) const;
};
namespace PPC {
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index f78bb38..46db4fe 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -270,6 +270,7 @@ def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
"mtcrf $FXM, $rS", BrMCRX>,
PPC970_MicroCode, PPC970_Unit_CRU;
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
"mfocrf $rT, $FXM", SprMFCR>,
PPC970_DGroup_First, PPC970_Unit_CRU;
@@ -506,6 +507,14 @@ defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
[(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
} // Interpretation64Bit
+// For fast-isel:
+let isCodeGenOnly = 1 in {
+def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
+ "extsb $rA, $rS", IntSimple, []>, isPPC64;
+def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
+ "extsh $rA, $rS", IntSimple, []>, isPPC64;
+} // isCodeGenOnly for fast-isel
+
defm EXTSW : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
"extsw", "$rA, $rS", IntSimple,
[(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
@@ -520,16 +529,16 @@ defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzd", "$rA, $rS", IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
-defm POPCNTD : XForm_11r<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
- "popcntd", "$rA, $rS", IntGeneral,
- [(set i64:$rA, (ctpop i64:$rS))]>;
+def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
+ "popcntd $rA, $rS", IntGeneral,
+ [(set i64:$rA, (ctpop i64:$rS))]>;
// popcntw also does a population count on the high 32 bits (storing the
// results in the high 32-bits of the output). We'll ignore that here (which is
// safe because we never separately use the high part of the 64-bit registers).
-defm POPCNTW : XForm_11r<31, 378, (outs gprc:$rA), (ins gprc:$rS),
- "popcntw", "$rA, $rS", IntGeneral,
- [(set i32:$rA, (ctpop i32:$rS))]>;
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
+ "popcntw $rA, $rS", IntGeneral,
+ [(set i32:$rA, (ctpop i32:$rS))]>;
defm DIVD : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divd", "$rT, $rA, $rB", IntDivD,
@@ -569,6 +578,14 @@ defm RLDICL : MDForm_1r<30, 0,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldicl", "$rA, $rS, $SH, $MBE", IntRotateDI,
[]>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def RLDICL_32_64 : MDForm_1<30, 0,
+ (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
+ []>, isPPC64;
+// End fast-isel.
defm RLDICR : MDForm_1r<30, 1,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldicr", "$rA, $rS, $SH, $MBE", IntRotateDI,
@@ -620,6 +637,15 @@ def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
"lwax $rD, $src", LdStLHA,
[(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
PPC970_DGroup_Cracked;
+// For fast-isel:
+let isCodeGenOnly = 1, mayLoad = 1 in {
+def LWA_32 : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
+ "lwa $rD, $src", LdStLWA, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
+ "lwax $rD, $src", LdStLHA, []>, isPPC64,
+ PPC970_DGroup_Cracked;
+} // end fast-isel isCodeGenOnly
// Update forms.
let mayLoad = 1, neverHasSideEffects = 1 in {
@@ -942,6 +968,9 @@ let PPC970_Unit = 3, neverHasSideEffects = 1,
defm FCFID : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
"fcfid", "$frD, $frB", FPGeneral,
[(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+defm FCTID : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctid", "$frD, $frB", FPGeneral,
+ []>, isPPC64;
defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
"fctidz", "$frD, $frB", FPGeneral,
[(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index fdea51d..a55abe3 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -229,35 +229,45 @@ let Predicates = [HasAltivec] in {
let isCodeGenOnly = 1 in {
def DSS : DSS_Form<822, (outs),
(ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
- "dss $STRM", LdStLoad /*FIXME*/, []>;
+ "dss $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSSALL : DSS_Form<822, (outs),
(ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
- "dssall", LdStLoad /*FIXME*/, []>;
+ "dssall", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DST : DSS_Form<342, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSTT : DSS_Form<342, (outs),
(ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSTST : DSS_Form<374, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSTSTT : DSS_Form<374, (outs),
(ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
- "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DST64 : DSS_Form<342, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSTT64 : DSS_Form<342, (outs),
(ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSTST64 : DSS_Form<374, (outs),
(ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
def DSTSTT64 : DSS_Form<374, (outs),
(ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
- "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+ "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+ Deprecated<DeprecatedDST>;
}
def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index 42adc02..29233d4 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -398,6 +398,13 @@ class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let RST = 0;
}
+class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let A = 0;
+ let B = 0;
+}
+
class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
: XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -438,6 +445,17 @@ class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = 0;
}
+class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RS;
+ bits<1> L;
+
+ let Inst{6-10} = RS;
+ let Inst{15} = L;
+ let Inst{21-30} = xo;
+}
+
class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin>
: XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -534,6 +552,21 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{31} = RC;
}
+class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let RST = 0;
+ let A = 0;
+ let B = 0;
+}
+
+class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+ let RST = 0;
+ let A = 0;
+}
+
// DCB_Form - Form X instruction, used for dcb* instructions.
class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 375daee..315ad04 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -33,7 +33,7 @@
#include "llvm/Support/raw_ostream.h"
#define GET_INSTRMAP_INFO
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "PPCGenInstrInfo.inc"
using namespace llvm;
@@ -45,6 +45,9 @@ opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
cl::desc("Disable compare instruction optimization"), cl::Hidden);
+// Pin the vtable to this file.
+void PPCInstrInfo::anchor() {}
+
PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
: PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
TM(tm), RI(*TM.getSubtargetImpl()) {}
@@ -985,6 +988,10 @@ bool PPCInstrInfo::SubsumesPredicate(
if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
return false;
+ // P1 can only subsume P2 if they test the same condition register.
+ if (Pred1[1].getReg() != Pred2[1].getReg())
+ return false;
+
PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index bd72a4d..f140c41 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -78,6 +78,7 @@ class PPCInstrInfo : public PPCGenInstrInfo {
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr*> &NewMIs,
bool &NonRI, bool &SpillsVRS) const;
+ virtual void anchor();
public:
explicit PPCInstrInfo(PPCTargetMachine &TM);
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 398a11b..2bd3aad 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -785,6 +785,20 @@ multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
}
}
+multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmbase, string asmstr, InstrItinClass itin,
+ list<dag> pattern> {
+ let BaseName = asmbase in {
+ def NAME : XForm_28<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+ pattern>, RecFormRel;
+ let Defs = [CR1] in
+ def o : XForm_28<opcode, xo, OOL, IOL,
+ !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+ []>, isDOT, RecFormRel;
+ }
+}
+
multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
string asmbase, string asmstr, InstrItinClass itin,
list<dag> pattern> {
@@ -1678,6 +1692,9 @@ let isCompare = 1, neverHasSideEffects = 1 in {
let Uses = [RM] in {
let neverHasSideEffects = 1 in {
+ defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+ "fctiw", "$frD, $frB", FPGeneral,
+ []>;
defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
"fctiwz", "$frD, $frB", FPGeneral,
[(set f64:$frD, (PPCfctiwz f64:$frB))]>;
@@ -1686,23 +1703,13 @@ let Uses = [RM] in {
"frsp", "$frD, $frB", FPGeneral,
[(set f32:$frD, (fround f64:$frB))]>;
- // The frin -> nearbyint mapping is valid only in fast-math mode.
let Interpretation64Bit = 1 in
defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
"frin", "$frD, $frB", FPGeneral,
- [(set f64:$frD, (fnearbyint f64:$frB))]>;
+ [(set f64:$frD, (frnd f64:$frB))]>;
defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
"frin", "$frD, $frB", FPGeneral,
- [(set f32:$frD, (fnearbyint f32:$frB))]>;
- }
-
- // These pseudos expand to rint but also set FE_INEXACT when the result does
- // not equal the argument.
- let usesCustomInserter = 1, Defs = [RM] in { // FIXME: Model FPSCR!
- def FRINDrint : Pseudo<(outs f8rc:$frD), (ins f8rc:$frB),
- "#FRINDrint", [(set f64:$frD, (frint f64:$frB))]>;
- def FRINSrint : Pseudo<(outs f4rc:$frD), (ins f4rc:$frB),
- "#FRINSrint", [(set f32:$frD, (frint f32:$frB))]>;
+ [(set f32:$frD, (frnd f32:$frB))]>;
}
let neverHasSideEffects = 1 in {
@@ -1772,6 +1779,14 @@ defm FNEGD : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
"fneg", "$frD, $frB", FPGeneral,
[(set f64:$frD, (fneg f64:$frB))]>;
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+ "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+ [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+ "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+ [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
// Reciprocal estimates.
defm FRE : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
"fre", "$frD, $frB", FPGeneral,
@@ -1855,7 +1870,7 @@ def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
"mtspr $SPR, $RT", SprMTSPR>;
def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
- "mftb $RT, $SPR", SprMFTB>;
+ "mftb $RT, $SPR", SprMFTB>, Deprecated<DeprecatedMFTB>;
let Uses = [CTR] in {
def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
@@ -1927,6 +1942,7 @@ def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
"mtcrf $FXM, $rS", BrMCRX>,
PPC970_MicroCode, PPC970_Unit_CRU;
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
"mfocrf $rT, $FXM", SprMFCR>,
PPC970_DGroup_First, PPC970_Unit_CRU;
@@ -2280,6 +2296,12 @@ def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
(FNMSUBS $A, $C, $B)>;
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign f64:$frB, f32:$frA),
+ (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
+def : Pat<(fcopysign f32:$frB, f64:$frA),
+ (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+
include "PPCInstrAltivec.td"
include "PPCInstr64Bit.td"
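The comment above notes that FCOPYSIGN's operands need not share a type; the COPY_TO_REGCLASS in the patterns first moves the sign operand into the result's register class. As a hedged, stand-alone illustration of the semantics (hypothetical source, not part of the patch), the operation behaves like std::copysign after promoting the sign operand:

    #include <cmath>
    #include <cstdio>

    int main() {
      double Mag = 2.5;   // magnitude operand ($frB in the patterns above)
      float  Sgn = -1.0f; // sign operand ($frA), deliberately a narrower type
      // Promote the f32 sign source to f64, then copy its sign bit onto Mag;
      // this mirrors (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB).
      double Res = std::copysign(Mag, static_cast<double>(Sgn));
      std::printf("%f\n", Res); // prints -2.500000
      return 0;
    }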
@@ -2300,6 +2322,35 @@ def EIEIO : XForm_24_eieio<31, 854, (outs), (ins),
def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
"wait $L", LdStLoad, []>;
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+ "mtmsr $RS, $L", SprMTMSR>;
+
+def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
+ "mfmsr $RT", SprMFMSR, []>;
+
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+ "mtmsrd $RS, $L", SprMTMSRD>;
+
+def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
+ "slbie $RB", SprSLBIE, []>;
+
+def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
+ "slbmte $RS, $RB", SprSLBMTE, []>;
+
+def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
+ "slbmfee $RT, $RB", SprSLBMFEE, []>;
+
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", SprSLBIA, []>;
+
+def TLBSYNC : XForm_0<31, 566, (outs), (ins),
+ "tlbsync", SprTLBSYNC, []>;
+
+def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
+ "tlbiel $RB", SprTLBIEL, []>;
+
+def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
+ "tlbie $RB,$RS", SprTLBIE, []>;
+
//===----------------------------------------------------------------------===//
// PowerPC Assembler Instruction Aliases
//
@@ -2368,6 +2419,46 @@ def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
+def : InstAlias<"mfsprg $RT, 0", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg $RT, 1", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg $RT, 2", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg $RT, 3", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mfsprg0 $RT", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg1 $RT", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg2 $RT", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg3 $RT", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mtsprg 0, $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg 1, $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg 2, $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg 3, $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtsprg0 $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg1 $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg2 $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg3 $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+
+def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
+def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+
+def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
+def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+
+def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
+def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
+def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
+def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
+
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+
def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index d69aa4a..f61c8bf 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -69,7 +69,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
if (MO.isGlobal()) {
StubSym =
MachineModuleInfoImpl::
- StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+ StubValueTy(AP.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
} else {
Name.erase(Name.end()-5, Name.end());
@@ -95,7 +95,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
if (StubSym.getPointer() == 0) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym = MachineModuleInfoImpl::
- StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+ StubValueTy(AP.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
return Sym;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index adba613..19ccbfc 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -69,6 +69,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST)
ImmToIdxMap[PPC::STH] = PPC::STHX; ImmToIdxMap[PPC::STW] = PPC::STWX;
ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX;
ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+ ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
// 64-bit
ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
@@ -532,6 +533,7 @@ static bool usesIXAddr(const MachineInstr &MI) {
default:
return false;
case PPC::LWA:
+ case PPC::LWA_32:
case PPC::LD:
case PPC::STD:
return true;
@@ -689,14 +691,6 @@ unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
}
-unsigned PPCRegisterInfo::getEHExceptionRegister() const {
- return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
-}
-
-unsigned PPCRegisterInfo::getEHHandlerRegister() const {
- return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
-}
-
unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
if (!hasBasePointer(MF))
return getFrameRegister(MF);
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index d02af9e..dd3bb40 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -97,10 +97,6 @@ public:
bool hasBasePointer(const MachineFunction &MF) const;
bool canRealignStack(const MachineFunction &MF) const;
bool needsStackRealignment(const MachineFunction &MF) const;
-
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 660c0c3..92ba69c 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -108,6 +108,14 @@ def VecPerm : InstrItinClass;
def VecFPRound : InstrItinClass;
def VecVSL : InstrItinClass;
def VecVSR : InstrItinClass;
+def SprMTMSRD : InstrItinClass;
+def SprSLIE : InstrItinClass;
+def SprSLBIE : InstrItinClass;
+def SprSLBMTE : InstrItinClass;
+def SprSLBMFEE : InstrItinClass;
+def SprSLBIA : InstrItinClass;
+def SprTLBIEL : InstrItinClass;
+def SprTLBIE : InstrItinClass;
//===----------------------------------------------------------------------===//
// Processor instruction itineraries.
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 8d5838e..1612cd2 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -14,39 +14,8 @@
//===----------------------------------------------------------------------===//
// Functional units on the PowerPC A2 chip sets
//
-def IU0to3_0 : FuncUnit; // Fetch unit 1 to 4 slot 1
-def IU0to3_1 : FuncUnit; // Fetch unit 1 to 4 slot 2
-def IU0to3_2 : FuncUnit; // Fetch unit 1 to 4 slot 3
-def IU0to3_3 : FuncUnit; // Fetch unit 1 to 4 slot 4
-def IU4_0 : FuncUnit; // Instruction buffer slot 1
-def IU4_1 : FuncUnit; // Instruction buffer slot 2
-def IU4_2 : FuncUnit; // Instruction buffer slot 3
-def IU4_3 : FuncUnit; // Instruction buffer slot 4
-def IU4_4 : FuncUnit; // Instruction buffer slot 5
-def IU4_5 : FuncUnit; // Instruction buffer slot 6
-def IU4_6 : FuncUnit; // Instruction buffer slot 7
-def IU4_7 : FuncUnit; // Instruction buffer slot 8
-def IU5 : FuncUnit; // Dependency resolution
-def IU6 : FuncUnit; // Instruction issue
-def RF0 : FuncUnit;
-def XRF1 : FuncUnit;
-def XEX1 : FuncUnit; // Execution stage 1 for the XU pipeline
-def XEX2 : FuncUnit; // Execution stage 2 for the XU pipeline
-def XEX3 : FuncUnit; // Execution stage 3 for the XU pipeline
-def XEX4 : FuncUnit; // Execution stage 4 for the XU pipeline
-def XEX5 : FuncUnit; // Execution stage 5 for the XU pipeline
-def XEX6 : FuncUnit; // Execution stage 6 for the XU pipeline
-def FRF1 : FuncUnit;
-def FEX1 : FuncUnit; // Execution stage 1 for the FU pipeline
-def FEX2 : FuncUnit; // Execution stage 2 for the FU pipeline
-def FEX3 : FuncUnit; // Execution stage 3 for the FU pipeline
-def FEX4 : FuncUnit; // Execution stage 4 for the FU pipeline
-def FEX5 : FuncUnit; // Execution stage 5 for the FU pipeline
-def FEX6 : FuncUnit; // Execution stage 6 for the FU pipeline
-
-def CR_Bypass : Bypass; // The bypass for condition regs.
-//def GPR_Bypass : Bypass; // The bypass for general-purpose regs.
-//def FPR_Bypass : Bypass; // The bypass for floating-point regs.
+def XU : FuncUnit; // XU pipeline
+def FU : FuncUnit; // FP pipeline
//
// This file defines the itinerary class data for the PPC A2 processor.
@@ -55,699 +24,119 @@ def CR_Bypass : Bypass; // The bypass for condition regs.
def PPCA2Itineraries : ProcessorItineraries<
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3,
- IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7,
- IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6,
- FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6],
- [CR_Bypass, GPR_Bypass, FPR_Bypass], [
- InstrItinData<IntSimple , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntGeneral , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntCompare , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntDivW , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<38, [XEX6]>],
- [53, 7, 7],
- [NoBypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMFFS , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMTFSB0 , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHW , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulHWU , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntMulLI , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotate , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotateD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntRotateDI , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntShift , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntTrapW , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<IntTrapD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<BrB , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<BrCR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [CR_Bypass, CR_Bypass, CR_Bypass]>,
- InstrItinData<BrMCR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [CR_Bypass, CR_Bypass, CR_Bypass]>,
- InstrItinData<BrMCRX , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7, 7],
- [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStDCBA , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 11],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStDCBF , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 11],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStDCBI , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 11],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLoad , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLoadUpd , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLDU , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStStore , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStStoreUpd, [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStICBI , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTFD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7, 7],
- [NoBypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<LdStSTFDU , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7, 7],
- [NoBypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<LdStLFD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7, 7],
- [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLFDU , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7, 7],
- [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLHA , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLHAU , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLMW , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [14, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStLWARX , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [26, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTDU , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [13, 7],
- [GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStSTDCX , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [26, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSTWCX , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [26, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<LdStSync , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<12, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>]>,
- InstrItinData<SprISYNC , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
- InstrItinData<SprMFSR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7],
- [GPR_Bypass, NoBypass]>,
- InstrItinData<SprMTMSR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprTLBSYNC , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
- InstrItinData<SprMFCR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [10, 7],
- [GPR_Bypass, CR_Bypass]>,
- InstrItinData<SprMFMSR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7],
- [GPR_Bypass, NoBypass]>,
- InstrItinData<SprMFSPR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMFTB , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
- [29, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSPR , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
- [15, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprMTSRIN , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
- [29, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprRFI , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
- [29, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<SprSC , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
- InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
- InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
- InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
- [29, 7],
- [NoBypass, GPR_Bypass]>,
- InstrItinData<FPGeneral , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
- InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
- InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
- InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
- [15, 7, 7],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPAddSub , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
- InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
- InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
- InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
- [15, 7, 7],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPCompare , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
- InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
- InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
- InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
- [13, 7, 7],
- [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivD , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<71, [FRF1], 0>,
- InstrStage<71, [FEX1], 0>,
- InstrStage<71, [FEX2], 0>,
- InstrStage<71, [FEX3], 0>,
- InstrStage<71, [FEX4], 0>,
- InstrStage<71, [FEX5], 0>,
- InstrStage<71, [FEX6]>],
- [86, 7, 7],
- [NoBypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPDivS , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<58, [FRF1], 0>,
- InstrStage<58, [FEX1], 0>,
- InstrStage<58, [FEX2], 0>,
- InstrStage<58, [FEX3], 0>,
- InstrStage<58, [FEX4], 0>,
- InstrStage<58, [FEX5], 0>,
- InstrStage<58, [FEX6]>],
- [73, 7, 7],
- [NoBypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPSqrt , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<68, [FRF1], 0>,
- InstrStage<68, [FEX1], 0>,
- InstrStage<68, [FEX2], 0>,
- InstrStage<68, [FEX3], 0>,
- InstrStage<68, [FEX4], 0>,
- InstrStage<68, [FEX5], 0>,
- InstrStage<68, [FEX6]>],
- [86, 7], // FIXME: should be [86, 7] for double
- // and [82, 7] for single. Likewise,
- // the FEX? cycle count should be 68
- // for double and 64 for single.
- [NoBypass, FPR_Bypass]>,
- InstrItinData<FPFused , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
- InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
- InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
- InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
- [15, 7, 7, 7],
- [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
- InstrItinData<FPRes , [InstrStage<4,
- [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
- InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
- IU4_4, IU4_5, IU4_6, IU4_7]>,
- InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
- InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
- InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
- InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
- InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
- [15, 7],
- [FPR_Bypass, FPR_Bypass]>
+ [XU, FU], [], [
+ InstrItinData<IntSimple , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<IntGeneral , [InstrStage<1, [XU]>],
+ [2, 1, 1]>,
+ InstrItinData<IntCompare , [InstrStage<1, [XU]>],
+ [2, 1, 1]>,
+ InstrItinData<IntDivW , [InstrStage<1, [XU]>],
+ [39, 1, 1]>,
+ InstrItinData<IntDivD , [InstrStage<1, [XU]>],
+ [71, 1, 1]>,
+ InstrItinData<IntMulHW , [InstrStage<1, [XU]>],
+ [5, 1, 1]>,
+ InstrItinData<IntMulHWU , [InstrStage<1, [XU]>],
+ [5, 1, 1]>,
+ InstrItinData<IntMulLI , [InstrStage<1, [XU]>],
+ [6, 1, 1]>,
+ InstrItinData<IntRotate , [InstrStage<1, [XU]>],
+ [2, 1, 1]>,
+ InstrItinData<IntRotateD , [InstrStage<1, [XU]>],
+ [2, 1, 1]>,
+ InstrItinData<IntRotateDI , [InstrStage<1, [XU]>],
+ [2, 1, 1]>,
+ InstrItinData<IntShift , [InstrStage<1, [XU]>],
+ [2, 1, 1]>,
+ InstrItinData<IntTrapW , [InstrStage<1, [XU]>],
+ [2, 1]>,
+ InstrItinData<IntTrapD , [InstrStage<1, [XU]>],
+ [2, 1]>,
+ InstrItinData<BrB , [InstrStage<1, [XU]>],
+ [6, 1, 1]>,
+ InstrItinData<BrCR , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<BrMCR , [InstrStage<1, [XU]>],
+ [5, 1, 1]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStDCBA , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStDCBF , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStDCBI , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStLoad , [InstrStage<1, [XU]>],
+ [6, 1, 1]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [XU]>],
+ [6, 8, 1, 1]>,
+ InstrItinData<LdStLDU , [InstrStage<1, [XU]>],
+ [6, 1, 1]>,
+ InstrItinData<LdStStore , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [XU]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<LdStICBI, [InstrStage<1, [XU]>],
+ [16, 1, 1]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [XU]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<LdStLFD , [InstrStage<1, [XU]>],
+ [7, 1, 1]>,
+ InstrItinData<LdStLFDU , [InstrStage<1, [XU]>],
+ [7, 9, 1, 1]>,
+ InstrItinData<LdStLHA , [InstrStage<1, [XU]>],
+ [6, 1, 1]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [XU]>],
+ [6, 8, 1, 1]>,
+ InstrItinData<LdStLWARX , [InstrStage<1, [XU]>],
+ [82, 1, 1]>, // L2 latency
+ InstrItinData<LdStSTD , [InstrStage<1, [XU]>],
+ [1, 1, 1]>,
+ InstrItinData<LdStSTDU , [InstrStage<1, [XU]>],
+ [2, 1, 1, 1]>,
+ InstrItinData<LdStSTDCX , [InstrStage<1, [XU]>],
+ [82, 1, 1]>, // L2 latency
+ InstrItinData<LdStSTWCX , [InstrStage<1, [XU]>],
+ [82, 1, 1]>, // L2 latency
+ InstrItinData<LdStSync , [InstrStage<1, [XU]>],
+ [6]>,
+ InstrItinData<SprISYNC , [InstrStage<1, [XU]>],
+ [16]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [XU]>],
+ [16, 1]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [XU]>],
+ [6, 1]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [XU]>],
+ [4, 1]>,
+ InstrItinData<SprMFSPR , [InstrStage<1, [XU]>],
+ [6, 1]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [XU]>],
+ [4, 1]>,
+ InstrItinData<SprMTSPR , [InstrStage<1, [XU]>],
+ [6, 1]>,
+ InstrItinData<SprRFI , [InstrStage<1, [XU]>],
+ [16]>,
+ InstrItinData<SprSC , [InstrStage<1, [XU]>],
+ [16]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [FU]>],
+ [6, 1, 1]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [FU]>],
+ [6, 1, 1]>,
+ InstrItinData<FPCompare , [InstrStage<1, [FU]>],
+ [5, 1, 1]>,
+ InstrItinData<FPDivD , [InstrStage<1, [FU]>],
+ [72, 1, 1]>,
+ InstrItinData<FPDivS , [InstrStage<1, [FU]>],
+ [59, 1, 1]>,
+ InstrItinData<FPSqrt , [InstrStage<1, [FU]>],
+ [69, 1, 1]>,
+ InstrItinData<FPFused , [InstrStage<1, [FU]>],
+ [6, 1, 1, 1]>,
+ InstrItinData<FPRes , [InstrStage<1, [FU]>],
+ [6, 1]>
]>;
// ===---------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index 9bb779a..c189b9e 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -36,6 +36,8 @@ def CFX_0 : FuncUnit; // CFX pipeline
def LSU_0 : FuncUnit; // LSU pipeline
def FPU_0 : FuncUnit; // FPU pipeline
+def CR_Bypass : Bypass;
+
def PPCE500mcItineraries : ProcessorItineraries<
[DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
[CR_Bypass, GPR_Bypass, FPR_Bypass], [
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index d7e11ac..7a24d20 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -39,6 +39,7 @@ def CFX_1 : FuncUnit; // CFX pipeline stage 1
// def LSU_0 : FuncUnit; // LSU pipeline
// def FPU_0 : FuncUnit; // FPU pipeline
+// def CR_Bypass : Bypass;
def PPCE5500Itineraries : ProcessorItineraries<
[DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 12d0326..7231ab1 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -15,6 +15,7 @@
#include "PPC.h"
#include "PPCRegisterInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Function.h"
@@ -74,6 +75,7 @@ void PPCSubtarget::initializeEnvironment() {
Use64BitRegs = false;
HasAltivec = false;
HasQPX = false;
+ HasFCPSGN = false;
HasFSQRT = false;
HasFRE = false;
HasFRES = false;
@@ -88,6 +90,8 @@ void PPCSubtarget::initializeEnvironment() {
HasPOPCNTD = false;
HasLDBRX = false;
IsBookE = false;
+ DeprecatedMFTB = false;
+ DeprecatedDST = false;
HasLazyResolverStubs = false;
IsJITCodeModel = false;
}
@@ -163,14 +167,7 @@ bool PPCSubtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const {
- // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here,
- // but we can't because we can't reassign the cr registers. There is a
- // dependence between the cr register and the RLWINM instruction used
- // to extract its value which the anti-dependency breaker can't currently
- // see. Maybe we should make a late-expanded pseudo to encode this dependency.
- // (the relevant code is in PPCDAGToDAGISel::SelectSETCC)
-
- Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+ Mode = TargetSubtargetInfo::ANTIDEP_ALL;
CriticalPathRCs.clear();
@@ -179,9 +176,44 @@ bool PPCSubtarget::enablePostRAScheduler(
else
CriticalPathRCs.push_back(&PPC::GPRCRegClass);
- CriticalPathRCs.push_back(&PPC::F8RCRegClass);
- CriticalPathRCs.push_back(&PPC::VRRCRegClass);
-
return OptLevel >= CodeGenOpt::Default;
}
+// Embedded cores need aggressive scheduling.
+static bool needsAggressiveScheduling(unsigned Directive) {
+ switch (Directive) {
+ default: return false;
+ case PPC::DIR_440:
+ case PPC::DIR_A2:
+ case PPC::DIR_E500mc:
+ case PPC::DIR_E5500:
+ return true;
+ }
+}
+
+bool PPCSubtarget::enableMachineScheduler() const {
+ // Enable MI scheduling for the embedded cores.
+ // FIXME: Enable this for all cores (some additional modeling
+ // may be necessary).
+ return needsAggressiveScheduling(DarwinDirective);
+}
+
+void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const {
+ if (needsAggressiveScheduling(DarwinDirective)) {
+ Policy.OnlyTopDown = false;
+ Policy.OnlyBottomUp = false;
+ }
+
+ // Spilling is generally expensive on all PPC cores, so always enable
+ // register-pressure tracking.
+ Policy.ShouldTrackPressure = true;
+}
+
+bool PPCSubtarget::useAA() const {
+ // Use AA during code generation for the embedded cores.
+ return needsAggressiveScheduling(DarwinDirective);
+}
+
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 3f3fc0e..c863a6e 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -76,6 +76,8 @@ protected:
bool IsPPC64;
bool HasAltivec;
bool HasQPX;
+ bool HasVSX;
+ bool HasFCPSGN;
bool HasFSQRT;
bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
bool HasRecipPrec;
@@ -87,6 +89,8 @@ protected:
bool HasPOPCNTD;
bool HasLDBRX;
bool IsBookE;
+ bool DeprecatedMFTB;
+ bool DeprecatedDST;
bool HasLazyResolverStubs;
bool IsJITCodeModel;
bool IsLittleEndian;
@@ -171,6 +175,7 @@ public:
bool isLittleEndian() const { return IsLittleEndian; }
// Specific obvious features.
+ bool hasFCPSGN() const { return HasFCPSGN; }
bool hasFSQRT() const { return HasFSQRT; }
bool hasFRE() const { return HasFRE; }
bool hasFRES() const { return HasFRES; }
@@ -188,6 +193,8 @@ public:
bool hasPOPCNTD() const { return HasPOPCNTD; }
bool hasLDBRX() const { return HasLDBRX; }
bool isBookE() const { return IsBookE; }
+ bool isDeprecatedMFTB() const { return DeprecatedMFTB; }
+ bool isDeprecatedDST() const { return DeprecatedDST; }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -205,6 +212,14 @@ public:
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const;
+
+ // Scheduling customization.
+ bool enableMachineScheduler() const;
+ void overrideSchedPolicy(MachineSchedPolicy &Policy,
+ MachineInstr *begin,
+ MachineInstr *end,
+ unsigned NumRegionInstrs) const;
+ bool useAA() const;
};
} // End llvm namespace
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 0000000..e876be1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,23 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETSTREAMER_H
+#define PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+ virtual ~PPCTargetStreamer();
+ virtual void emitTCEntry(const MCSymbol &S) = 0;
+};
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2504ba7..8879630 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -77,6 +77,7 @@ public:
/// \name Scalar TTI Implementations
/// @{
virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+ virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
/// @}
@@ -129,6 +130,14 @@ PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
return PSK_Software;
}
+void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+ if (ST->getDarwinDirective() == PPC::DIR_A2) {
+ // The A2 is in-order with a deep pipeline, and concatenation unrolling
+ // helps expose latency-hiding opportunities to the instruction scheduler.
+ UP.Partial = UP.Runtime = true;
+ }
+}
+
unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
if (Vector && !ST->hasAltivec())
return 0;
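The new unrolling preference enables partial and runtime unrolling on the A2, i.e. concatenating several copies of the loop body so the in-order, deep pipeline has independent work to overlap. A minimal sketch of the transformation (hypothetical source, not taken from the patch):

    // Original loop:  for (unsigned i = 0; i < N; ++i) X[i] += A * Y[i];
    // After unroll-by-2 with a runtime remainder:
    void saxpy_unrolled(float *X, const float *Y, float A, unsigned N) {
      unsigned i = 0;
      for (; i + 1 < N; i += 2) { // concatenated copies of the body (UP.Partial)
        X[i]     += A * Y[i];
        X[i + 1] += A * Y[i + 1];
      }
      for (; i < N; ++i)          // remainder for odd trip counts (UP.Runtime)
        X[i] += A * Y[i];
    }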
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 6b374cb..025b28e 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -29,11 +29,13 @@ FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
FunctionPass *createR600TextureIntrinsicsReplacer();
FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
FunctionPass *createR600Packetizer(TargetMachine &tm);
FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
// SI Passes
+FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
@@ -43,7 +45,6 @@ FunctionPass *createSIInsertWaits(TargetMachine &tm);
// Passes common to R600 and SI
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
-FunctionPass *createAMDGPUIndirectAddressingPass(TargetMachine &tm);
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 0048e25..182235b 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -21,8 +21,18 @@ def FeatureDumpCode : SubtargetFeature <"DumpCode",
"true",
"Dump MachineInstrs in the CodeEmitter">;
+def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
+ "EnableIRStructurizer",
+ "false",
+ "Disable IR Structurizer">;
+
// Target features
+def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
+ "EnableIfCvt",
+ "false",
+ "Disable the if conversion pass">;
+
def FeatureFP64 : SubtargetFeature<"fp64",
"FP64",
"true",
@@ -82,6 +92,8 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[Feature64BitPtr, FeatureFP64]>;
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+ [Feature64BitPtr, FeatureFP64]>;
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index e039b77..67bdba2 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -45,32 +45,60 @@ extern "C" void LLVMInitializeR600AsmPrinter() {
TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
}
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer)
+{
+ DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode() &&
+ ! Streamer.hasRawTextSupport();
+}
+
/// We need to override this function so we can avoid
/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
- const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
- if (STM.dumpCode()) {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- MF.dump();
-#endif
- }
SetupMachineFunction(MF);
if (OutStreamer.hasRawTextSupport()) {
OutStreamer.EmitRawText("@" + MF.getName() + ":");
}
- const MCSectionELF *ConfigSection = getObjFileLowering().getContext()
- .getELFSection(".AMDGPU.config",
+ MCContext &Context = getObjFileLowering().getContext();
+ const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
ELF::SHT_PROGBITS, 0,
SectionKind::getReadOnly());
OutStreamer.SwitchSection(ConfigSection);
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
EmitProgramInfoSI(MF);
} else {
EmitProgramInfoR600(MF);
}
+
+ DisasmLines.clear();
+ HexLines.clear();
+ DisasmLineMaxLen = 0;
+
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
EmitFunctionBody();
+
+ if (STM.dumpCode()) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ MF.dump();
+#endif
+
+ if (DisasmEnabled) {
+ OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
+ ELF::SHT_NOTE, 0,
+ SectionKind::getReadOnly()));
+
+ for (size_t i = 0; i < DisasmLines.size(); ++i) {
+ std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+ Comment += " ; " + HexLines[i] + "\n";
+
+ OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+ OutStreamer.EmitBytes(StringRef(Comment));
+ }
+ }
+ }
+
return false;
}
@@ -139,6 +167,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
+ const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
unsigned MaxSGPR = 0;
unsigned MaxVGPR = 0;
bool VCCUsed = false;
@@ -154,7 +183,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
unsigned numOperands = MI.getNumOperands();
for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
- MachineOperand & MO = MI.getOperand(op_idx);
+ MachineOperand &MO = MI.getOperand(op_idx);
unsigned maxUsed;
unsigned width = 0;
bool isSGPR = false;
@@ -168,8 +197,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
VCCUsed = true;
continue;
}
+
switch (reg) {
default: break;
+ case AMDGPU::SCC:
case AMDGPU::EXEC:
case AMDGPU::M0:
continue;
@@ -202,6 +233,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
} else if (AMDGPU::VReg_256RegClass.contains(reg)) {
isSGPR = false;
width = 8;
+ } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+ isSGPR = true;
+ width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(reg)) {
isSGPR = false;
width = 16;
@@ -234,13 +268,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
OutStreamer.EmitIntValue(RsrcReg, 4);
OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
+ unsigned LDSAlignShift;
+ if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+ // LDS is allocated in 64 dword blocks
+ LDSAlignShift = 8;
+ } else {
+ // LDS is allocated in 128 dword blocks
+ LDSAlignShift = 9;
+ }
+ unsigned LDSBlocks =
+ RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+
if (MFI->ShaderType == ShaderType::COMPUTE) {
OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4);
+ OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
}
if (MFI->ShaderType == ShaderType::PIXEL) {
OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4);
+ OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
}
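The LDS size is now rounded up to whole allocation blocks whose size depends on the generation: with LDSAlignShift = 8 a block is 256 bytes (64 dwords), with 9 it is 512 bytes (128 dwords). A small worked example of the computation, using a hypothetical 1000-byte LDS footprint:

    #include <cstdio>

    // Mirrors RoundUpToAlignment(LDSSize, 1 << Shift) >> Shift from the code above.
    static unsigned ldsBlocks(unsigned LDSSize, unsigned LDSAlignShift) {
      unsigned BlockSize = 1u << LDSAlignShift;
      return (LDSSize + BlockSize - 1) / BlockSize;
    }

    int main() {
      std::printf("%u\n", ldsBlocks(1000, 8)); // 4 blocks of 256 bytes (pre-Sea Islands)
      std::printf("%u\n", ldsBlocks(1000, 9)); // 2 blocks of 512 bytes (Sea Islands and later)
      return 0;
    }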
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index f425ef4..05dc9bb 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,14 +16,15 @@
#define AMDGPU_ASMPRINTER_H
#include "llvm/CodeGen/AsmPrinter.h"
+#include <string>
+#include <vector>
namespace llvm {
class AMDGPUAsmPrinter : public AsmPrinter {
public:
- explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer) { }
+ explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
virtual bool runOnMachineFunction(MachineFunction &MF);
@@ -38,6 +39,11 @@ public:
/// Implemented in AMDGPUMCInstLower.cpp
virtual void EmitInstruction(const MachineInstr *MI);
+
+protected:
+ bool DisasmEnabled;
+ std::vector<std::string> DisasmLines, HexLines;
+ size_t DisasmLineMaxLen;
};
} // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index fc95d58..65cdb24 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -19,12 +19,13 @@ def CC_SI : CallingConv<[
CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
- SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+ SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+ SGPR16
]>>>,
CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
[ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ]
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
>>>,
CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
@@ -32,21 +33,33 @@ def CC_SI : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
- ]>>>
+ ]>>>,
+
+ CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+ [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+ [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+ >>>
]>;
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+ CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+ T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+ T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+ T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+ T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+ T30_XYZW, T31_XYZW, T32_XYZW
+ ]>>>
+]>;
+
// Calling convention for compute kernels
def CC_AMDGPU_Kernel : CallingConv<[
- CCIfType<[v4i32, v4f32], CCAssignToStack <16, 16>>,
- CCIfType<[i64, f64, v2f32, v2i32], CCAssignToStack < 8, 8>>,
- CCIfType<[i32, f32], CCAssignToStack < 4, 4>>,
- CCIfType<[i16], CCAssignToStack < 2, 4>>,
- CCIfType<[i8], CCAssignToStack < 1, 4>>
+ CCCustom<"allocateStack">
]>;
def CC_AMDGPU : CallingConv<[
- CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() == "
+ CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
"State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
"ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
@@ -55,5 +68,7 @@ def CC_AMDGPU : CallingConv<[
"State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->"
"ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
- ".getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>
+ ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>,
+ CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+ ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>>
]>;
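
CC_AMDGPU_Kernel now routes every argument through the custom allocateStack hook (defined later in this patch), which hands each value the next stack offset sized to the type and aligned to the argument's original alignment. A minimal model of that sequential slot assignment, where StackAllocator is a placeholder standing in for CCState and the alignments are illustrative only (editorial sketch, not the CCState API):

    #include <cassert>
    #include <cstdint>

    // Toy model of sequential stack-slot assignment: each argument gets the
    // next offset aligned to its alignment, as CCState::AllocateStack does
    // when called from the allocateStack hook.
    struct StackAllocator {
      uint64_t NextOffset = 0;
      uint64_t allocate(uint64_t Size, uint64_t Align) {
        uint64_t Offset = (NextOffset + Align - 1) / Align * Align;
        NextOffset = Offset + Size;
        return Offset;
      }
    };

    int main() {
      StackAllocator A;
      assert(A.allocate(4, 4) == 0);    // i32 argument
      assert(A.allocate(1, 1) == 4);    // i8 argument
      assert(A.allocate(16, 16) == 16); // v4i32 argument, realigned to 16
      return 0;
    }
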
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index f222901..a989135 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -77,6 +77,7 @@ private:
bool isLocalLoad(const LoadSDNode *N) const;
bool isRegionLoad(const LoadSDNode *N) const;
+ const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
bool SelectGlobalValueVariableOffset(SDValue Addr,
SDValue &BaseReg, SDValue& Offset);
@@ -102,6 +103,37 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
}
+/// \brief Determine the register class for \p OpNo
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \p OpNo or NULL if the register class cannot be
+/// determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+ unsigned OpNo) const {
+ if (!N->isMachineOpcode()) {
+ return NULL;
+ }
+ switch (N->getMachineOpcode()) {
+ default: {
+ const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
+ unsigned OpIdx = Desc.getNumDefs() + OpNo;
+ if (OpIdx >= Desc.getNumOperands())
+ return NULL;
+ int RegClass = Desc.OpInfo[OpIdx].RegClass;
+ if (RegClass == -1) {
+ return NULL;
+ }
+ return TM.getRegisterInfo()->getRegClass(RegClass);
+ }
+ case AMDGPU::REG_SEQUENCE: {
+ const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ unsigned SubRegIdx =
+ dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue();
+ return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
+ }
+ }
+}
+
SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
return CurDAG->getTargetConstant(Imm, MVT::i32);
}
@@ -161,130 +193,94 @@ bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
}
SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo*>(TM.getInstrInfo());
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
return NULL; // Already selected.
}
switch (Opc) {
default: break;
- case AMDGPUISD::CONST_ADDRESS: {
- for (SDNode::use_iterator I = N->use_begin(), Next = llvm::next(I);
- I != SDNode::use_end(); I = Next) {
- Next = llvm::next(I);
- if (!I->isMachineOpcode()) {
- continue;
- }
- unsigned Opcode = I->getMachineOpcode();
- bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
- int SrcIdx = I.getOperandNo();
- int SelIdx;
- // Unlike MachineInstrs, SDNodes do not have results in their operand
- // list, so we need to increment the SrcIdx, since
- // R600InstrInfo::getOperandIdx is based on the MachineInstr indices.
- if (HasDst) {
- SrcIdx++;
- }
-
- SelIdx = TII->getSelIdx(I->getMachineOpcode(), SrcIdx);
- if (SelIdx < 0) {
- continue;
- }
-
- SDValue CstOffset;
- if (N->getValueType(0).isVector() ||
- !SelectGlobalValueConstantOffset(N->getOperand(0), CstOffset))
- continue;
-
- // Gather constants values
- int SrcIndices[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
- };
- std::vector<unsigned> Consts;
- for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
- int OtherSrcIdx = SrcIndices[i];
- int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
- if (OtherSrcIdx < 0 || OtherSelIdx < 0) {
+ case ISD::BUILD_VECTOR: {
+ unsigned RegClassID;
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ const AMDGPURegisterInfo *TRI =
+ static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+ const SIRegisterInfo *SIRI =
+ static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+ EVT VT = N->getValueType(0);
+ unsigned NumVectorElts = VT.getVectorNumElements();
+ assert(VT.getVectorElementType().bitsEq(MVT::i32));
+ if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ bool UseVReg = true;
+ for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+ U != E; ++U) {
+ if (!U->isMachineOpcode()) {
continue;
}
- if (HasDst) {
- OtherSrcIdx--;
- OtherSelIdx--;
+ const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+ if (!RC) {
+ continue;
}
- if (RegisterSDNode *Reg =
- dyn_cast<RegisterSDNode>(I->getOperand(OtherSrcIdx))) {
- if (Reg->getReg() == AMDGPU::ALU_CONST) {
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(I->getOperand(OtherSelIdx));
- Consts.push_back(Cst->getZExtValue());
- }
+ if (SIRI->isSGPRClass(RC)) {
+ UseVReg = false;
}
}
-
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
- Consts.push_back(Cst->getZExtValue());
- if (!TII->fitsConstReadLimitations(Consts))
- continue;
-
- // Convert back to SDNode indices
- if (HasDst) {
- SrcIdx--;
- SelIdx--;
+ switch(NumVectorElts) {
+ case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+ AMDGPU::SReg_32RegClassID;
+ break;
+ case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+ AMDGPU::SReg_64RegClassID;
+ break;
+ case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+ AMDGPU::SReg_128RegClassID;
+ break;
+ case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+ AMDGPU::SReg_256RegClassID;
+ break;
+ case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+ AMDGPU::SReg_512RegClassID;
+ break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
- std::vector<SDValue> Ops;
- for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
- if (i == SrcIdx) {
- Ops.push_back(CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32));
- } else if (i == SelIdx) {
- Ops.push_back(CstOffset);
- } else {
- Ops.push_back(I->getOperand(i));
- }
+ } else {
+ // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+ // sequence, which adds a 128-bit register copy when going through the
+ // TwoAddressInstructions pass. We want to avoid 128-bit copies as much as
+ // possible because they can't be bundled by our scheduler.
+ switch(NumVectorElts) {
+ case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+ case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+ default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
}
- CurDAG->UpdateNodeOperands(*I, Ops.data(), Ops.size());
- }
- break;
- }
- case ISD::BUILD_VECTOR: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
- break;
}
- unsigned RegClassID;
- switch(N->getValueType(0).getVectorNumElements()) {
- case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
- case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
- default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+ SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+ if (NumVectorElts == 1) {
+ return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
+ VT.getVectorElementType(),
+ N->getOperand(0), RegClass);
}
- // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
- // that adds a 128 bits reg copy when going through TwoAddressInstructions
- // pass. We want to avoid 128 bits copies as much as possible because they
- // can't be bundled by our scheduler.
- SDValue RegSeqArgs[9] = {
- CurDAG->getTargetConstant(RegClassID, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
- SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
- };
+
+ assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+ "supported yet");
+ // 16 = Max Num Vector Elements
+ // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+ // 1 = Vector Register Class
+ SDValue RegSeqArgs[16 * 2 + 1];
+
+ RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
bool IsRegSeq = true;
for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ // XXX: Why is this here?
if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
IsRegSeq = false;
break;
}
- RegSeqArgs[2 * i + 1] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+ RegSeqArgs[1 + (2 * i) + 1] =
+ CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
}
if (!IsRegSeq)
break;
@@ -313,285 +309,44 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
SDLoc(N), N->getValueType(0), Ops);
}
-
- case ISD::ConstantFP:
- case ISD::Constant: {
+ case AMDGPUISD::REGISTER_LOAD: {
const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- // XXX: Custom immediate lowering not implemented yet. Instead we use
- // pseudo instructions defined in SIInstructions.td
- if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
- }
-
- uint64_t ImmValue = 0;
- unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
-
- if (N->getOpcode() == ISD::ConstantFP) {
- // XXX: 64-bit Immediates not supported yet
- assert(N->getValueType(0) != MVT::f64);
-
- ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
- APFloat Value = C->getValueAPF();
- float FloatValue = Value.convertToFloat();
- if (FloatValue == 0.0) {
- ImmReg = AMDGPU::ZERO;
- } else if (FloatValue == 0.5) {
- ImmReg = AMDGPU::HALF;
- } else if (FloatValue == 1.0) {
- ImmReg = AMDGPU::ONE;
- } else {
- ImmValue = Value.bitcastToAPInt().getZExtValue();
- }
- } else {
- // XXX: 64-bit Immediates not supported yet
- assert(N->getValueType(0) != MVT::i64);
-
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
- if (C->getZExtValue() == 0) {
- ImmReg = AMDGPU::ZERO;
- } else if (C->getZExtValue() == 1) {
- ImmReg = AMDGPU::ONE_INT;
- } else {
- ImmValue = C->getZExtValue();
- }
- }
-
- for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
- Use != SDNode::use_end(); Use = Next) {
- Next = llvm::next(Use);
- std::vector<SDValue> Ops;
- for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
- Ops.push_back(Use->getOperand(i));
- }
-
- if (!Use->isMachineOpcode()) {
- if (ImmReg == AMDGPU::ALU_LITERAL_X) {
- // We can only use literal constants (e.g. AMDGPU::ZERO,
- // AMDGPU::ONE, etc) in machine opcodes.
- continue;
- }
- } else {
- if (!TII->isALUInstr(Use->getMachineOpcode()) ||
- (TII->get(Use->getMachineOpcode()).TSFlags &
- R600_InstFlag::VECTOR)) {
- continue;
- }
-
- int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(),
- AMDGPU::OpName::literal);
- if (ImmIdx == -1) {
- continue;
- }
-
- if (TII->getOperandIdx(Use->getMachineOpcode(),
- AMDGPU::OpName::dst) != -1) {
- // subtract one from ImmIdx, because the DST operand is usually index
- // 0 for MachineInstrs, but we have no DST in the Ops vector.
- ImmIdx--;
- }
-
- // Check that we aren't already using an immediate.
- // XXX: It's possible for an instruction to have more than one
- // immediate operand, but this is not supported yet.
- if (ImmReg == AMDGPU::ALU_LITERAL_X) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
- assert(C);
-
- if (C->getZExtValue() != 0) {
- // This instruction is already using an immediate.
- continue;
- }
-
- // Set the immediate value
- Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
- }
- }
- // Set the immediate register
- Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
-
- CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
- }
- break;
- }
- }
- SDNode *Result = SelectCode(N);
-
- // Fold operands of selected node
-
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- const R600InstrInfo *TII =
- static_cast<const R600InstrInfo*>(TM.getInstrInfo());
- if (Result && Result->isMachineOpcode() && Result->getMachineOpcode() == AMDGPU::DOT_4) {
- bool IsModified = false;
- do {
- std::vector<SDValue> Ops;
- for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
- I != E; ++I)
- Ops.push_back(*I);
- IsModified = FoldDotOperands(Result->getMachineOpcode(), TII, Ops);
- if (IsModified) {
- Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
- }
- } while (IsModified);
-
- }
- if (Result && Result->isMachineOpcode() &&
- !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
- && TII->hasInstrModifiers(Result->getMachineOpcode())) {
- // Fold FNEG/FABS
- // TODO: Isel can generate multiple MachineInst, we need to recursively
- // parse Result
- bool IsModified = false;
- do {
- std::vector<SDValue> Ops;
- for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
- I != E; ++I)
- Ops.push_back(*I);
- IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
- if (IsModified) {
- Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
- }
- } while (IsModified);
-
- // If node has a single use which is CLAMP_R600, folds it
- if (Result->hasOneUse() && Result->isMachineOpcode()) {
- SDNode *PotentialClamp = *Result->use_begin();
- if (PotentialClamp->isMachineOpcode() &&
- PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
- unsigned ClampIdx =
- TII->getOperandIdx(Result->getMachineOpcode(), AMDGPU::OpName::clamp);
- std::vector<SDValue> Ops;
- unsigned NumOp = Result->getNumOperands();
- for (unsigned i = 0; i < NumOp; ++i) {
- Ops.push_back(Result->getOperand(i));
- }
- Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
- Result = CurDAG->SelectNodeTo(PotentialClamp,
- Result->getMachineOpcode(), PotentialClamp->getVTList(),
- Ops.data(), NumOp);
- }
- }
- }
+ SDValue Addr, Offset;
+
+ SelectADDRIndirect(N->getOperand(1), Addr, Offset);
+ const SDValue Ops[] = {
+ Addr,
+ Offset,
+ CurDAG->getTargetConstant(0, MVT::i32),
+ N->getOperand(0),
+ };
+ return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
+ CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
+ Ops);
}
-
- return Result;
-}
-
-bool AMDGPUDAGToDAGISel::FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg,
- SDValue &Abs, const R600InstrInfo *TII) {
- switch (Src.getOpcode()) {
- case ISD::FNEG:
- Src = Src.getOperand(0);
- Neg = CurDAG->getTargetConstant(1, MVT::i32);
- return true;
- case ISD::FABS:
- if (!Abs.getNode())
- return false;
- Src = Src.getOperand(0);
- Abs = CurDAG->getTargetConstant(1, MVT::i32);
- return true;
- case ISD::BITCAST:
- Src = Src.getOperand(0);
- return true;
- default:
- return false;
+ case AMDGPUISD::REGISTER_STORE: {
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ break;
+ SDValue Addr, Offset;
+ SelectADDRIndirect(N->getOperand(2), Addr, Offset);
+ const SDValue Ops[] = {
+ N->getOperand(1),
+ Addr,
+ Offset,
+ CurDAG->getTargetConstant(0, MVT::i32),
+ N->getOperand(0),
+ };
+ return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
+ CurDAG->getVTList(MVT::Other),
+ Ops);
}
-}
-
-bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
- const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
- int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
- };
- int SelIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_sel)
- };
- int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
- };
- int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
- -1
- };
-
-
- for (unsigned i = 0; i < 3; i++) {
- if (OperandIdx[i] < 0)
- return false;
- SDValue &Src = Ops[OperandIdx[i] - 1];
- SDValue &Sel = Ops[SelIdx[i] - 1];
- SDValue &Neg = Ops[NegIdx[i] - 1];
- SDValue FakeAbs;
- SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
- if (FoldOperand(Src, Sel, Neg, Abs, TII))
- return true;
}
- return false;
+ return SelectCode(N);
}
-bool AMDGPUDAGToDAGISel::FoldDotOperands(unsigned Opcode,
- const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
- int OperandIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
- };
- int SelIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_sel_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_sel_W)
- };
- int NegIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
- };
- int AbsIdx[] = {
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
- };
-
- for (unsigned i = 0; i < 8; i++) {
- if (OperandIdx[i] < 0)
- return false;
- SDValue &Src = Ops[OperandIdx[i] - 1];
- SDValue &Sel = Ops[SelIdx[i] - 1];
- SDValue &Neg = Ops[NegIdx[i] - 1];
- SDValue &Abs = Ops[AbsIdx[i] - 1];
- if (FoldOperand(Src, Sel, Neg, Abs, TII))
- return true;
- }
- return false;
-}
bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
if (!ptr) {
@@ -804,26 +559,27 @@ bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
-
- if (Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- return;
- }
-
- // Go over all selected nodes and try to fold them a bit more
const AMDGPUTargetLowering& Lowering =
(*(const AMDGPUTargetLowering*)getTargetLowering());
- for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
- E = CurDAG->allnodes_end(); I != E; ++I) {
+ bool IsModified = false;
+ do {
+ IsModified = false;
+ // Go over all selected nodes and try to fold them a bit more
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end(); I != E; ++I) {
- SDNode *Node = I;
+ SDNode *Node = I;
- MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
- if (!MachineNode)
- continue;
+ MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+ if (!MachineNode)
+ continue;
- SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
- if (ResNode != Node) {
- ReplaceUses(Node, ResNode);
+ SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+ if (ResNode != Node) {
+ ReplaceUses(Node, ResNode);
+ IsModified = true;
+ }
}
- }
+ CurDAG->RemoveDeadNodes();
+ } while (IsModified);
}
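
In the BUILD_VECTOR selection above, the node becomes a single REG_SEQUENCE whose operand list is the register-class ID followed by one (value, sub-register index) pair per element, which is why RegSeqArgs reserves 16 * 2 + 1 slots. A minimal sketch of that operand layout for a hypothetical 4-element vector, using string placeholders instead of SDValues (editorial illustration, not the DAG API):

    #include <cstdio>

    // Stand-in for an SDValue operand; purely illustrative.
    struct Operand { const char *Desc; };

    int main() {
      const unsigned NumElts = 4;
      // 1 slot for the register-class ID + 2 slots (value, subreg index) per
      // element, with room for up to 16 elements, mirroring RegSeqArgs.
      Operand RegSeqArgs[16 * 2 + 1];
      RegSeqArgs[0] = {"SReg_128RegClassID"};
      const char *Vals[NumElts]    = {"x", "y", "z", "w"};
      const char *SubRegs[NumElts] = {"sub0", "sub1", "sub2", "sub3"};
      for (unsigned i = 0; i < NumElts; ++i) {
        RegSeqArgs[1 + 2 * i]     = {Vals[i]};    // element value
        RegSeqArgs[1 + 2 * i + 1] = {SubRegs[i]}; // its sub-register index
      }
      for (unsigned i = 0; i < 1 + 2 * NumElts; ++i)
        std::printf("op%u: %s\n", i, RegSeqArgs[i].Desc);
      return 0;
    }
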
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index efd2756..c4d75ff 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILIntrinsicInfo.h"
@@ -28,6 +29,14 @@
#include "llvm/IR/DataLayout.h"
using namespace llvm;
+static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() / 8, ArgFlags.getOrigAlign());
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+ return true;
+}
#include "AMDGPUGenCallingConv.inc"
@@ -49,6 +58,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::FABS, MVT::f32, Legal);
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::f32, Legal);
// The hardware supports ROTR, but not ROTL
setOperationAction(ISD::ROTL, MVT::i32, Expand);
@@ -64,9 +74,29 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
setOperationAction(ISD::STORE, MVT::f64, Promote);
AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+ // Custom lowering of vector stores is required for local address space
+ // stores.
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ // XXX: Native v2i32 local address space stores are possible, but not
+ // currently implemented.
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ // XXX: This can be changed to Custom, once ExpandVectorStores can
+ // handle 64-bit stores.
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
setOperationAction(ISD::LOAD, MVT::f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
@@ -76,15 +106,38 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
setOperationAction(ISD::LOAD, MVT::f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Expand);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
@@ -93,14 +146,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
- static const int types[] = {
- (int)MVT::v2i32,
- (int)MVT::v4i32
+ static const MVT::SimpleValueType IntTypes[] = {
+ MVT::v2i32, MVT::v4i32
};
- const size_t NumTypes = array_lengthof(types);
+ const size_t NumIntTypes = array_lengthof(IntTypes);
- for (unsigned int x = 0; x < NumTypes; ++x) {
- MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+ for (unsigned int x = 0; x < NumIntTypes; ++x) {
+ MVT::SimpleValueType VT = IntTypes[x];
//Expand the following operations for the current type by default
setOperationAction(ISD::ADD, VT, Expand);
setOperationAction(ISD::AND, VT, Expand);
@@ -119,6 +171,23 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::XOR, VT, Expand);
}
+
+ static const MVT::SimpleValueType FloatTypes[] = {
+ MVT::v2f32, MVT::v4f32
+ };
+ const size_t NumFloatTypes = array_lengthof(FloatTypes);
+
+ for (unsigned int x = 0; x < NumFloatTypes; ++x) {
+ MVT::SimpleValueType VT = FloatTypes[x];
+ setOperationAction(ISD::FABS, VT, Expand);
+ setOperationAction(ISD::FADD, VT, Expand);
+ setOperationAction(ISD::FDIV, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FMUL, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FSQRT, VT, Expand);
+ setOperationAction(ISD::FSUB, VT, Expand);
+ }
}
//===----------------------------------------------------------------------===//
@@ -129,6 +198,18 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
return MVT::i32;
}
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+ EVT CastTy) const {
+ if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+ return true;
+
+ unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+ unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+ return ((LScalarSize <= CastScalarSize) ||
+ (CastScalarSize >= 32) ||
+ (LScalarSize < 32));
+}
//===---------------------------------------------------------------------===//
// Target Properties
@@ -182,8 +263,12 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
// AMDGPU DAG lowering
+ case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
}
return Op;
}
@@ -194,18 +279,82 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const DataLayout *TD = getTargetMachine().getDataLayout();
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+
+ assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS);
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
- unsigned Offset = MFI->LDSSize;
const GlobalValue *GV = G->getGlobal();
- uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
+ unsigned Offset;
+ if (MFI->LocalMemoryObjects.count(GV) == 0) {
+ uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+ Offset = MFI->LDSSize;
+ MFI->LocalMemoryObjects[GV] = Offset;
+ // XXX: Account for alignment?
+ MFI->LDSSize += Size;
+ } else {
+ Offset = MFI->LocalMemoryObjects[GV];
+ }
+
+ return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+}
+
+void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Args,
+ unsigned Start,
+ unsigned Count) const {
+ EVT VT = Op.getValueType();
+ for (unsigned i = Start, e = Start + Count; i != e; ++i) {
+ Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+ VT.getVectorElementType(),
+ Op, DAG.getConstant(i, MVT::i32)));
+ }
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+ SelectionDAG &DAG) const {
+ SmallVector<SDValue, 8> Args;
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ ExtractVectorElements(A, DAG, Args, 0,
+ A.getValueType().getVectorNumElements());
+ ExtractVectorElements(B, DAG, Args, 0,
+ B.getValueType().getVectorNumElements());
+
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+ &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ SmallVector<SDValue, 8> Args;
+ EVT VT = Op.getValueType();
+ unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
+ VT.getVectorNumElements());
+
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+ &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
+ SelectionDAG &DAG) const {
- return DAG.getConstant(Offset, TD->getPointerSize() == 8 ? MVT::i64 : MVT::i32);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPUFrameLowering *TFL =
+ static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+ FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+ assert(FIN);
+
+ unsigned FrameIndex = FIN->getIndex();
+ unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+ return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
+ Op.getValueType());
}
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -335,7 +484,122 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
return Op;
}
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
+ SelectionDAG &DAG) const {
+ LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+ EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+ EVT EltVT = Op.getValueType().getVectorElementType();
+ EVT PtrVT = Load->getBasePtr().getValueType();
+ unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
+ SmallVector<SDValue, 8> Loads;
+ SDLoc SL(Op);
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
+ DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+ Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+ Load->getChain(), Ptr,
+ MachinePointerInfo(Load->getMemOperand()->getValue()),
+ MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+ Load->getAlignment()));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), &Loads[0],
+ Loads.size());
+}
+
+SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
+ SelectionDAG &DAG) const {
+ StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+ EVT MemVT = Store->getMemoryVT();
+ unsigned MemBits = MemVT.getSizeInBits();
+
+ // Byte stores are really expensive, so if possible, try to pack a
+ // 32-bit vector truncating store into an i32 store.
+ // XXX: We could also optimize other vector bitwidths.
+ if (!MemVT.isVector() || MemBits > 32) {
+ return SDValue();
+ }
+
+ SDLoc DL(Op);
+ const SDValue &Value = Store->getValue();
+ EVT VT = Value.getValueType();
+ const SDValue &Ptr = Store->getBasePtr();
+ EVT MemEltVT = MemVT.getVectorElementType();
+ unsigned MemEltBits = MemEltVT.getSizeInBits();
+ unsigned MemNumElements = MemVT.getVectorNumElements();
+ EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ SDValue Mask;
+ switch(MemEltBits) {
+ case 8:
+ Mask = DAG.getConstant(0xFF, PackedVT);
+ break;
+ case 16:
+ Mask = DAG.getConstant(0xFFFF, PackedVT);
+ break;
+ default:
+ llvm_unreachable("Cannot lower this vector store");
+ }
+ SDValue PackedValue;
+ for (unsigned i = 0; i < MemNumElements; ++i) {
+ EVT ElemVT = VT.getVectorElementType();
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+ DAG.getConstant(i, MVT::i32));
+ Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
+ Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
+ SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
+ Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+ if (i == 0) {
+ PackedValue = Elt;
+ } else {
+ PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+ }
+ }
+ return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+ MachinePointerInfo(Store->getMemOperand()->getValue()),
+ Store->isVolatile(), Store->isNonTemporal(),
+ Store->getAlignment());
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+ SelectionDAG &DAG) const {
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
+ EVT EltVT = Store->getValue().getValueType().getVectorElementType();
+ EVT PtrVT = Store->getBasePtr().getValueType();
+ unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
+ SDLoc SL(Op);
+
+ SmallVector<SDValue, 8> Chains;
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Store->getValue(), DAG.getConstant(i, MVT::i32));
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
+ Store->getBasePtr(),
+ DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
+ PtrVT));
+ Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+ MachinePointerInfo(Store->getMemOperand()->getValue()),
+ MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
+ Store->getAlignment()));
+ }
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
+}
+SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
+ if (Result.getNode()) {
+ return Result;
+ }
+
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ Store->getValue().getValueType().isVector()) {
+ return SplitVectorStore(Op, DAG);
+ }
+ return SDValue();
+}
SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SelectionDAG &DAG) const {
@@ -392,13 +656,13 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
DAG.getConstant(-1, VT),
DAG.getConstant(0, VT),
- ISD::SETGE);
- // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
- SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
- DAG.getConstant(0, VT),
+ ISD::SETUGE);
+ // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+ SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+ Num_S_Remainder,
DAG.getConstant(-1, VT),
DAG.getConstant(0, VT),
- ISD::SETGE);
+ ISD::SETUGE);
// Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
Remainder_GE_Zero);
@@ -442,10 +706,62 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Ops, 2, DL);
}
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue S0 = Op.getOperand(0);
+ SDLoc DL(Op);
+ if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+ return SDValue();
+
+ // f32 uint_to_fp i64
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+ DAG.getConstant(0, MVT::i32));
+ SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+ DAG.getConstant(1, MVT::i32));
+ SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+ FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
+ DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
+ return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
+
+}
+
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
+void AMDGPUTargetLowering::getOriginalFunctionArgs(
+ SelectionDAG &DAG,
+ const Function *F,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<ISD::InputArg> &OrigIns) const {
+
+ for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+ if (Ins[i].ArgVT == Ins[i].VT) {
+ OrigIns.push_back(Ins[i]);
+ continue;
+ }
+
+ EVT VT;
+ if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
+ // Vector has been split into scalars.
+ VT = Ins[i].ArgVT.getVectorElementType();
+ } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
+ Ins[i].ArgVT.getVectorElementType() !=
+ Ins[i].VT.getVectorElementType()) {
+ // Vector elements have been promoted
+ VT = Ins[i].ArgVT;
+ } else {
+ // Vector has been split into smaller vectors.
+ VT = Ins[i].VT;
+ }
+
+ ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
+ Ins[i].OrigArgIndex, Ins[i].PartOffset);
+ OrigIns.push_back(Arg);
+ }
+}
+
bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
return CFP->isExactlyValue(1.0);
@@ -507,5 +823,13 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
+ NODE_NAME_CASE(LOAD_CONSTANT)
+ NODE_NAME_CASE(LOAD_INPUT)
+ NODE_NAME_CASE(SAMPLE)
+ NODE_NAME_CASE(SAMPLEB)
+ NODE_NAME_CASE(SAMPLED)
+ NODE_NAME_CASE(SAMPLEL)
+ NODE_NAME_CASE(STORE_MSKOR)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
}
}
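
LowerUINT_TO_FP above builds the f32 result as float(hi32) * 2^32 + float(lo32). A minimal scalar model of that arithmetic (editorial illustration; uint64ToFloat is only a stand-in for the emitted DAG):

    #include <cassert>
    #include <cstdint>

    // Scalar model of the i64 -> f32 lowering: split into 32-bit halves,
    // convert each, and scale the high half by 2^32 before adding.
    static float uint64ToFloat(uint64_t X) {
      uint32_t Lo = static_cast<uint32_t>(X);
      uint32_t Hi = static_cast<uint32_t>(X >> 32);
      return static_cast<float>(Hi) * 4294967296.0f // 2^32
           + static_cast<float>(Lo);
    }

    int main() {
      assert(uint64ToFloat(0x100000000ull) == 4294967296.0f);
      assert(uint64ToFloat(42) == 42.0f);
      return 0;
    }
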
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index f614e23..2dfd3cf 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -25,8 +25,20 @@ class MachineRegisterInfo;
class AMDGPUTargetLowering : public TargetLowering {
private:
+ void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Args,
+ unsigned Start, unsigned Count) const;
+ SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ /// \brief Lower vector stores by merging the vector elements into an integer
+ /// of the same bitwidth.
+ SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
+ /// \brief Split a vector store into multiple scalar stores.
+ /// \returns The resulting chain.
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
protected:
@@ -39,10 +51,23 @@ protected:
unsigned Reg, EVT VT) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const;
-
+ /// \brief Split a vector load into multiple scalar loads.
+ SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
+ SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
bool isHWTrueValue(SDValue Op) const;
bool isHWFalseValue(SDValue Op) const;
+ /// The SelectionDAGBuilder will automatically promote function arguments
+ /// with illegal types. However, this does not work for the AMDGPU targets
+ /// since the function arguments are stored in memory as these illegal types.
+ /// In order to handle this properly we need to get the original type sizes
+ /// from the LLVM IR Function and fix up the ISD::InputArg values before
+ /// passing them to AnalyzeFormalArguments().
+ void getOriginalFunctionArgs(SelectionDAG &DAG,
+ const Function *F,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ SmallVectorImpl<ISD::InputArg> &OrigIns) const;
void AnalyzeFormalArguments(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
@@ -52,6 +77,7 @@ public:
virtual bool isFAbsFree(EVT VT) const;
virtual bool isFNegFree(EVT VT) const;
virtual MVT getVectorIdxTy() const;
+ virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -139,6 +165,15 @@ enum {
CONST_ADDRESS,
REGISTER_LOAD,
REGISTER_STORE,
+ LOAD_INPUT,
+ SAMPLE,
+ SAMPLEB,
+ SAMPLED,
+ SAMPLEL,
+ FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ STORE_MSKOR,
+ LOAD_CONSTANT,
+ TBUFFER_STORE_FORMAT,
LAST_AMDGPU_ISD_NUMBER
};
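
MergeVectorStore, declared here and defined earlier in the patch, replaces a narrow truncating vector store with a single i32 store by masking, shifting, and OR-ing the elements together. A minimal scalar sketch of the packing for a hypothetical v4i8 value (editorial illustration; packV4I8 is not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Pack four 8-bit elements into one 32-bit word, element 0 in the lowest
    // byte, mirroring the shift/OR loop in MergeVectorStore.
    static uint32_t packV4I8(const uint8_t Elts[4]) {
      uint32_t Packed = 0;
      for (unsigned i = 0; i < 4; ++i) {
        uint32_t Elt = Elts[i] & 0xFF; // mask to the element width
        Packed |= Elt << (8 * i);      // shift into its byte lane
      }
      return Packed;
    }

    int main() {
      const uint8_t V[4] = {0x01, 0x02, 0x03, 0x04};
      assert(packV4I8(V) == 0x04030201u); // one i32 store instead of four i8 stores
      return 0;
    }
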
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
deleted file mode 100644
index 3ce3ecf..0000000
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Instructions can use indirect addressing to index the register file as if it
-/// were memory. This pass lowers RegisterLoad and RegisterStore instructions
-/// to either a COPY or a MOV that uses indirect addressing.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-namespace {
-
-class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
-
-private:
- static char ID;
- const AMDGPUInstrInfo *TII;
-
- bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
-
-public:
- AMDGPUIndirectAddressingPass(TargetMachine &tm) :
- MachineFunctionPass(ID),
- TII(0)
- { }
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- const char *getPassName() const { return "R600 Handle indirect addressing"; }
-
-};
-
-} // End anonymous namespace
-
-char AMDGPUIndirectAddressingPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
- return new AMDGPUIndirectAddressingPass(tm);
-}
-
-bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- TII = static_cast<const AMDGPUInstrInfo*>(MF.getTarget().getInstrInfo());
-
- int IndirectBegin = TII->getIndirectIndexBegin(MF);
- int IndirectEnd = TII->getIndirectIndexEnd(MF);
-
- if (IndirectBegin == -1) {
- // No indirect addressing, we can skip this pass
- assert(IndirectEnd == -1);
- return false;
- }
-
- // The map keeps track of the indirect address that is represented by
- // each virtual register. The key is the register and the value is the
- // indirect address it uses.
- std::map<unsigned, unsigned> RegisterAddressMap;
-
- // First pass - Lower all of the RegisterStore instructions and track which
- // registers are live.
- for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
- BB != BB_E; ++BB) {
- // This map keeps track of the current live indirect registers.
- // The key is the address and the value is the register
- std::map<unsigned, unsigned> LiveAddressRegisterMap;
- MachineBasicBlock &MBB = *BB;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
- I != MBB.end(); I = Next) {
- Next = llvm::next(I);
- MachineInstr &MI = *I;
-
- if (!TII->isRegisterStore(MI)) {
- continue;
- }
-
- // Lower RegisterStore
-
- unsigned RegIndex = MI.getOperand(2).getImm();
- unsigned Channel = MI.getOperand(3).getImm();
- unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
- const TargetRegisterClass *IndirectStoreRegClass =
- TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg());
-
- if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
- // Direct register access.
- unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
-
- BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
- .addOperand(MI.getOperand(0));
-
- RegisterAddressMap[DstReg] = Address;
- LiveAddressRegisterMap[Address] = DstReg;
- } else {
- // Indirect register access.
- MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I,
- MI.getOperand(0).getReg(), // Value
- Address,
- MI.getOperand(1).getReg()); // Offset
- for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
- unsigned Addr = TII->calculateIndirectAddress(i, Channel);
- unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
- MOV.addReg(DstReg, RegState::Define | RegState::Implicit);
- RegisterAddressMap[DstReg] = Addr;
- LiveAddressRegisterMap[Addr] = DstReg;
- }
- }
- MI.eraseFromParent();
- }
-
- // Update the live-ins of the succesor blocks
- for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
- SuccEnd = MBB.succ_end();
- SuccEnd != Succ; ++Succ) {
- std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
- for (Key = LiveAddressRegisterMap.begin(),
- KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
- (*Succ)->addLiveIn(Key->second);
- }
- }
- }
-
- // Second pass - Lower the RegisterLoad instructions
- for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
- BB != BB_E; ++BB) {
- // Key is the address and the value is the register
- std::map<unsigned, unsigned> LiveAddressRegisterMap;
- MachineBasicBlock &MBB = *BB;
-
- MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
- while (LI != MBB.livein_end()) {
- std::vector<unsigned> PhiRegisters;
-
- // Make sure this live in is used for indirect addressing
- if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
- ++LI;
- continue;
- }
-
- unsigned Address = RegisterAddressMap[*LI];
- LiveAddressRegisterMap[Address] = *LI;
- PhiRegisters.push_back(*LI);
-
- // Check if there are other live in registers which map to the same
- // indirect address.
- for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
- LE = MBB.livein_end();
- LJ != LE; ++LJ) {
- unsigned Reg = *LJ;
- if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
- continue;
- }
-
- if (RegisterAddressMap[Reg] == Address) {
- PhiRegisters.push_back(Reg);
- }
- }
-
- if (PhiRegisters.size() == 1) {
- // We don't need to insert a Phi instruction, so we can just add the
- // registers to the live list for the block.
- LiveAddressRegisterMap[Address] = *LI;
- MBB.removeLiveIn(*LI);
- } else {
- // We need to insert a PHI, because we have the same address being
- // written in multiple predecessor blocks.
- const TargetRegisterClass *PhiDstClass =
- TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
- unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
- MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
- MBB.findDebugLoc(MBB.begin()),
- TII->get(AMDGPU::PHI), PhiDstReg);
-
- for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
- RE = PhiRegisters.end();
- RI != RE; ++RI) {
- unsigned Reg = *RI;
- MachineInstr *DefInst = MRI.getVRegDef(Reg);
- assert(DefInst);
- MachineBasicBlock *RegBlock = DefInst->getParent();
- Phi.addReg(Reg);
- Phi.addMBB(RegBlock);
- MBB.removeLiveIn(Reg);
- }
- RegisterAddressMap[PhiDstReg] = Address;
- LiveAddressRegisterMap[Address] = PhiDstReg;
- }
- LI = MBB.livein_begin();
- }
-
- for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
- I != MBB.end(); I = Next) {
- Next = llvm::next(I);
- MachineInstr &MI = *I;
-
- if (!TII->isRegisterLoad(MI)) {
- if (MI.getOpcode() == AMDGPU::PHI) {
- continue;
- }
- // Check for indirect register defs
- for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands();
- OpIdx < NumOperands; ++OpIdx) {
- MachineOperand &MO = MI.getOperand(OpIdx);
- if (MO.isReg() && MO.isDef() &&
- RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) {
- unsigned Reg = MO.getReg();
- unsigned LiveAddress = RegisterAddressMap[Reg];
- // Chain the live-ins
- if (LiveAddressRegisterMap.find(LiveAddress) !=
- LiveAddressRegisterMap.end()) {
- MI.addOperand(MachineOperand::CreateReg(
- LiveAddressRegisterMap[LiveAddress],
- false, // isDef
- true, // isImp
- true)); // isKill
- }
- LiveAddressRegisterMap[LiveAddress] = Reg;
- }
- }
- continue;
- }
-
- const TargetRegisterClass *SuperIndirectRegClass =
- TII->getSuperIndirectRegClass();
- const TargetRegisterClass *IndirectLoadRegClass =
- TII->getIndirectAddrLoadRegClass();
- unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass);
-
- unsigned RegIndex = MI.getOperand(2).getImm();
- unsigned Channel = MI.getOperand(3).getImm();
- unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
-
- if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
- // Direct register access
- unsigned Reg = LiveAddressRegisterMap[Address];
- unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
-
- if (regHasExplicitDef(MRI, Reg)) {
- // If the register we are reading from has an explicit def, then that
- // means it was written via a direct register access (i.e. COPY
- // or other instruction that doesn't use indirect addressing). In
- // this case we know where the value has been stored, so we can just
- // issue a copy.
- BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
- MI.getOperand(0).getReg())
- .addReg(Reg);
- } else {
- // If the register we are reading has an implicit def, then that
- // means it was written by an indirect register access (i.e. An
- // instruction that uses indirect addressing.
- BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
- MI.getOperand(0).getReg())
- .addReg(AddrReg)
- .addReg(Reg, RegState::Implicit);
- }
- } else {
- // Indirect register access
-
- // Note on REQ_SEQUENCE instructons: You can't actually use the register
- // it defines unless you have an instruction that takes the defined
- // register class as an operand.
-
- MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I),
- TII->get(AMDGPU::REG_SEQUENCE),
- IndirectReg);
- for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
- unsigned Addr = TII->calculateIndirectAddress(i, Channel);
- if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) {
- continue;
- }
- unsigned Reg = LiveAddressRegisterMap[Addr];
-
- // We only need to use REG_SEQUENCE for explicit defs, since the
- // register coalescer won't do anything with the implicit defs.
- if (!regHasExplicitDef(MRI, Reg)) {
- continue;
- }
-
- // Insert a REQ_SEQUENCE instruction to force the register allocator
- // to allocate the virtual register to the correct physical register.
- Sequence.addReg(LiveAddressRegisterMap[Addr]);
- Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr));
- }
- MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I,
- MI.getOperand(0).getReg(), // Value
- Address,
- MI.getOperand(1).getReg()); // Offset
-
-
-
- Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
- Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit);
-
- }
- MI.eraseFromParent();
- }
- }
- return false;
-}
-
-bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI,
- unsigned Reg) const {
- MachineInstr *DefInstr = MRI.getVRegDef(Reg);
-
- if (!DefInstr) {
- return false;
- }
-
- if (DefInstr->getOpcode() == AMDGPU::PHI) {
- bool Explicit = false;
- for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(),
- E = DefInstr->operands_end();
- I != E; ++I) {
- const MachineOperand &MO = *I;
- if (!MO.isReg() || MO.isDef()) {
- continue;
- }
-
- Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg());
- }
- return Explicit;
- }
-
- return DefInstr->getOperand(0).isReg() &&
- DefInstr->getOperand(0).getReg() == Reg;
-}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 61437e9..4f7084b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -20,15 +20,19 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#define GET_INSTRINFO_NAMED_OPS
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
using namespace llvm;
+
+// Pin the vtable to this file.
+void AMDGPUInstrInfo::anchor() {}
+
AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
- : AMDGPUGenInstrInfo(0,0), RI(tm), TM(tm) { }
+ : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { }
const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
return RI;
@@ -118,6 +122,55 @@ AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
assert(!"Not Implemented");
}
+bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ int OffsetOpIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::addr);
+ // addr is a custom operand with multiple MI operands, and only the
+ // first MI operand is given a name.
+ int RegOpIdx = OffsetOpIdx + 1;
+ int ChanOpIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::chan);
+
+ if (isRegisterLoad(*MI)) {
+ int DstOpIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+ unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+ unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+ unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+ if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+ getIndirectAddrRegClass()->getRegister(Address));
+ } else {
+ buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+ Address, OffsetReg);
+ }
+ } else if (isRegisterStore(*MI)) {
+ int ValOpIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val);
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+ unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+ unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+ unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+ if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+ buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+ MI->getOperand(ValOpIdx).getReg());
+ } else {
+ buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
+ calculateIndirectAddress(RegIndex, Channel),
+ OffsetReg);
+ }
+ } else {
+ return false;
+ }
+
+ MBB->erase(MI);
+ return true;
+}
+
+
MachineInstr *
AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
@@ -223,6 +276,57 @@ bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
}
+int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ int Offset = -1;
+
+ if (MFI->getNumObjects() == 0) {
+ return -1;
+ }
+
+ if (MRI.livein_empty()) {
+ return 0;
+ }
+
+ const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+ for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+ LE = MRI.livein_end();
+ LI != LE; ++LI) {
+ unsigned Reg = LI->first;
+ if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+ !IndirectRC->contains(Reg))
+ continue;
+
+ unsigned RegIndex;
+ unsigned RegEnd;
+ for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+ ++RegIndex) {
+ if (IndirectRC->getRegister(RegIndex) == Reg)
+ break;
+ }
+ Offset = std::max(Offset, (int)RegIndex);
+ }
+
+ return Offset + 1;
+}
+
+int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+ int Offset = 0;
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ // Variable sized objects are not supported
+ assert(!MFI->hasVarSizedObjects());
+
+ if (MFI->getNumObjects() == 0) {
+ return -1;
+ }
+
+ Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+
+ return getIndirectIndexBegin(MF) + Offset;
+}
+
void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
DebugLoc DL) const {
@@ -244,3 +348,12 @@ void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
}
}
}
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
+ switch (Channels) {
+ default: return Opcode;
+ case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
+ case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
+ case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+ }
+}
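
The getMaskedMIMGOp hunk just above only adds the mapping hook; how a caller would use it is not part of this patch. A minimal, hypothetical sketch (the caller name, the write-mask popcount, and the include path are assumptions for illustration only):

#include "AMDGPUInstrInfo.h"  // assumed include within the R600 target
#include <cstdint>

// Sketch only: pick the MIMG variant that writes just the channels named by a
// 4-bit write mask. getMaskedMIMGOp returns the original opcode when the
// channel count has no narrower variant (e.g. 0 or 4 channels).
static int pickMaskedMIMGOp(const llvm::AMDGPUInstrInfo &TII,
                            uint16_t Opcode, unsigned WriteMask) {
  unsigned Channels = 0;
  for (unsigned Mask = WriteMask & 0xF; Mask; Mask >>= 1)
    Channels += Mask & 1;  // population count of the write mask
  return TII.getMaskedMIMGOp(Opcode, Channels);
}
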
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 306f467..ce5b58c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -43,6 +43,7 @@ private:
const AMDGPURegisterInfo RI;
bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
MachineBasicBlock &MBB) const;
+ virtual void anchor();
protected:
TargetMachine &TM;
public:
@@ -87,6 +88,8 @@ public:
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const;
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
protected:
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
@@ -97,6 +100,14 @@ protected:
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const;
+ /// \returns the smallest register index that will be accessed by an indirect
+ /// read or write or -1 if indirect addressing is not used by this program.
+ virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+ /// \returns the largest register index that will be accessed by an indirect
+ /// read or write or -1 if indirect addressing is not used by this program.
+ virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+
public:
bool canFoldMemoryOperand(const MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops) const;
@@ -139,19 +150,9 @@ public:
  // Pure virtual functions to be implemented by sub-classes.
//===---------------------------------------------------------------------===//
- virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
- int64_t Imm) const = 0;
virtual unsigned getIEQOpcode() const = 0;
virtual bool isMov(unsigned opcode) const = 0;
- /// \returns the smallest register index that will be accessed by an indirect
- /// read or write or -1 if indirect addressing is not used by this program.
- virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
-
- /// \returns the largest register index that will be accessed by an indirect
- /// read or write or -1 if indirect addressing is not used by this program.
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
-
/// \brief Calculate the "Indirect Address" for the given \p RegIndex and
/// \p Channel
///
@@ -162,14 +163,9 @@ public:
virtual unsigned calculateIndirectAddress(unsigned RegIndex,
unsigned Channel) const = 0;
- /// \returns The register class to be used for storing values to an
- /// "Indirect Address" .
- virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
- unsigned SourceReg) const = 0;
-
- /// \returns The register class to be used for loading values from
- /// an "Indirect Address" .
- virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0;
+  /// \returns The register class to be used for loading values from and
+  /// storing values to an "Indirect Address".
+ virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0;
/// \brief Build instruction(s) for an indirect register write.
///
@@ -187,16 +183,21 @@ public:
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const = 0;
- /// \returns the register class whose sub registers are the set of all
- /// possible registers that can be used for indirect addressing.
- virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0;
-
/// \brief Convert the AMDIL MachineInstr to a supported ISA
/// MachineInstr
virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
DebugLoc DL) const;
+ /// \brief Build a MOV instruction.
+ virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, unsigned SrcReg) const = 0;
+
+ /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
+  /// equivalent opcode that writes \p Channels channels.
+ int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+
};
namespace AMDGPU {
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 48d89dd..fccede0 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -72,3 +72,17 @@ def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
[SDNPHasChain, SDNPMayStore]>;
+
+// MSKOR instructions are atomic memory instructions used mainly for storing
+// 8-bit and 16-bit values. The definition is:
+//
+// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
+//
+// src0: vec4(src, 0, 0, mask)
+// src1: dst - rat offset (aka pointer) in dwords
+def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
+ SDTypeProfile<0, 2, []>,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AMDGPUround : SDNode<"ISD::FROUND",
+ SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
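
A scalar C++ model of the MSKOR operation described in the comment above may make the semantics concrete. This is an illustration only (the helper names and the byte-store wrapper are assumptions), not code from the backend:

#include <cstdint>

// MEM[dst] = ((MEM[dst] & ~mask) | src), applied to one 32-bit dword.
static uint32_t mskor(uint32_t DstDword, uint32_t Mask, uint32_t Src) {
  return (DstDword & ~Mask) | Src;
}

// An 8-bit store of Value at byte offset ByteOff inside the dword is then
// src = Value << (8 * ByteOff) with mask = 0xFF << (8 * ByteOff).
static uint32_t storeByte(uint32_t DstDword, uint8_t Value, unsigned ByteOff) {
  return mskor(DstDword, 0xFFu << (8 * ByteOff),
               uint32_t(Value) << (8 * ByteOff));
}
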
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index d6a7759..3c5375d 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -35,46 +35,75 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
}
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
-def COND_EQ : PatLeaf <
+//===----------------------------------------------------------------------===//
+// PatLeafs for floating-point comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_OEQ : PatLeaf <
(cond),
- [{switch(N->get()){{default: return false;
- case ISD::SETOEQ: case ISD::SETUEQ:
- case ISD::SETEQ: return true;}}}]
+ [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
>;
-def COND_NE : PatLeaf <
+def COND_OGT : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
+>;
+
+def COND_OGE : PatLeaf <
+ (cond),
+ [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
+>;
+
+def COND_OLT : PatLeaf <
(cond),
- [{switch(N->get()){{default: return false;
- case ISD::SETONE: case ISD::SETUNE:
- case ISD::SETNE: return true;}}}]
+ [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
>;
-def COND_GT : PatLeaf <
+
+def COND_OLE : PatLeaf <
(cond),
- [{switch(N->get()){{default: return false;
- case ISD::SETOGT: case ISD::SETUGT:
- case ISD::SETGT: return true;}}}]
+ [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
>;
-def COND_GE : PatLeaf <
+def COND_UNE : PatLeaf <
(cond),
- [{switch(N->get()){{default: return false;
- case ISD::SETOGE: case ISD::SETUGE:
- case ISD::SETGE: return true;}}}]
+ [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
>;
-def COND_LT : PatLeaf <
+def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
+def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for unsigned comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
+def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
+def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
+def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for signed comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
+def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
+def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
+def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for integer equality
+//===----------------------------------------------------------------------===//
+
+def COND_EQ : PatLeaf <
(cond),
- [{switch(N->get()){{default: return false;
- case ISD::SETOLT: case ISD::SETULT:
- case ISD::SETLT: return true;}}}]
+ [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
>;
-def COND_LE : PatLeaf <
+def COND_NE : PatLeaf <
(cond),
- [{switch(N->get()){{default: return false;
- case ISD::SETOLE: case ISD::SETULE:
- case ISD::SETLE: return true;}}}]
+ [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
>;
def COND_NULL : PatLeaf <
@@ -96,6 +125,10 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
}]>;
+def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+ return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
return isGlobalLoad(dyn_cast<LoadSDNode>(N));
}]>;
@@ -108,8 +141,12 @@ def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
-def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
- return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+ return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+ return isLocalLoad(dyn_cast<LoadSDNode>(N));
}]>;
def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
@@ -132,6 +169,14 @@ def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
+def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+ return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+ return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
@@ -146,20 +191,55 @@ def az_extloadi32_constant : PatFrag<(ops node:$ptr),
return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
}]>;
-def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return isLocalLoad(dyn_cast<LoadSDNode>(N));
+def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei8 node:$val, node:$ptr), [{
+ return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei16 node:$val, node:$ptr), [{
+ return isGlobalStore(dyn_cast<StoreSDNode>(N));
}]>;
def local_store : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
- return isLocalStore(dyn_cast<StoreSDNode>(N));
+ return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei8 node:$val, node:$ptr), [{
+ return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr),
+ (truncstorei16 node:$val, node:$ptr), [{
+ return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
+ (atomic_load_add node:$ptr, node:$value), [{
+ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value),
+ (atomic_load_sub node:$ptr, node:$value), [{
+ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def mskor_global : PatFrag<(ops node:$val, node:$ptr),
+ (AMDGPUstore_mskor node:$val, node:$ptr), [{
+ return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
-int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
+int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
}
def CONST : Constants;
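
As a side note on FP_UINT_MAX_PLUS_1 above: 0x4f800000 is the IEEE-754 single-precision encoding of 2^32 (sign 0, exponent 127 + 32, zero mantissa), which is what the "1 << 32" comment refers to. A standalone check, for illustration only and not part of the patch:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t Bits = 0x4f800000u;  // exponent field 159 = 127 + 32
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  assert(F == 4294967296.0f);         // 2^32, i.e. UINT_MAX + 1 as a float
  return 0;
}
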
@@ -205,6 +285,8 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst <
multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
def RegisterLoad : AMDGPUShaderInst <
(outs dstClass:$dst),
(ins addrClass:$addr, i32imm:$chan),
@@ -223,6 +305,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
let isRegisterStore = 1;
}
}
+}
} // End isCodeGenOnly = 1, isPseudo = 1
@@ -254,61 +337,12 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
(INSERT_SUBREG $vec, $elem, sub_reg)
>;
-// Vector Build pattern
-class Vector1_Build <ValueType vecType, ValueType elemType,
- RegisterClass rc> : Pat <
- (vecType (build_vector elemType:$src)),
- (vecType (COPY_TO_REGCLASS $src, rc))
->;
-
-class Vector2_Build <ValueType vecType, ValueType elemType> : Pat <
- (vecType (build_vector elemType:$sub0, elemType:$sub1)),
- (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1)
->;
-
class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
(vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
(INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
(vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
>;
-class Vector8_Build <ValueType vecType, ValueType elemType> : Pat <
- (vecType (build_vector elemType:$sub0, elemType:$sub1,
- elemType:$sub2, elemType:$sub3,
- elemType:$sub4, elemType:$sub5,
- elemType:$sub6, elemType:$sub7)),
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
- $sub2, sub2), $sub3, sub3),
- $sub4, sub4), $sub5, sub5),
- $sub6, sub6), $sub7, sub7)
->;
-
-class Vector16_Build <ValueType vecType, ValueType elemType> : Pat <
- (vecType (build_vector elemType:$sub0, elemType:$sub1,
- elemType:$sub2, elemType:$sub3,
- elemType:$sub4, elemType:$sub5,
- elemType:$sub6, elemType:$sub7,
- elemType:$sub8, elemType:$sub9,
- elemType:$sub10, elemType:$sub11,
- elemType:$sub12, elemType:$sub13,
- elemType:$sub14, elemType:$sub15)),
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
- (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
- $sub2, sub2), $sub3, sub3),
- $sub4, sub4), $sub5, sub5),
- $sub6, sub6), $sub7, sub7),
- $sub8, sub8), $sub9, sub9),
- $sub10, sub10), $sub11, sub11),
- $sub12, sub12), $sub13, sub13),
- $sub14, sub14), $sub15, sub15)
->;
-
// XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
// can handle COPY instructions.
// bitconvert pattern
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 1dc1c65..0ed598e 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -15,14 +15,19 @@
#include "AMDGPUMCInstLower.h"
#include "AMDGPUAsmPrinter.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
using namespace llvm;
@@ -69,15 +74,45 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MachineBasicBlock::const_instr_iterator I = MI;
++I;
while (I != MBB->end() && I->isInsideBundle()) {
- MCInst MCBundleInst;
- const MachineInstr *BundledInst = I;
- MCInstLowering.lower(BundledInst, MCBundleInst);
- OutStreamer.EmitInstruction(MCBundleInst);
+ EmitInstruction(I);
++I;
}
} else {
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
OutStreamer.EmitInstruction(TmpInst);
+
+ if (DisasmEnabled) {
+ // Disassemble instruction/operands to text.
+ DisasmLines.resize(DisasmLines.size() + 1);
+ std::string &DisasmLine = DisasmLines.back();
+ raw_string_ostream DisasmStream(DisasmLine);
+
+ AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(),
+ *TM.getRegisterInfo());
+ InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
+
+ // Disassemble instruction/operands to hex representation.
+ SmallVector<MCFixup, 4> Fixups;
+ SmallVector<char, 16> CodeBytes;
+ raw_svector_ostream CodeStream(CodeBytes);
+
+ MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
+ MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
+ InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups);
+ CodeStream.flush();
+
+ HexLines.resize(HexLines.size() + 1);
+ std::string &HexLine = HexLines.back();
+ raw_string_ostream HexStream(HexLine);
+
+ for (size_t i = 0; i < CodeBytes.size(); i += 4) {
+ unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i];
+ HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
+ }
+
+ DisasmStream.flush();
+ DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
+ }
}
}
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index f2342b0..14171f4 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -6,6 +6,9 @@ using namespace llvm;
static const char *const ShaderTypeAttribute = "ShaderType";
+// Pin the vtable to this file.
+void AMDGPUMachineFunction::anchor() {}
+
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo() {
ShaderType = ShaderType::COMPUTE;
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 789b96a..fea0b39 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -14,13 +14,18 @@
#define AMDGPUMACHINEFUNCTION_H
#include "llvm/CodeGen/MachineFunction.h"
+#include <map>
namespace llvm {
class AMDGPUMachineFunction : public MachineFunctionInfo {
+ virtual void anchor();
public:
AMDGPUMachineFunction(const MachineFunction &MF);
unsigned ShaderType;
+ /// A map to keep track of local memory objects and their offsets within
+ /// the local memory space.
+ std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
};
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 3402092..47617a7 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -46,27 +46,21 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return 0;
}
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+ static const unsigned SubRegs[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+ AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+ AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+ AMDGPU::sub15
+ };
+
+ assert (Channel < array_lengthof(SubRegs));
+ return SubRegs[Channel];
+}
+
unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
- switch(IndirectIndex) {
- case 0: return AMDGPU::sub0;
- case 1: return AMDGPU::sub1;
- case 2: return AMDGPU::sub2;
- case 3: return AMDGPU::sub3;
- case 4: return AMDGPU::sub4;
- case 5: return AMDGPU::sub5;
- case 6: return AMDGPU::sub6;
- case 7: return AMDGPU::sub7;
- case 8: return AMDGPU::sub8;
- case 9: return AMDGPU::sub9;
- case 10: return AMDGPU::sub10;
- case 11: return AMDGPU::sub11;
- case 12: return AMDGPU::sub12;
- case 13: return AMDGPU::sub13;
- case 14: return AMDGPU::sub14;
- case 15: return AMDGPU::sub15;
- default: llvm_unreachable("indirect index out of range");
- }
+ return getSubRegFromChannel(IndirectIndex);
}
#define GET_REGINFO_TARGET_DESC
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 7cbd34b..688e1a0 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -50,6 +50,14 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
assert(!"Unimplemented"); return NULL;
}
+ virtual unsigned getHWRegIndex(unsigned Reg) const {
+ assert(!"Unimplemented"); return 0;
+ }
+
+ /// \returns the sub reg enum value for the given \p Channel
+ /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+ unsigned getSubRegFromChannel(unsigned Channel) const;
+
const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 8ed5a74..061793a 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
-#include <stdio.h>
using namespace llvm;
@@ -37,6 +36,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
Gen = AMDGPUSubtarget::R600;
FP64 = false;
CaymanISA = false;
+ EnableIRStructurizer = true;
+ EnableIfCvt = true;
ParseSubtargetFeatures(GPU, FS);
DevName = GPU;
}
@@ -66,6 +67,14 @@ AMDGPUSubtarget::hasCaymanISA() const {
return CaymanISA;
}
bool
+AMDGPUSubtarget::IsIRStructurizerEnabled() const {
+ return EnableIRStructurizer;
+}
+bool
+AMDGPUSubtarget::isIfCvtEnabled() const {
+ return EnableIfCvt;
+}
+bool
AMDGPUSubtarget::isTargetELF() const {
return false;
}
@@ -98,6 +107,10 @@ AMDGPUSubtarget::getDataLayout() const {
DataLayout.append("-p:32:32:32");
}
+ if (Gen >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ DataLayout.append("-p3:32:32:32");
+ }
+
return DataLayout;
}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 8c65096..4288d27 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -33,7 +33,8 @@ public:
R700,
EVERGREEN,
NORTHERN_ISLANDS,
- SOUTHERN_ISLANDS
+ SOUTHERN_ISLANDS,
+ SEA_ISLANDS
};
private:
@@ -48,6 +49,8 @@ private:
enum Generation Gen;
bool FP64;
bool CaymanISA;
+ bool EnableIRStructurizer;
+ bool EnableIfCvt;
InstrItineraryData InstrItins;
@@ -63,6 +66,12 @@ public:
enum Generation getGeneration() const;
bool hasHWFP64() const;
bool hasCaymanISA() const;
+ bool IsIRStructurizerEnabled() const;
+ bool isIfCvtEnabled() const;
+
+ virtual bool enableMachineScheduler() const {
+ return getGeneration() <= NORTHERN_ISLANDS;
+ }
// Helper functions to simplify if statements
bool isTargetELF() const;
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 5ebc5f2..bc4f5d7 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -33,6 +33,7 @@
#include "llvm/Transforms/Scalar.h"
#include <llvm/CodeGen/Passes.h>
+
using namespace llvm;
extern "C" void LLVMInitializeR600Target() {
@@ -58,8 +59,9 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
Subtarget(TT, CPU, FS),
Layout(Subtarget.getDataLayout()),
- FrameLowering(TargetFrameLowering::StackGrowsUp, 16 // Stack Alignment
- , 0),
+ FrameLowering(TargetFrameLowering::StackGrowsUp,
+ 64 * 16 // Maximum stack alignment (long16)
+ , 0),
IntrinsicInfo(this),
InstrItins(&Subtarget.getInstrItineraryData()) {
// TLInfo uses InstrInfo so it must be initialized after.
@@ -80,17 +82,20 @@ namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- enablePass(&MachineSchedulerID);
- MachineSchedRegistry::setDefault(createR600MachineScheduler);
- }
- }
+ : TargetPassConfig(TM, PM) {}
AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
return getTM<AMDGPUTargetMachine>();
}
+
+ virtual ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ return createR600MachineScheduler(C);
+ return 0;
+ }
+
virtual bool addPreISel();
virtual bool addInstSelector();
virtual bool addPreRegAlloc();
@@ -120,8 +125,11 @@ bool
AMDGPUPassConfig::addPreISel() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
addPass(createFlattenCFGPass());
- if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (ST.IsIRStructurizerEnabled())
addPass(createStructurizeCFGPass());
+ if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ addPass(createSinkingPass());
+ addPass(createSITypeRewriter());
addPass(createSIAnnotateControlFlowPass());
} else {
addPass(createR600TextureIntrinsicsReplacer());
@@ -131,12 +139,6 @@ AMDGPUPassConfig::addPreISel() {
bool AMDGPUPassConfig::addInstSelector() {
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
-
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- // This callbacks this pass uses are not implemented yet on SI.
- addPass(createAMDGPUIndirectAddressingPass(*TM));
- }
return false;
}
@@ -164,10 +166,12 @@ bool AMDGPUPassConfig::addPostRegAlloc() {
bool AMDGPUPassConfig::addPreSched2() {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
addPass(createR600EmitClauseMarkers(*TM));
- }
- addPass(&IfConverterID);
+ if (ST.isIfCvtEnabled())
+ addPass(&IfConverterID);
+ if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+ addPass(createR600ClauseMergePass(*TM));
return false;
}
@@ -185,4 +189,3 @@ bool AMDGPUPassConfig::addPreEmitPass() {
return false;
}
-
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index 687eadb..507570f 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -1005,13 +1005,14 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
return 0;
assert(isCondBranch(BranchMI));
+ int NumMatch = 0;
MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
- serialPatternMatch(TrueMBB);
- ifPatternMatch(TrueMBB);
+ NumMatch += serialPatternMatch(TrueMBB);
+ NumMatch += ifPatternMatch(TrueMBB);
MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
- serialPatternMatch(FalseMBB);
- ifPatternMatch(FalseMBB);
+ NumMatch += serialPatternMatch(FalseMBB);
+ NumMatch += ifPatternMatch(FalseMBB);
MachineBasicBlock *LandBlk;
int Cloned = 0;
@@ -1040,7 +1041,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
&& isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
LandBlk = *TrueMBB->succ_begin();
} else {
- return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
+ return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
}
  // improveSimpleJumpintoIf can handle the case where landBlk == NULL but the
@@ -1068,7 +1069,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
numClonedBlock += Cloned;
- return 1 + Cloned;
+ return 1 + Cloned + NumMatch;
}
int AMDGPUCFGStructurizer::loopendPatternMatch() {
@@ -1233,7 +1234,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
numClonedBlock += Num;
Num += serialPatternMatch(*HeadMBB->succ_begin());
- Num += serialPatternMatch(*(++HeadMBB->succ_begin()));
+ Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
Num += ifPatternMatch(HeadMBB);
assert(Num > 0);
@@ -1335,32 +1336,77 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// add initReg = initVal to headBlk
const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
- unsigned InitReg =
- HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
- if (!MigrateTrue || !MigrateFalse)
- llvm_unreachable("Extra register needed to handle CFG");
+ if (!MigrateTrue || !MigrateFalse) {
+ // XXX: We have an opportunity here to optimize the "branch into if" case
+ // here. Branch into if looks like this:
+ // entry
+ // / |
+ // diamond_head branch_from
+ // / \ |
+ // diamond_false diamond_true
+ // \ /
+ // done
+ //
+ // The diamond_head block begins the "if" and the diamond_true block
+ // is the block being "branched into".
+ //
+ // If MigrateTrue is true, then TrueBB is the block being "branched into"
+ // and if MigrateFalse is true, then FalseBB is the block being
+ // "branched into"
+ //
+ // Here is the pseudo code for how I think the optimization should work:
+ // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+ // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+ // 3. Move the branch instruction from diamond_head into its own basic
+ // block (new_block).
+ // 4. Add an unconditional branch from diamond_head to new_block
+ // 5. Replace the branch instruction in branch_from with an unconditional
+ // branch to new_block. If branch_from has multiple predecessors, then
+ // we need to replace the True/False block in the branch
+ // instruction instead of replacing it.
+ // 6. Change the condition of the branch instruction in new_block from
+ // COND to (COND || GPR0)
+ //
+  // In order to insert these MOV instructions, we will need to use the
+ // RegisterScavenger. Usually liveness stops being tracked during
+ // the late machine optimization passes, however if we implement
+ // bool TargetRegisterInfo::requiresRegisterScavenging(
+ // const MachineFunction &MF)
+ // and have it return true, liveness will be tracked correctly
+ // by generic optimization passes. We will also need to make sure that
+ // all of our target-specific passes that run after regalloc and before
+ // the CFGStructurizer track liveness and we will need to modify this pass
+ // to correctly track liveness.
+ //
+ // After the above changes, the new CFG should look like this:
+ // entry
+ // / |
+ // diamond_head branch_from
+ // \ /
+ // new_block
+ // / |
+ // diamond_false diamond_true
+ // \ /
+ // done
+ //
+ // Without this optimization, we are forced to duplicate the diamond_true
+ // block and we will end up with a CFG like this:
+ //
+ // entry
+ // / |
+ // diamond_head branch_from
+ // / \ |
+ // diamond_false diamond_true diamond_true (duplicate)
+ // \ / |
+ // done --------------------|
+ //
+ // Duplicating diamond_true can be very costly especially if it has a
+ // lot of instructions.
+ return 0;
+ }
int NumNewBlk = 0;
- if (!LandBlk) {
- LandBlk = HeadMBB->getParent()->CreateMachineBasicBlock();
- HeadMBB->getParent()->push_back(LandBlk); //insert to function
-
- if (TrueMBB) {
- TrueMBB->addSuccessor(LandBlk);
- } else {
- HeadMBB->addSuccessor(LandBlk);
- }
-
- if (FalseMBB) {
- FalseMBB->addSuccessor(LandBlk);
- } else {
- HeadMBB->addSuccessor(LandBlk);
- }
-
- NumNewBlk ++;
- }
-
bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
//insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
@@ -1375,6 +1421,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
CmpResReg, DebugLoc());
}
+ // XXX: We are running this after RA, so creating virtual registers will
+ // cause an assertion failure in the PostRA scheduling pass.
+ unsigned InitReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
DebugLoc());
@@ -1713,7 +1763,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
if (MBB->succ_size() != 2)
return;
MachineBasicBlock *MBB1 = *MBB->succ_begin();
- MachineBasicBlock *MBB2 = *(++MBB->succ_begin());
+ MachineBasicBlock *MBB2 = *llvm::next(MBB->succ_begin());
if (MBB1 != MBB2)
return;
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
index f7d0bd5..0f0c88d 100644
--- a/lib/Target/R600/AMDILInstrInfo.td
+++ b/lib/Target/R600/AMDILInstrInfo.td
@@ -118,15 +118,15 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
// Multiclass Instruction formats
//===--------------------------------------------------------------------===//
// Multiclass that handles branch instructions
-multiclass BranchConditional<SDNode Op> {
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
def _i32 : ILFormat<(outs),
- (ins brtarget:$target, GPRI32:$src0),
+ (ins brtarget:$target, rci:$src0),
"; i32 Pseudo branch instruction",
- [(Op bb:$target, GPRI32:$src0)]>;
+ [(Op bb:$target, (i32 rci:$src0))]>;
def _f32 : ILFormat<(outs),
- (ins brtarget:$target, GPRF32:$src0),
+ (ins brtarget:$target, rcf:$src0),
"; f32 Pseudo branch instruction",
- [(Op bb:$target, GPRF32:$src0)]>;
+ [(Op bb:$target, (f32 rcf:$src0))]>;
}
// Only scalar types should generate flow control
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index fde187b..9f8f6a8 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -17,7 +17,6 @@ add_llvm_target(R600CodeGen
AMDILISelLowering.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
- AMDGPUIndirectAddressing.cpp
AMDGPUISelDAGToDAG.cpp
AMDGPUMCInstLower.cpp
AMDGPUMachineFunction.cpp
@@ -28,6 +27,7 @@ add_llvm_target(R600CodeGen
AMDGPUConvertToISA.cpp
AMDGPUInstrInfo.cpp
AMDGPURegisterInfo.cpp
+ R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
R600EmitClauseMarkers.cpp
R600ExpandSpecialInstrs.cpp
@@ -47,6 +47,7 @@ add_llvm_target(R600CodeGen
SILowerControlFlow.cpp
SIMachineFunctionInfo.cpp
SIRegisterInfo.cpp
+ SITypeRewriter.cpp
)
add_dependencies(LLVMR600CodeGen AMDGPUCommonTableGen intrinsics_gen)
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index fac3c39..99e1377 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -23,6 +23,63 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
printAnnotation(OS, Annot);
}
+void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
+ switch (reg) {
+ case AMDGPU::VCC:
+ O << "vcc";
+ return;
+ case AMDGPU::SCC:
+ O << "scc";
+ return;
+ case AMDGPU::EXEC:
+ O << "exec";
+ return;
+ case AMDGPU::M0:
+ O << "m0";
+ return;
+ default:
+ break;
+ }
+
+  // It seems there's no way to use SIRegisterInfo here, and dealing with the
+  // giant enum of all the different shifted sets of registers is pretty
+  // unmanageable, so parse the name and reformat it to be prettier.
+ StringRef Name(getRegisterName(reg));
+
+ std::pair<StringRef, StringRef> Split = Name.split('_');
+ StringRef SubRegName = Split.first;
+ StringRef Rest = Split.second;
+
+  if (SubRegName.size() <= 4) { // Must be longer than just "SGPR"/"VGPR".
+ O << Name;
+ return;
+ }
+
+ unsigned RegIndex;
+ StringRef RegIndexStr = SubRegName.drop_front(4);
+
+ if (RegIndexStr.getAsInteger(10, RegIndex)) {
+ O << Name;
+ return;
+ }
+
+ if (SubRegName.front() == 'V')
+ O << 'v';
+ else if (SubRegName.front() == 'S')
+ O << 's';
+ else {
+ O << Name;
+ return;
+ }
+
+ if (Rest.empty()) // Only 1 32-bit register
+ O << RegIndex;
+ else {
+ unsigned NumReg = Rest.count('_') + 2;
+ O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']';
+ }
+}
+
void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
@@ -30,8 +87,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
switch (Op.getReg()) {
// This is the default predicate state, so we don't need to print it.
- case AMDGPU::PRED_SEL_OFF: break;
- default: O << getRegisterName(Op.getReg()); break;
+ case AMDGPU::PRED_SEL_OFF:
+ break;
+
+ default:
+ printRegOperand(Op.getReg(), O);
+ break;
}
} else if (Op.isImm()) {
O << Op.getImm();
@@ -255,4 +316,21 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
}
}
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+  // Note: Mask values are taken from SIInsertWaits.cpp and not from the ISA
+  // docs. The bit usage in SIInsertWaits.cpp does not match the ISA docs'
+  // description, but it works, so it might be a misprint in the docs.
+ unsigned SImm16 = MI->getOperand(OpNo).getImm();
+ unsigned Vmcnt = SImm16 & 0xF;
+ unsigned Expcnt = (SImm16 >> 4) & 0xF;
+ unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
+ if (Vmcnt != 0xF)
+ O << "vmcnt(" << Vmcnt << ") ";
+ if (Expcnt != 0x7)
+ O << "expcnt(" << Expcnt << ") ";
+ if (Lgkmcnt != 0x7)
+ O << "lgkmcnt(" << Lgkmcnt << ")";
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 4c1dfa6..77af942 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -32,6 +32,7 @@ public:
virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
private:
+ void printRegOperand(unsigned RegNo, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -52,6 +53,7 @@ private:
void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index 9a36903..29d0acf 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -95,7 +95,9 @@ public:
} // end anonymous namespace
-MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
StringRef CPU) {
return new ELFAMDGPUAsmBackend(T);
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 59136f3..4a8e1b0 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -21,7 +21,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
HasStaticCtorDtorReferenceInStaticMode = false;
LinkerRequiresNonEmptyDwarfLines = true;
MaxInstLength = 16;
- PCSymbol = "$";
SeparatorString = "\n";
CommentColumn = 40;
CommentString = ";";
@@ -32,9 +31,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
InlineAsmStart = ";#ASMSTART";
InlineAsmEnd = ";#ASMEND";
AssemblerDialect = 0;
- AllowQuotesInName = false;
- AllowNameToStartWithDigit = false;
- AllowPeriodsInName = false;
//===--- Data Emission Directives -------------------------------------===//
ZeroDirective = ".zero";
@@ -56,13 +52,11 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
//===--- Global Variable Emission Directives --------------------------===//
GlobalDirective = ".global";
- ExternDirective = ".extern";
HasSetDirective = false;
HasAggressiveSymbolFolding = true;
COMMDirectiveAlignmentIsInBytes = false;
HasDotTypeDotSizeDirective = false;
HasNoDeadStrip = true;
- HasSymbolResolver = false;
WeakRefDirective = ".weakref\t";
LinkOnceDirective = 0;
//===--- Dwarf Emission Directives -----------------------------------===//
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
new file mode 100644
index 0000000..521b3b3
--- /dev/null
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -0,0 +1,21 @@
+//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCCodeEmitter.h"
+
+using namespace llvm;
+
+// pin vtable to this file
+void AMDGPUMCCodeEmitter::anchor() {}
+
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index cd3a7ce..d8cf64a 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -24,6 +24,7 @@ class MCInst;
class MCOperand;
class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+ virtual void anchor();
public:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 61d70bb..a1bec28 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -88,7 +88,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCCodeEmitter *_Emitter,
bool RelaxAll,
bool NoExecStack) {
- return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
+ return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, false, false);
}
extern "C" void LLVMInitializeR600TargetMC() {
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index abb0320..f6b3376 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -40,8 +40,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
- StringRef CPU);
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS);
} // End llvm namespace
diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
index 3ccdf42..98f6925 100644
--- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
@@ -2,6 +2,7 @@
add_llvm_library(LLVMR600Desc
AMDGPUAsmBackend.cpp
AMDGPUELFObjectWriter.cpp
+ AMDGPUMCCodeEmitter.cpp
AMDGPUMCTargetDesc.cpp
AMDGPUMCAsmInfo.cpp
R600MCCodeEmitter.cpp
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index f470783..dd8df65 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -24,7 +24,6 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
using namespace llvm;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 4631c04..ee190e4 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -48,6 +48,7 @@ def : Proc<"pitcairn", SI_Itin, [FeatureSouthernIslands]>;
def : Proc<"verde", SI_Itin, [FeatureSouthernIslands]>;
def : Proc<"oland", SI_Itin, [FeatureSouthernIslands]>;
def : Proc<"hainan", SI_Itin, [FeatureSouthernIslands]>;
-def : Proc<"bonaire", SI_Itin, [FeatureSouthernIslands]>;
-def : Proc<"kabini", SI_Itin, [FeatureSouthernIslands]>;
-def : Proc<"kaveri", SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"bonaire", SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"kabini", SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"kaveri", SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"hawaii", SI_Itin, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
new file mode 100644
index 0000000..33d2ca3
--- /dev/null
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -0,0 +1,204 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The R600EmitClauseMarkers pass emits CFAlu instructions conservatively.
+/// This pass merges consecutive CFAlus where applicable.
+/// It needs to be called after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "r600mergeclause"
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+static bool isCFAlu(const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AMDGPU::CF_ALU:
+ case AMDGPU::CF_ALU_PUSH_BEFORE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+ static char ID;
+ const R600InstrInfo *TII;
+
+ unsigned getCFAluSize(const MachineInstr *MI) const;
+ bool isCFAluEnabled(const MachineInstr *MI) const;
+
+  /// The IfCvt pass can generate "disabled" ALU clause markers that need to
+  /// be removed, with their contents folded into the previous ALU clause.
+  /// This function walks the instructions after CFAlu, merging any disabled
+  /// CFAlu it finds, and stops at the first enabled CFAlu.
+ void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and do so if
+  /// possible.
+ bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu)
+ const;
+
+public:
+ R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ const char *getPassName() const;
+};
+
+char R600ClauseMergePass::ID = 0;
+
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const {
+ assert(isCFAlu(MI));
+ return MI->getOperand(
+ TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm();
+}
+
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const {
+ assert(isCFAlu(MI));
+ return MI->getOperand(
+ TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm();
+}
+
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu)
+ const {
+ int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end();
+ I++;
+ do {
+    while (I != E && !isCFAlu(I))
+ I++;
+ if (I == E)
+ return;
+ MachineInstr *MI = I++;
+ if (isCFAluEnabled(MI))
+ break;
+ CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+ MI->eraseFromParent();
+ } while (I != E);
+}
+
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
+ const MachineInstr *LatrCFAlu) const {
+ assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
+ int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+ unsigned RootInstCount = getCFAluSize(RootCFAlu),
+ LaterInstCount = getCFAluSize(LatrCFAlu);
+ unsigned CumuledInsts = RootInstCount + LaterInstCount;
+ if (CumuledInsts >= TII->getMaxAlusPerClause()) {
+ DEBUG(dbgs() << "Excess inst counts\n");
+ return false;
+ }
+ if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+ return false;
+ // Is KCache Bank 0 compatible ?
+ int Mode0Idx =
+ TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+ int KBank0Idx =
+ TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+ int KBank0LineIdx =
+ TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+ if (LatrCFAlu->getOperand(Mode0Idx).getImm() &&
+ RootCFAlu->getOperand(Mode0Idx).getImm() &&
+ (LatrCFAlu->getOperand(KBank0Idx).getImm() !=
+ RootCFAlu->getOperand(KBank0Idx).getImm() ||
+ LatrCFAlu->getOperand(KBank0LineIdx).getImm() !=
+ RootCFAlu->getOperand(KBank0LineIdx).getImm())) {
+ DEBUG(dbgs() << "Wrong KC0\n");
+ return false;
+ }
+ // Is KCache Bank 1 compatible ?
+ int Mode1Idx =
+ TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+ int KBank1Idx =
+ TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+ int KBank1LineIdx =
+ TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+ if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
+ RootCFAlu->getOperand(Mode1Idx).getImm() &&
+ (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
+ RootCFAlu->getOperand(KBank1Idx).getImm() ||
+ LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
+ RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+ DEBUG(dbgs() << "Wrong KC0\n");
+ return false;
+ }
+ if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
+ RootCFAlu->getOperand(Mode0Idx).setImm(
+ LatrCFAlu->getOperand(Mode0Idx).getImm());
+ RootCFAlu->getOperand(KBank0Idx).setImm(
+ LatrCFAlu->getOperand(KBank0Idx).getImm());
+ RootCFAlu->getOperand(KBank0LineIdx).setImm(
+ LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+ }
+ if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
+ RootCFAlu->getOperand(Mode1Idx).setImm(
+ LatrCFAlu->getOperand(Mode1Idx).getImm());
+ RootCFAlu->getOperand(KBank1Idx).setImm(
+ LatrCFAlu->getOperand(KBank1Idx).getImm());
+ RootCFAlu->getOperand(KBank1LineIdx).setImm(
+ LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+ }
+ RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
+ RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+ return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+ BB != BB_E; ++BB) {
+ MachineBasicBlock &MBB = *BB;
+ MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ MachineBasicBlock::iterator LatestCFAlu = E;
+ while (I != E) {
+ MachineInstr *MI = I++;
+ if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+ TII->mustBeLastInClause(MI->getOpcode()))
+ LatestCFAlu = E;
+ if (!isCFAlu(MI))
+ continue;
+ cleanPotentialDisabledCFAlu(MI);
+
+ if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
+ MI->eraseFromParent();
+ } else {
+ assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+ LatestCFAlu = MI;
+ }
+ }
+ }
+ return false;
+}
+
+const char *R600ClauseMergePass::getPassName() const {
+ return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+ return new R600ClauseMergePass(TM);
+}
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 715be37..ac3d8f6 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -373,15 +373,6 @@ public:
case AMDGPU::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
- case AMDGPU::EG_ExportBuf:
- case AMDGPU::EG_ExportSwz:
- case AMDGPU::R600_ExportBuf:
- case AMDGPU::R600_ExportSwz:
- case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
- case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
- case AMDGPU::RAT_STORE_DWORD32_cm:
- case AMDGPU::RAT_STORE_DWORD64_cm:
DEBUG(dbgs() << CfCount << ":"; MI->dump(););
CfCount++;
break;
@@ -491,6 +482,10 @@ public:
EmitALUClause(I, AluClauses[i], CfCount);
}
default:
+ if (TII->isExport(MI->getOpcode())) {
+ DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+ CfCount++;
+ }
break;
}
}
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 90fc29c..1781f2a 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -44,7 +44,9 @@ namespace R600_InstFlag {
TEX_INST = (1 << 13),
ALU_INST = (1 << 14),
LDS_1A = (1 << 15),
- LDS_1A1D = (1 << 16)
+ LDS_1A1D = (1 << 16),
+ IS_EXPORT = (1 << 17),
+ LDS_1A2D = (1 << 18)
};
}
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index fac2b47..1bbfd2b 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -47,6 +47,11 @@ private:
break;
}
+ // These will be expanded to two ALU instructions in the
+ // ExpandSpecialInstructions pass.
+ if (TII->isLDSRetInstr(MI->getOpcode()))
+ return 2;
+
if(TII->isVector(*MI) ||
TII->isCubeOp(MI->getOpcode()) ||
TII->isReductionOp(MI->getOpcode()))
@@ -84,6 +89,7 @@ private:
switch (MI->getOpcode()) {
case AMDGPU::KILL:
case AMDGPU::RETURN:
+ case AMDGPU::IMPLICIT_DEF:
return true;
default:
return false;
@@ -105,8 +111,13 @@ private:
}
bool SubstituteKCacheBank(MachineInstr *MI,
- std::vector<std::pair<unsigned, unsigned> > &CachedConsts) const {
+ std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
+ bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+
+ if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+ return true;
+
const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
TII->getSrcs(MI);
assert((TII->isALUInstr(MI->getOpcode()) ||
@@ -139,6 +150,9 @@ private:
return false;
}
+ if (!UpdateInstr)
+ return true;
+
for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
continue;
@@ -159,6 +173,52 @@ private:
return true;
}
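+ /// Check that every clause-local register defined by \p Def is killed
+ /// before we would be forced to close the current ALU clause, either by
+ /// KCache read limitations or by the per-clause instruction limit.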
+ bool canClauseLocalKillFitInClause(
+ unsigned AluInstCount,
+ std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+ MachineBasicBlock::iterator Def,
+ MachineBasicBlock::iterator BBEnd) {
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+ for (MachineInstr::const_mop_iterator
+ MOI = Def->operands_begin(),
+ MOE = Def->operands_end(); MOI != MOE; ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef() ||
+ TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
+ continue;
+
+ // Def defines a clause local register, so check that its use will fit
+ // in the clause.
+ unsigned LastUseCount = 0;
+ for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
+ AluInstCount += OccupiedDwords(UseI);
+ // Make sure we won't need to end the clause due to KCache limitations.
+ if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
+ return false;
+
+ // We have reached the maximum instruction limit before finding the
+ // use that kills this register, so we cannot use this def in the
+ // current clause.
+ if (AluInstCount >= TII->getMaxAlusPerClause())
+ return false;
+
+ // Register kill flags have been cleared by the time we get to this
+ // pass, but it is safe to assume that all uses of this register
+ // occur in the same basic block as its definition, because
+ // it is illegal for the scheduler to schedule them in
+ // different blocks.
+ if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+ LastUseCount = AluInstCount;
+
+ if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+ break;
+ }
+ if (LastUseCount)
+ return LastUseCount <= TII->getMaxAlusPerClause();
+ llvm_unreachable("Clause local register live at end of clause.");
+ }
+ return true;
+ }
+
MachineBasicBlock::iterator
MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator ClauseHead = I;
@@ -173,6 +233,14 @@ private:
if (AluInstCount > TII->getMaxAlusPerClause())
break;
if (I->getOpcode() == AMDGPU::PRED_X) {
+ // We put PRED_X in its own clause to ensure that ifcvt won't create
+ // clauses with more than 128 insts.
+ // IfCvt only converts branches whose "then" and "else" blocks each contain
+ // fewer than ~60 insts, so converted clauses cannot grow beyond ~121 insts
+ // (the predicate setter has to live in the same clause as the predicated
+ // ALUs).
+ if (AluInstCount > 0)
+ break;
if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
PushBeforeModifier = true;
AluInstCount ++;
@@ -189,11 +257,13 @@ private:
I++;
break;
}
- if (TII->isALUInstr(I->getOpcode()) &&
- !SubstituteKCacheBank(I, KCacheBanks))
+
+ // If this instruction defines a clause local register, make sure
+ // its use can fit in this clause.
+ if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
break;
- if (I->getOpcode() == AMDGPU::DOT_4 &&
- !SubstituteKCacheBank(I, KCacheBanks))
+
+ if (!SubstituteKCacheBank(I, KCacheBanks))
break;
AluInstCount += OccupiedDwords(I);
}
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index 67b42d7..aeee4aa 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -68,6 +68,23 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
I = llvm::next(I);
+ // Expand LDS_*_RET instructions
+ if (TII->isLDSRetInstr(MI.getOpcode())) {
+ int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ MachineOperand &DstOp = MI.getOperand(DstIdx);
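+      // LDS reads return their result through OQAP, so copy OQAP into the
+      // original destination and retarget the LDS instruction to define OQAP.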
+ MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
+ DstOp.getReg(), AMDGPU::OQAP);
+ DstOp.setReg(AMDGPU::OQAP);
+ int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::pred_sel);
+ int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
+ AMDGPU::OpName::pred_sel);
+ // Copy the pred_sel bit
+ Mov->getOperand(MovPredSelIdx).setReg(
+ MI.getOperand(LDSPredSelIdx).getReg());
+ }
+
switch (MI.getOpcode()) {
default: break;
// Expand PRED_X to one of the PRED_SET instructions.
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index ce6ac89..0fcb488 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -38,14 +38,24 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
computeRegisterProperties();
- setOperationAction(ISD::FADD, MVT::v4f32, Expand);
- setOperationAction(ISD::FADD, MVT::v2f32, Expand);
- setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
- setOperationAction(ISD::FMUL, MVT::v2f32, Expand);
- setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
- setOperationAction(ISD::FDIV, MVT::v2f32, Expand);
- setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
- setOperationAction(ISD::FSUB, MVT::v2f32, Expand);
+ // Set condition code actions
+ setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+
+ setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
+ setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Custom);
setOperationAction(ISD::FSIN, MVT::f32, Custom);
@@ -69,21 +79,33 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
- setOperationAction(ISD::SELECT, MVT::i32, Custom);
- setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+ // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+ // spaces, so it is custom lowered to handle those where it isn't.
setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+
setOperationAction(ISD::STORE, MVT::i8, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+ setTruncStoreAction(MVT::i32, MVT::i16, Custom);
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
@@ -99,7 +121,7 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
- setSchedulingPreference(Sched::VLIW);
+ setSchedulingPreference(Sched::Source);
}
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
@@ -111,7 +133,25 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
switch (MI->getOpcode()) {
- default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ default:
+  // Replace LDS_*_RET instructions that don't have any uses with the
+ // equivalent LDS_*_NORET instruction.
+ if (TII->isLDSRetInstr(MI->getOpcode())) {
+ int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+ assert(DstIdx != -1);
+ MachineInstrBuilder NewMI;
+ if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+ return BB;
+
+ NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+ TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
+ for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
+ NewMI.addOperand(MI->getOperand(i));
+ }
+ } else {
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ }
+ break;
case AMDGPU::CLAMP_R600: {
MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
AMDGPU::MOV,
@@ -147,19 +187,6 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
break;
}
- case AMDGPU::LDS_READ_RET: {
- MachineInstrBuilder NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
- TII->get(MI->getOpcode()),
- AMDGPU::OQAP);
- for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
- NewMI.addOperand(MI->getOperand(i));
- }
- TII->buildDefaultInstruction(*BB, I, AMDGPU::MOV,
- MI->getOperand(0).getReg(),
- AMDGPU::OQAP);
- break;
- }
-
case AMDGPU::MOV_IMM_F32:
TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
MI->getOperand(1).getFPImm()->getValueAPF()
@@ -486,10 +513,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::FCOS:
case ISD::FSIN: return LowerTrig(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
- case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::LOAD: return LowerLOAD(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
SDValue Chain = Op.getOperand(0);
@@ -554,7 +579,6 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
DL, MVT::f32, SDValue(interp, 0));
}
-
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
@@ -576,6 +600,24 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
RegisterJNode, RegisterINode);
return SDValue(interp, slot % 2);
}
+ case AMDGPUIntrinsic::R600_interp_xy:
+ case AMDGPUIntrinsic::R600_interp_zw: {
+ int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ MachineSDNode *interp;
+ SDValue RegisterINode = Op.getOperand(2);
+ SDValue RegisterJNode = Op.getOperand(3);
+
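+    // INTERP_PAIR_XY / INTERP_PAIR_ZW produce two f32 results; pack them back
+    // into a single v2f32 value for the intrinsic.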
+ if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
+ interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+ MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+ RegisterJNode, RegisterINode);
+ else
+ interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+ MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+ RegisterJNode, RegisterINode);
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
+ SDValue(interp, 0), SDValue(interp, 1));
+ }
case AMDGPUIntrinsic::R600_tex:
case AMDGPUIntrinsic::R600_texc:
case AMDGPUIntrinsic::R600_txl:
@@ -585,7 +627,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case AMDGPUIntrinsic::R600_txf:
case AMDGPUIntrinsic::R600_txq:
case AMDGPUIntrinsic::R600_ddx:
- case AMDGPUIntrinsic::R600_ddy: {
+ case AMDGPUIntrinsic::R600_ddy:
+ case AMDGPUIntrinsic::R600_ldptr: {
unsigned TextureOp;
switch (IntrinsicID) {
case AMDGPUIntrinsic::R600_tex:
@@ -618,6 +661,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case AMDGPUIntrinsic::R600_ddy:
TextureOp = 9;
break;
+ case AMDGPUIntrinsic::R600_ldptr:
+ TextureOp = 10;
+ break;
default:
llvm_unreachable("Unknow Texture Operation");
}
@@ -792,20 +838,6 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
false, false, false, 0);
}
-SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
- MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL =
- static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
-
- FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
- assert(FIN);
-
- unsigned FrameIndex = FIN->getIndex();
- unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
- return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
-}
-
bool R600TargetLowering::isZero(SDValue Op) const {
if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
return Cst->isNullValue();
@@ -836,16 +868,27 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
//
// SET* can match the following patterns:
//
- // select_cc f32, f32, -1, 0, cc_any
- // select_cc f32, f32, 1.0f, 0.0f, cc_any
- // select_cc i32, i32, -1, 0, cc_any
+ // select_cc f32, f32, -1, 0, cc_supported
+ // select_cc f32, f32, 1.0f, 0.0f, cc_supported
+ // select_cc i32, i32, -1, 0, cc_supported
//
// Move hardware True/False values to the correct operand.
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ ISD::CondCode InverseCC =
+ ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
if (isHWTrueValue(False) && isHWFalseValue(True)) {
- ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
- std::swap(False, True);
- CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+ if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+ std::swap(False, True);
+ CC = DAG.getCondCode(InverseCC);
+ } else {
+ ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+ if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+ std::swap(False, True);
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(SwapInvCC);
+ }
+ }
}
if (isHWTrueValue(True) && isHWFalseValue(False) &&
@@ -858,14 +901,34 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
//
// CND* can match the following patterns:
//
- // select_cc f32, 0.0, f32, f32, cc_any
- // select_cc f32, 0.0, i32, i32, cc_any
- // select_cc i32, 0, f32, f32, cc_any
- // select_cc i32, 0, i32, i32, cc_any
+ // select_cc f32, 0.0, f32, f32, cc_supported
+ // select_cc f32, 0.0, i32, i32, cc_supported
+ // select_cc i32, 0, f32, f32, cc_supported
+ // select_cc i32, 0, i32, i32, cc_supported
//
- if (isZero(LHS) || isZero(RHS)) {
- SDValue Cond = (isZero(LHS) ? RHS : LHS);
- SDValue Zero = (isZero(LHS) ? LHS : RHS);
+
+ // Try to move the zero value to the RHS
+ if (isZero(LHS)) {
+ ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+ // Try swapping the operands
+ ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+ if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(CCSwapped);
+ } else {
+      // Try inverting the condition and then swapping the operands
+ ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+ CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+ if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+ std::swap(True, False);
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(CCSwapped);
+ }
+ }
+ }
+ if (isZero(RHS)) {
+ SDValue Cond = LHS;
+ SDValue Zero = RHS;
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
if (CompareVT != VT) {
// Bitcast True / False to the correct types. This will end up being
@@ -875,20 +938,11 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
}
- if (isZero(LHS)) {
- CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
- }
switch (CCOpcode) {
case ISD::SETONE:
case ISD::SETUNE:
case ISD::SETNE:
- case ISD::SETULE:
- case ISD::SETULT:
- case ISD::SETOLE:
- case ISD::SETOLT:
- case ISD::SETLE:
- case ISD::SETLT:
CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
Temp = True;
True = False;
@@ -936,17 +990,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
DAG.getCondCode(ISD::SETNE));
}
-SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- return DAG.getNode(ISD::SELECT_CC,
- SDLoc(Op),
- Op.getValueType(),
- Op.getOperand(0),
- DAG.getConstant(0, MVT::i32),
- Op.getOperand(1),
- Op.getOperand(2),
- DAG.getCondCode(ISD::SETNE));
-}
-
/// LLVM generates byte-addresed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
@@ -1009,19 +1052,59 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue Value = Op.getOperand(1);
SDValue Ptr = Op.getOperand(2);
- if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
- // Convert pointer from byte address to dword address.
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
- DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
- Ptr, DAG.getConstant(2, MVT::i32)));
+ SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+ if (Result.getNode()) {
+ return Result;
+ }
- if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
- assert(!"Truncated and indexed stores not supported yet");
- } else {
- Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+ if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+ if (StoreNode->isTruncatingStore()) {
+ EVT VT = Value.getValueType();
+ assert(VT.bitsLE(MVT::i32));
+ EVT MemVT = StoreNode->getMemoryVT();
+ SDValue MaskConstant;
+ if (MemVT == MVT::i8) {
+ MaskConstant = DAG.getConstant(0xFF, MVT::i32);
+ } else {
+ assert(MemVT == MVT::i16);
+ MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
+ }
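+      // Split the byte address into a dword address plus a byte offset, then
+      // shift the truncated value and the mask into position so the store can
+      // be emitted as a masked-or (STORE_MSKOR) memory intrinsic.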
+ SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+ DAG.getConstant(2, MVT::i32));
+ SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(0x00000003, VT));
+ SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+ DAG.getConstant(3, VT));
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+ // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+ // vector instead.
+ SDValue Src[4] = {
+ ShiftedValue,
+ DAG.getConstant(0, MVT::i32),
+ DAG.getConstant(0, MVT::i32),
+ Mask
+ };
+ SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
+ SDValue Args[3] = { Chain, Input, DWordAddr };
+ return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+ Op->getVTList(), Args, 3, MemVT,
+ StoreNode->getMemOperand());
+ } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+ Value.getValueType().bitsGE(MVT::i32)) {
+ // Convert pointer from byte address to dword address.
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+ DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+ Ptr, DAG.getConstant(2, MVT::i32)));
+
+ if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+ assert(!"Truncated and indexed stores not supported yet");
+ } else {
+ Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+ }
+ return Chain;
}
- return Chain;
}
EVT ValueVT = Value.getValueType();
@@ -1121,12 +1204,22 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
SDValue Ptr = Op.getOperand(1);
SDValue LoweredLoad;
+ if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+ SDValue MergedValues[2] = {
+ SplitVectorLoad(Op, DAG),
+ Chain
+ };
+ return DAG.getMergeValues(MergedValues, 2, DL);
+ }
+
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
- if (ConstantBlock > -1) {
+ if (ConstantBlock > -1 &&
+ ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+ (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
SDValue Result;
- if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
- dyn_cast<Constant>(LoadNode->getSrcValue()) ||
- dyn_cast<ConstantSDNode>(Ptr)) {
+ if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
+ isa<Constant>(LoadNode->getSrcValue()) ||
+ isa<ConstantSDNode>(Ptr)) {
SDValue Slots[4];
for (unsigned i = 0; i < 4; i++) {
// We want Const position encoded with the following formula :
@@ -1166,13 +1259,13 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
return DAG.getMergeValues(MergedValues, 2, DL);
}
- // For most operations returning SDValue() will result int he node being
- // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so
- // we need to manually expand loads that may be legal in some address spaces
- // and illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported
- // for compute shaders, since the data is sign extended when it is uploaded
- // to the buffer. Howerver SEXT loads from other addresspaces are not
- // supported, so we need to expand them here.
+ // For most operations returning SDValue() will result in the node being
+ // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+ // need to manually expand loads that may be legal in some address spaces and
+ // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+ // compute shaders, since the data is sign extended when it is uploaded to the
+ // buffer. However SEXT loads from other address spaces are not supported, so
+ // we need to expand them here.
if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
EVT MemVT = LoadNode->getMemoryVT();
assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
@@ -1252,23 +1345,39 @@ SDValue R600TargetLowering::LowerFormalArguments(
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
+
+ SmallVector<ISD::InputArg, 8> LocalIns;
+
+ getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+ LocalIns);
- AnalyzeFormalArguments(CCInfo, Ins);
+ AnalyzeFormalArguments(CCInfo, LocalIns);
for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
CCValAssign &VA = ArgLocs[i];
- EVT VT = VA.getLocVT();
+ EVT VT = Ins[i].VT;
+ EVT MemVT = LocalIns[i].VT;
+
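+    // Graphics shaders receive their inputs pre-loaded in registers; only
+    // compute kernels read their arguments from the constant buffer below.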
+ if (ShaderType != ShaderType::COMPUTE) {
+ unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+ SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ InVals.push_back(Register);
+ continue;
+ }
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
AMDGPUAS::CONSTANT_BUFFER_0);
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = DAG.getLoad(VT, DL, Chain,
- DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, false, 4); // 4 is the prefered alignment for
- // the CONSTANT memory space.
+ SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
+ DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ MemVT, false, false, 4);
+    // 4 is the preferred alignment for the CONSTANT memory space.
InVals.push_back(Arg);
}
return Chain;
@@ -1292,6 +1401,11 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
};
for (unsigned i = 0; i < 4; i++) {
+ if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      // We mask the write here to teach later passes that the ith element of
+      // this vector is undef. Thus we can use it to reduce 128-bit register
+      // usage, break false dependencies and additionally make the assembly
+      // easier to read.
+ RemapSwizzle[i] = 7; // SEL_MASK_WRITE
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
if (C->isZero()) {
RemapSwizzle[i] = 4; // SEL_0
@@ -1335,12 +1449,16 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
->getZExtValue();
- if (!isUnmovable[Idx]) {
- // Swap i and Idx
- std::swap(NewBldVec[Idx], NewBldVec[i]);
- std::swap(RemapSwizzle[RemapSwizzle[Idx]], RemapSwizzle[RemapSwizzle[i]]);
+ if (i == Idx) {
+ isUnmovable[Idx] = true;
+ continue;
}
- isUnmovable[Idx] = true;
+ if (isUnmovable[Idx])
+ continue;
+ // Swap i and Idx
+ std::swap(NewBldVec[Idx], NewBldVec[i]);
+ std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+ break;
}
}
@@ -1422,8 +1540,8 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
break;
}
- // insert_vector_elt (build_vector elt0, …, eltN), NewEltIdx, idx
- // => build_vector elt0, …, NewEltIdx, …, eltN
+ // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+ // => build_vector elt0, ... , NewEltIdx, ... , eltN
case ISD::INSERT_VECTOR_ELT: {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
@@ -1525,15 +1643,20 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
LHSCC = ISD::getSetCCInverse(LHSCC,
LHS.getOperand(0).getValueType().isInteger());
- return DAG.getSelectCC(SDLoc(N),
- LHS.getOperand(0),
- LHS.getOperand(1),
- LHS.getOperand(2),
- LHS.getOperand(3),
- LHSCC);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
+ return DAG.getSelectCC(SDLoc(N),
+ LHS.getOperand(0),
+ LHS.getOperand(1),
+ LHS.getOperand(2),
+ LHS.getOperand(3),
+ LHSCC);
+ break;
}
}
+ return SDValue();
}
+
case AMDGPUISD::EXPORT: {
SDValue Arg = N->getOperand(1);
if (Arg.getOpcode() != ISD::BUILD_VECTOR)
@@ -1586,3 +1709,253 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
}
return SDValue();
}
+
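+/// Try to fold the node feeding operand \p SrcIdx of \p ParentNode into the
+/// operand fields themselves: FNEG/FABS become the neg/abs modifier bits,
+/// CONST_COPY becomes an ALU_CONST read with the matching sel value, and
+/// MOV_IMM_* becomes an inline constant register or an ALU_LITERAL_X
+/// immediate.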
+static bool
+FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
+ SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
+ const R600InstrInfo *TII =
+ static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+ if (!Src.isMachineOpcode())
+ return false;
+ switch (Src.getMachineOpcode()) {
+ case AMDGPU::FNEG_R600:
+ if (!Neg.getNode())
+ return false;
+ Src = Src.getOperand(0);
+ Neg = DAG.getTargetConstant(1, MVT::i32);
+ return true;
+ case AMDGPU::FABS_R600:
+ if (!Abs.getNode())
+ return false;
+ Src = Src.getOperand(0);
+ Abs = DAG.getTargetConstant(1, MVT::i32);
+ return true;
+ case AMDGPU::CONST_COPY: {
+ unsigned Opcode = ParentNode->getMachineOpcode();
+ bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+
+ if (!Sel.getNode())
+ return false;
+
+ SDValue CstOffset = Src.getOperand(0);
+ if (ParentNode->getValueType(0).isVector())
+ return false;
+
+    // Gather constant values
+ int SrcIndices[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ };
+ std::vector<unsigned> Consts;
+ for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
+ int OtherSrcIdx = SrcIndices[i];
+ int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
+ if (OtherSrcIdx < 0 || OtherSelIdx < 0)
+ continue;
+ if (HasDst) {
+ OtherSrcIdx--;
+ OtherSelIdx--;
+ }
+ if (RegisterSDNode *Reg =
+ dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
+ if (Reg->getReg() == AMDGPU::ALU_CONST) {
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
+ ParentNode->getOperand(OtherSelIdx));
+ Consts.push_back(Cst->getZExtValue());
+ }
+ }
+ }
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+ Consts.push_back(Cst->getZExtValue());
+ if (!TII->fitsConstReadLimitations(Consts)) {
+ return false;
+ }
+
+ Sel = CstOffset;
+ Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+ return true;
+ }
+ case AMDGPU::MOV_IMM_I32:
+ case AMDGPU::MOV_IMM_F32: {
+ unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+ uint64_t ImmValue = 0;
+
+
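+    // Prefer the hardware inline constants (ZERO, HALF, ONE, ONE_INT) when
+    // the immediate matches one of them; otherwise fall back to an
+    // ALU_LITERAL_X literal.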
+ if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+ ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+ float FloatValue = FPC->getValueAPF().convertToFloat();
+ if (FloatValue == 0.0) {
+ ImmReg = AMDGPU::ZERO;
+ } else if (FloatValue == 0.5) {
+ ImmReg = AMDGPU::HALF;
+ } else if (FloatValue == 1.0) {
+ ImmReg = AMDGPU::ONE;
+ } else {
+ ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+ }
+ } else {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
+ uint64_t Value = C->getZExtValue();
+ if (Value == 0) {
+ ImmReg = AMDGPU::ZERO;
+ } else if (Value == 1) {
+ ImmReg = AMDGPU::ONE_INT;
+ } else {
+ ImmValue = Value;
+ }
+ }
+
+ // Check that we aren't already using an immediate.
+ // XXX: It's possible for an instruction to have more than one
+ // immediate operand, but this is not supported yet.
+ if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+ if (!Imm.getNode())
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
+ assert(C);
+ if (C->getZExtValue())
+ return false;
+ Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
+ }
+ Src = DAG.getRegister(ImmReg, MVT::i32);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+
+/// \brief Fold the instructions after selecting them
+SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
+ SelectionDAG &DAG) const {
+ const R600InstrInfo *TII =
+ static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+ if (!Node->isMachineOpcode())
+ return Node;
+ unsigned Opcode = Node->getMachineOpcode();
+ SDValue FakeOp;
+
+ std::vector<SDValue> Ops;
+ for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
+ I != E; ++I)
+ Ops.push_back(*I);
+
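+  // DOT_4 carries a full set of per-channel sources (src0_X .. src1_W), so
+  // try to fold modifiers and constants into each channel independently.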
+ if (Opcode == AMDGPU::DOT_4) {
+ int OperandIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+ };
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+ };
+ for (unsigned i = 0; i < 8; i++) {
+ if (OperandIdx[i] < 0)
+ return Node;
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue &Abs = Ops[AbsIdx[i] - 1];
+ bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+ if (HasDst)
+ SelIdx--;
+ SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+ if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+ for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
+ SDValue &Src = Ops[i];
+ if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ } else if (Opcode == AMDGPU::CLAMP_R600) {
+ SDValue Src = Node->getOperand(0);
+ if (!Src.isMachineOpcode() ||
+ !TII->hasInstrModifiers(Src.getMachineOpcode()))
+ return Node;
+ int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
+ AMDGPU::OpName::clamp);
+ if (ClampIdx < 0)
+ return Node;
+ std::vector<SDValue> Ops;
+ unsigned NumOp = Src.getNumOperands();
+ for(unsigned i = 0; i < NumOp; ++i)
+ Ops.push_back(Src.getOperand(i));
+ Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
+ return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
+ Node->getVTList(), Ops);
+ } else {
+ if (!TII->hasInstrModifiers(Opcode))
+ return Node;
+ int OperandIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+ };
+ int NegIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+ };
+ int AbsIdx[] = {
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+ -1
+ };
+ for (unsigned i = 0; i < 3; i++) {
+ if (OperandIdx[i] < 0)
+ return Node;
+ SDValue &Src = Ops[OperandIdx[i] - 1];
+ SDValue &Neg = Ops[NegIdx[i] - 1];
+ SDValue FakeAbs;
+ SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+ bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+ int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+ int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+ if (HasDst) {
+ SelIdx--;
+ ImmIdx--;
+ }
+ SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+ SDValue &Imm = Ops[ImmIdx];
+ if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
+ }
+
+ return Node;
+}
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index a033fcb..c10257e 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -56,11 +56,9 @@ private:
SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
@@ -68,6 +66,7 @@ private:
void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
unsigned &Channel, unsigned &PtrIncr) const;
bool isZero(SDValue Op) const;
+ virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
};
} // End namespace llvm;
diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td
index 2d72404..9428bab 100644
--- a/lib/Target/R600/R600InstrFormats.td
+++ b/lib/Target/R600/R600InstrFormats.td
@@ -16,7 +16,6 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
: AMDGPUInst <outs, ins, asm, pattern> {
field bits<64> Inst;
- bit TransOnly = 0;
bit Trig = 0;
bit Op3 = 0;
bit isVector = 0;
@@ -29,6 +28,8 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
bit VTXInst = 0;
bit TEXInst = 0;
bit ALUInst = 0;
+ bit IsExport = 0;
+ bit LDS_1A2D = 0;
let Namespace = "AMDGPU";
let OutOperandList = outs;
@@ -37,7 +38,6 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
let Pattern = pattern;
let Itinerary = itin;
- let TSFlags{0} = TransOnly;
let TSFlags{4} = Trig;
let TSFlags{5} = Op3;
@@ -53,6 +53,8 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
let TSFlags{14} = ALUInst;
let TSFlags{15} = LDS_1A;
let TSFlags{16} = LDS_1A1D;
+ let TSFlags{17} = IsExport;
+ let TSFlags{18} = LDS_1A2D;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 4e7eff9..c0827fc 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenDFAPacketizer.inc"
using namespace llvm;
@@ -77,16 +77,16 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
}
-MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
- unsigned DstReg, int64_t Imm) const {
- MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc());
- MachineInstrBuilder MIB(*MF, MI);
- MIB.addReg(DstReg, RegState::Define);
- MIB.addReg(AMDGPU::ALU_LITERAL_X);
- MIB.addImm(Imm);
- MIB.addReg(0); // PREDICATE_BIT
-
- return MI;
+/// \returns true if \p MBBI can be moved into a new basic block.
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const {
+ for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
+ E = MBBI->operands_end(); I != E; ++I) {
+ if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
+ I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+ return false;
+ }
+ return true;
}
unsigned R600InstrInfo::getIEQOpcode() const {
@@ -149,17 +149,58 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
unsigned TargetFlags = get(Opcode).TSFlags;
return ((TargetFlags & R600_InstFlag::LDS_1A) |
- (TargetFlags & R600_InstFlag::LDS_1A1D));
+ (TargetFlags & R600_InstFlag::LDS_1A1D) |
+ (TargetFlags & R600_InstFlag::LDS_1A2D));
+}
+
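+// LDS instructions come in RET and NORET variants; the RET forms are exactly
+// the ones that carry an explicit dst operand.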
+bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const {
+ return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1;
+}
+
+bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
+ return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+}
+
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
+ if (isALUInstr(MI->getOpcode()))
+ return true;
+ if (isVector(*MI) || isCubeOp(MI->getOpcode()))
+ return true;
+ switch (MI->getOpcode()) {
+ case AMDGPU::PRED_X:
+ case AMDGPU::INTERP_PAIR_XY:
+ case AMDGPU::INTERP_PAIR_ZW:
+ case AMDGPU::INTERP_VEC_LOAD:
+ case AMDGPU::COPY:
+ case AMDGPU::DOT_4:
+ return true;
+ default:
+ return false;
+ }
}
bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
- return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
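+  // Cayman has no separate trans unit, so no instruction is trans-only there.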
+ if (ST.hasCaymanISA())
+ return false;
+ return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
}
bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
return isTransOnly(MI->getOpcode());
}
+bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
+ return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+}
+
+bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const {
+ return isVectorOnly(MI->getOpcode());
+}
+
+bool R600InstrInfo::isExport(unsigned Opcode) const {
+ return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT);
+}
+
bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
return ST.hasVertexCache() && IS_VTX(get(Opcode));
}
@@ -189,6 +230,30 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
}
}
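+// AR_X is the address register used for indirect (relative) register
+// addressing on R600.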
+bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const {
+ return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const {
+ return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+}
+
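+/// \returns true if any source operand of \p MI is a physical register in the
+/// R600_LDS_SRC_REG register class.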
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
+ if (!isALUInstr(MI->getOpcode())) {
+ return false;
+ }
+ for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+ E = MI->operands_end(); I != E; ++I) {
+ if (!I->isReg() || !I->isUse() ||
+ TargetRegisterInfo::isVirtualRegister(I->getReg()))
+ continue;
+
+ if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+ return true;
+ }
+ return false;
+}
+
int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const {
static const unsigned OpTable[] = {
AMDGPU::OpName::src0,
@@ -321,6 +386,8 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI,
static std::vector<std::pair<int, unsigned> >
Swizzle(std::vector<std::pair<int, unsigned> > Src,
R600InstrInfo::BankSwizzle Swz) {
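+  // Reading the same source twice only needs a single read port, so ignore
+  // the duplicate when checking swizzle legality.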
+ if (Src[0] == Src[1])
+ Src[1].first = -1;
switch (Swz) {
case R600InstrInfo::ALU_VEC_012_SCL_210:
break;
@@ -462,6 +529,9 @@ static bool
isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
const std::vector<std::pair<int, unsigned> > &TransOps,
unsigned ConstCount) {
+ // TransALU can't read 3 constants
+ if (ConstCount > 2)
+ return false;
for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
const std::pair<int, unsigned> &Src = TransOps[i];
unsigned Cycle = getTransSwizzle(TransSwz, i);
@@ -615,6 +685,11 @@ bool isJump(unsigned Opcode) {
return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
}
+static bool isBranch(unsigned Opcode) {
+ return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
+ Opcode == AMDGPU::BRANCH_COND_f32;
+}
+
bool
R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
@@ -633,6 +708,10 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
return false;
--I;
}
+ // AMDGPU::BRANCH* instructions are only available after isel and are not
+ // handled
+ if (isBranch(I->getOpcode()))
+ return true;
if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
return false;
}
@@ -942,6 +1021,20 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
return true;
}
+ if (MI->getOpcode() == AMDGPU::DOT_4) {
+ MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
+ .setReg(Pred[2].getReg());
+ MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
+ .setReg(Pred[2].getReg());
+ MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
+ .setReg(Pred[2].getReg());
+ MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
+ .setReg(Pred[2].getReg());
+ MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+ MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+ return true;
+ }
+
if (PIdx != -1) {
MachineOperand &PMO = MI->getOperand(PIdx);
PMO.setReg(Pred[2].getReg());
@@ -953,6 +1046,10 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
return false;
}
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
+ return 2;
+}
+
unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
unsigned *PredCost) const {
@@ -961,67 +1058,25 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return 2;
}
-int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const MachineFrameInfo *MFI = MF.getFrameInfo();
- int Offset = 0;
-
- if (MFI->getNumObjects() == 0) {
- return -1;
- }
-
- if (MRI.livein_empty()) {
- return 0;
- }
-
- for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
- LE = MRI.livein_end();
- LI != LE; ++LI) {
- Offset = std::max(Offset,
- GET_REG_INDEX(RI.getEncodingValue(LI->first)));
- }
-
- return Offset + 1;
-}
-
-int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
- int Offset = 0;
- const MachineFrameInfo *MFI = MF.getFrameInfo();
-
- // Variable sized objects are not supported
- assert(!MFI->hasVarSizedObjects());
-
- if (MFI->getNumObjects() == 0) {
- return -1;
- }
-
- Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
-
- return getIndirectIndexBegin(MF) + Offset;
-}
-
-std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
+void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
const AMDGPUFrameLowering *TFL =
static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
- std::vector<unsigned> Regs;
unsigned StackWidth = TFL->getStackWidth(MF);
int End = getIndirectIndexEnd(MF);
- if (End == -1) {
- return Regs;
- }
+ if (End == -1)
+ return;
for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
- Regs.push_back(SuperReg);
+ Reserved.set(SuperReg);
for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
- Regs.push_back(Reg);
+ Reserved.set(Reg);
}
}
- return Regs;
}
unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
@@ -1031,13 +1086,8 @@ unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
return RegIndex;
}
-const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass(
- unsigned SourceReg) const {
- return &AMDGPU::R600_TReg32RegClass;
-}
-
-const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const {
- return &AMDGPU::TRegMemRegClass;
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
+ return &AMDGPU::R600_TReg32_XRegClass;
}
MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
@@ -1076,10 +1126,6 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
return Mov;
}
-const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const {
- return &AMDGPU::IndirectRegRegClass;
-}
-
unsigned R600InstrInfo::getMaxAlusPerClause() const {
return 115;
}
@@ -1197,6 +1243,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
AMDGPU::OpName::src1_sel,
};
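+  // Propagate the per-slot pred_sel operand of the vector instruction into
+  // the newly built slot instruction.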
+ MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+ getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
+ MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+ .setReg(MO.getReg());
+
for (unsigned i = 0; i < 14; i++) {
MachineOperand &MO = MI->getOperand(
getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
@@ -1217,6 +1268,12 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
return MovImm;
}
+MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, unsigned SrcReg) const {
+ return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
+}
+
int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
return getOperandIdx(MI.getOpcode(), Op);
}
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index cdaa2fb..13d9810 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -55,6 +55,8 @@ namespace llvm {
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const;
+ bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
bool isTrig(const MachineInstr &MI) const;
bool isPlaceHolderOpcode(unsigned opcode) const;
@@ -65,9 +67,18 @@ namespace llvm {
bool isALUInstr(unsigned Opcode) const;
bool hasInstrModifiers(unsigned Opcode) const;
bool isLDSInstr(unsigned Opcode) const;
+ bool isLDSNoRetInstr(unsigned Opcode) const;
+ bool isLDSRetInstr(unsigned Opcode) const;
+
+  /// \returns true if \p MI is an ALU instruction or an instruction that
+  /// will be lowered by the ExpandSpecialInstrs pass.
+ bool canBeConsideredALU(const MachineInstr *MI) const;
bool isTransOnly(unsigned Opcode) const;
bool isTransOnly(const MachineInstr *MI) const;
+ bool isVectorOnly(unsigned Opcode) const;
+ bool isVectorOnly(const MachineInstr *MI) const;
+ bool isExport(unsigned Opcode) const;
bool usesVertexCache(unsigned Opcode) const;
bool usesVertexCache(const MachineInstr *MI) const;
@@ -75,6 +86,9 @@ namespace llvm {
bool usesTextureCache(const MachineInstr *MI) const;
bool mustBeLastInClause(unsigned Opcode) const;
+ bool usesAddressRegister(MachineInstr *MI) const;
+ bool definesAddressRegister(MachineInstr *MI) const;
+ bool readsLDSSrcReg(const MachineInstr *MI) const;
/// \returns The operand index for the given source number. Legal values
/// for SrcNum are 0, 1, and 2.
@@ -128,9 +142,6 @@ namespace llvm {
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
- virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
- int64_t Imm) const;
-
virtual unsigned getIEQOpcode() const;
virtual bool isMov(unsigned Opcode) const;
@@ -177,6 +188,8 @@ namespace llvm {
bool PredicateInstruction(MachineInstr *MI,
const SmallVectorImpl<MachineOperand> &Pred) const;
+ unsigned int getPredicationCost(const MachineInstr *) const;
+
unsigned int getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
unsigned *PredCost = 0) const;
@@ -184,22 +197,14 @@ namespace llvm {
virtual int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const { return 1;}
- /// \returns a list of all the registers that may be accesed using indirect
- /// addressing.
- std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
-
- virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
-
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
-
+  /// \brief Reserve the registers that may be accessed using indirect addressing.
+ void reserveIndirectRegisters(BitVector &Reserved,
+ const MachineFunction &MF) const;
virtual unsigned calculateIndirectAddress(unsigned RegIndex,
unsigned Channel) const;
- virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
- unsigned SourceReg) const;
-
- virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+ virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
@@ -211,8 +216,6 @@ namespace llvm {
unsigned ValueReg, unsigned Address,
unsigned OffsetReg) const;
- virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
-
unsigned getMaxAlusPerClause() const;
///buildDefaultInstruction - This function returns a MachineInstr with
@@ -239,6 +242,10 @@ namespace llvm {
unsigned DstReg,
uint64_t Imm) const;
+ MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, unsigned SrcReg) const;
+
/// \brief Get the index of Op in the MachineInstr.
///
/// \returns -1 if the Instruction does not contain the specified \p Op.
@@ -272,6 +279,12 @@ namespace llvm {
void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
};
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
} // End llvm namespace
#endif // R600INSTRINFO_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 7e61b18..0346e24 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -75,7 +75,6 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
-def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
@@ -230,7 +229,7 @@ def TEX_RECT : PatLeaf<
def TEX_ARRAY : PatLeaf<
(imm),
[{uint32_t TType = (uint32_t)N->getZExtValue();
- return TType == 9 || TType == 10 || TType == 15 || TType == 16;
+ return TType == 9 || TType == 10 || TType == 16;
}]
>;
@@ -241,12 +240,26 @@ def TEX_SHADOW_ARRAY : PatLeaf<
}]
>;
-class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> mask, dag outs,
- dag ins, string asm, list<dag> pattern> :
+def TEX_MSAA : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 14;
+ }]
+>;
+
+def TEX_ARRAY_MSAA : PatLeaf<
+ (imm),
+ [{uint32_t TType = (uint32_t)N->getZExtValue();
+ return TType == 15;
+ }]
+>;
+
+class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
+ dag outs, dag ins, string asm, list<dag> pattern> :
InstR600ISA <outs, ins, asm, pattern>,
CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF {
- let rat_id = 0;
+ let rat_id = ratid;
let rat_inst = ratinst;
let rim = 0;
// XXX: Have a separate instruction for non-indexed writes.
@@ -264,6 +277,7 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> mask, dag outs,
let Inst{31-0} = Word0;
let Inst{63-32} = Word1;
+ let IsExport = 1;
}
@@ -403,7 +417,7 @@ def INTERP_VEC_LOAD : AMDGPUShaderInst <
(outs R600_Reg128:$dst),
(ins i32imm:$src0),
"INTERP_LOAD $src0 : $dst",
- []>;
+ [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
let bank_swizzle = 5;
@@ -537,6 +551,7 @@ class ExportSwzInst : InstR600ISA<(
let elem_size = 3;
let Inst{31-0} = Word0;
let Inst{63-32} = Word1;
+ let IsExport = 1;
}
} // End usesCustomInserter = 1
@@ -550,6 +565,7 @@ class ExportBufInst : InstR600ISA<(
let elem_size = 0;
let Inst{31-0} = Word0;
let Inst{63-32} = Word1;
+ let IsExport = 1;
}
//===----------------------------------------------------------------------===//
@@ -573,6 +589,7 @@ i32imm:$COUNT, i32imm:$Enabled),
let ALT_CONST = 0;
let WHOLE_QUAD_MODE = 0;
let BARRIER = 1;
+ let UseNamedOperandTable = 1;
let Inst{31-0} = Word0;
let Inst{63-32} = Word1;
@@ -672,42 +689,42 @@ def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
def SETE : R600_2OP <
0x08, "SETE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_EQ))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
>;
def SGT : R600_2OP <
0x09, "SETGT",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GT))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
>;
def SGE : R600_2OP <
0xA, "SETGE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
>;
def SNE : R600_2OP <
0xB, "SETNE",
- [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_NE))]
+ [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
>;
def SETE_DX10 : R600_2OP <
0xC, "SETE_DX10",
- [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_EQ))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
>;
def SETGT_DX10 : R600_2OP <
0xD, "SETGT_DX10",
- [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GT))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
>;
def SETGE_DX10 : R600_2OP <
0xE, "SETGE_DX10",
- [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GE))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
>;
def SETNE_DX10 : R600_2OP <
0xF, "SETNE_DX10",
- [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_NE))]
+ [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
>;
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -805,12 +822,12 @@ def CNDE_INT : R600_3OP <
def CNDGE_INT : R600_3OP <
0x1E, "CNDGE_INT",
- [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GE))]
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
>;
def CNDGT_INT : R600_3OP <
0x1D, "CNDGT_INT",
- [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GT))]
+ [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
>;
//===----------------------------------------------------------------------===//
@@ -863,6 +880,9 @@ def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
+ let INST_MOD = 1;
+}
def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
@@ -881,6 +901,7 @@ defm : TexPattern<6, TEX_LD, v4i32>;
defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
+defm : TexPattern<10, TEX_LDPTR, v4i32>;
//===----------------------------------------------------------------------===//
// Helper classes for common instructions
@@ -903,18 +924,22 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
class CNDE_Common <bits<5> inst> : R600_3OP <
inst, "CNDE",
- [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_EQ))]
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
>;
class CNDGT_Common <bits<5> inst> : R600_3OP <
inst, "CNDGT",
- [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GT))]
->;
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+> {
+ let Itinerary = VecALU;
+}
class CNDGE_Common <bits<5> inst> : R600_3OP <
inst, "CNDGE",
- [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
->;
+ [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+> {
+ let Itinerary = VecALU;
+}
let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
@@ -984,35 +1009,30 @@ multiclass CUBE_Common <bits<11> inst> {
class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
inst, "EXP_IEEE", fexp2
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "FLT_TO_INT", fp_to_sint
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "INT_TO_FLT", sint_to_fp
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "FLT_TO_UINT", fp_to_uint
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "UINT_TO_FLT", uint_to_fp
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
@@ -1023,7 +1043,6 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
inst, "LOG_IEEE", flog2
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
@@ -1033,75 +1052,68 @@ class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
inst, "MULHI_INT", mulhs
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
inst, "MULHI", mulhu
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
inst, "MULLO_INT", mul
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
inst, "RECIP_CLAMPED", []
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
inst, "RECIP_UINT", AMDGPUurecip
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
inst, "RECIPSQRT_IEEE", []
> {
- let TransOnly = 1;
let Itinerary = TransALU;
}
class SIN_Common <bits<11> inst> : R600_1OP <
inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
let Trig = 1;
- let TransOnly = 1;
let Itinerary = TransALU;
}
class COS_Common <bits<11> inst> : R600_1OP <
inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
let Trig = 1;
- let TransOnly = 1;
let Itinerary = TransALU;
}
+def CLAMP_R600 : CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
//===----------------------------------------------------------------------===//
// Helper patterns for complex intrinsics
//===----------------------------------------------------------------------===//
@@ -1124,6 +1136,13 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
(exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
>;
+// FROUND pattern
+class FROUNDPat<Instruction CNDGE> : Pat <
+ (AMDGPUround f32:$x),
+ (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+>;
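// Informal reading of the pattern above, using the CNDGE selectcc semantics
// from CNDGE_Common (select the second operand when the first is >= 0):
//   round(x) = (fract(x) - 0.5 >= 0) ? ceil(x) : floor(x)
// e.g. x = 2.7: 0.7 - 0.5 >= 0, so CEIL gives 3.0;
//      x = 2.2: 0.2 - 0.5 <  0, so FLOOR gives 2.0.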
+
+
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1165,11 +1184,12 @@ let Predicates = [isR600] in {
def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+ def : FROUNDPat <CNDGE_r600>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
let Word1{21} = eop;
- let Word1{22} = 1; // VALID_PIXEL_MODE
+ let Word1{22} = 0; // VALID_PIXEL_MODE
let Word1{30-23} = inst;
let Word1{31} = 1; // BARRIER
}
@@ -1178,7 +1198,7 @@ let Predicates = [isR600] in {
def R600_ExportBuf : ExportBufInst {
let Word1{20-17} = 0; // BURST_COUNT
let Word1{21} = eop;
- let Word1{22} = 1; // VALID_PIXEL_MODE
+ let Word1{22} = 0; // VALID_PIXEL_MODE
let Word1{30-23} = inst;
let Word1{31} = 1; // BARRIER
}
@@ -1247,6 +1267,33 @@ let Predicates = [isR700] in {
}
//===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+ string name, list<dag> pattern>
+ : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+ "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+ list<dag> pattern>
+ : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+ "MEM_RAT "#name, pattern>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ "MSKOR $rw_gpr.XW, $index_gpr",
+ [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+ let eop = 0;
+}
+
+} // End Predicates = [isEGorCayman]
+
+
+//===----------------------------------------------------------------------===//
// Evergreen Only instructions
//===----------------------------------------------------------------------===//
@@ -1274,36 +1321,32 @@ def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
//===----------------------------------------------------------------------===//
// Memory read/write instructions
//===----------------------------------------------------------------------===//
-let usesCustomInserter = 1 in {
-class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> mask, string name,
- list<dag> pattern>
- : EG_CF_RAT <0x57, 0x2, mask, (outs), ins, name, pattern> {
-}
-
-} // End usesCustomInserter = 1
+let usesCustomInserter = 1 in {
// 32-bit store
-def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
(ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- 0x1, "RAT_WRITE_CACHELESS_32_eg $rw_gpr, $index_gpr, $eop",
+ "STORE_RAW $rw_gpr, $index_gpr, $eop",
[(global_store i32:$rw_gpr, i32:$index_gpr)]
>;
// 64-bit store
-def RAT_WRITE_CACHELESS_64_eg : RAT_WRITE_CACHELESS_eg <
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
(ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- 0x3, "RAT_WRITE_CACHELESS_64_eg $rw_gpr.XY, $index_gpr, $eop",
+ "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
[(global_store v2i32:$rw_gpr, i32:$index_gpr)]
>;
//128-bit store
-def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
- 0xf, "RAT_WRITE_CACHELESS_128 $rw_gpr.XYZW, $index_gpr, $eop",
+ "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
[(global_store v4i32:$rw_gpr, i32:$index_gpr)]
>;
+} // End usesCustomInserter = 1
+
class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
: VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
@@ -1508,7 +1551,6 @@ let hasSideEffects = 1 in {
def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
let Pattern = [];
- let TransOnly = 0;
let Itinerary = AnyALU;
}
@@ -1600,29 +1642,83 @@ class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
let DisableEncoding = "$dst";
}
-class R600_LDS_1A1D <bits<6> lds_op, string name, list<dag> pattern> :
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+ string dst =""> :
R600_LDS <
- lds_op,
- (outs),
+ lds_op, outs,
(ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
LAST:$last, R600_Pred:$pred_sel,
BANK_SWIZZLE:$bank_swizzle),
- " "#name#" $last $src0$src0_rel, $src1$src1_rel, $pred_sel",
+ " "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
pattern
> {
+ field string BaseOp;
+
let src2 = 0;
let src2_rel = 0;
let LDS_1A1D = 1;
}
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+ let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+ let BaseOp = name;
+ let usesCustomInserter = 1;
+ let DisableEncoding = "$dst";
+ let Defs = [OQAP];
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+ R600_LDS <
+ lds_op,
+ (outs),
+ (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+ R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+ R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+ LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+ " "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+ pattern> {
+ let LDS_1A2D = 1;
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+ [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+ [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+ [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+ [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+ [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
>;
-
-def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
- [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+ [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+ [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+ [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+ [(set i32:$dst, (az_extloadi16_local i32:$src0))]
>;
// TRUNC is used for the FLT_TO_INT instructions to work around a
@@ -1642,9 +1738,11 @@ def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
// SHA-256 Patterns
def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+ def : FROUNDPat <CNDGE_eg>;
+
def EG_ExportSwz : ExportSwzInst {
let Word1{19-16} = 0; // BURST_COUNT
- let Word1{20} = 1; // VALID_PIXEL_MODE
+ let Word1{20} = 0; // VALID_PIXEL_MODE
let Word1{21} = eop;
let Word1{29-22} = inst;
let Word1{30} = 0; // MARK
@@ -1654,7 +1752,7 @@ def LDS_WRITE : R600_LDS_1A1D <0xD, "LDS_WRITE",
def EG_ExportBuf : ExportBufInst {
let Word1{19-16} = 0; // BURST_COUNT
- let Word1{20} = 1; // VALID_PIXEL_MODE
+ let Word1{20} = 0; // VALID_PIXEL_MODE
let Word1{21} = eop;
let Word1{29-22} = inst;
let Word1{30} = 0; // MARK
@@ -1771,23 +1869,17 @@ def : Pat <
def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
-class RAT_STORE_DWORD_cm <bits<4> mask, dag ins, list<dag> pat> : EG_CF_RAT <
- 0x57, 0x14, mask, (outs), ins,
- "EXPORT_RAT_INST_STORE_DWORD $rw_gpr, $index_gpr", pat
-> {
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
+ CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+ (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+ "STORE_DWORD $rw_gpr, $index_gpr",
+ [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
let eop = 0; // This bit is not used on Cayman.
}
-def RAT_STORE_DWORD32_cm : RAT_STORE_DWORD_cm <0x1,
- (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr),
- [(global_store i32:$rw_gpr, i32:$index_gpr)]
->;
-
-def RAT_STORE_DWORD64_cm : RAT_STORE_DWORD_cm <0x3,
- (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr),
- [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
->;
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
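// The mask parameter appears to act as a per-channel write mask on the RAT
// store path: 0x1 writes .X only (i32), 0x3 writes .XY (v2i32), and 0xf
// writes .XYZW (v4i32), mirroring the Evergreen STORE_RAW definitions above.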
class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
: VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
@@ -2012,10 +2104,6 @@ def TXD_SHADOW: InstR600 <
} // End isPseudo = 1
} // End usesCustomInserter = 1
-def CLAMP_R600 : CLAMP <R600_Reg32>;
-def FABS_R600 : FABS<R600_Reg32>;
-def FNEG_R600 : FNEG<R600_Reg32>;
-
//===---------------------------------------------------------------------===//
// Return instruction
//===---------------------------------------------------------------------===//
@@ -2164,7 +2252,7 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
def BRANCH : ILFormat<(outs), (ins brtarget:$target),
"; Pseudo unconditional branch instruction",
[(br bb:$target)]>;
- defm BRANCH_COND : BranchConditional<IL_brcond>;
+ defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
}
//===---------------------------------------------------------------------===//
@@ -2235,7 +2323,7 @@ def : CND_INT_f32 <CNDGE_INT, SETGE>;
//CNDGE_INT extra pattern
def : Pat <
- (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_GT),
+ (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
(CNDGE_INT $src0, $src1, $src2)
>;
@@ -2250,86 +2338,6 @@ def KIL : Pat <
(MASK_WRITE (KILLGT (f32 ZERO), $src0))
>;
-// SGT Reverse args
-def : Pat <
- (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LT),
- (SGT $src1, $src0)
->;
-
-// SGE Reverse args
-def : Pat <
- (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LE),
- (SGE $src1, $src0)
->;
-
-// SETGT_DX10 reverse args
-def : Pat <
- (selectcc f32:$src0, f32:$src1, -1, 0, COND_LT),
- (SETGT_DX10 $src1, $src0)
->;
-
-// SETGE_DX10 reverse args
-def : Pat <
- (selectcc f32:$src0, f32:$src1, -1, 0, COND_LE),
- (SETGE_DX10 $src1, $src0)
->;
-
-// SETGT_INT reverse args
-def : Pat <
- (selectcc i32:$src0, i32:$src1, -1, 0, SETLT),
- (SETGT_INT $src1, $src0)
->;
-
-// SETGE_INT reverse args
-def : Pat <
- (selectcc i32:$src0, i32:$src1, -1, 0, SETLE),
- (SETGE_INT $src1, $src0)
->;
-
-// SETGT_UINT reverse args
-def : Pat <
- (selectcc i32:$src0, i32:$src1, -1, 0, SETULT),
- (SETGT_UINT $src1, $src0)
->;
-
-// SETGE_UINT reverse args
-def : Pat <
- (selectcc i32:$src0, i32:$src1, -1, 0, SETULE),
- (SETGE_UINT $src1, $src0)
->;
-
-// The next two patterns are special cases for handling 'true if ordered' and
-// 'true if unordered' conditionals. The assumption here is that the behavior of
-// SETE and SNE conforms to the Direct3D 10 rules for floating point values
-// described here:
-// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
-// We assume that SETE returns false when one of the operands is NAN and
-// SNE returns true when one of the operands is NAN
-
-//SETE - 'true if ordered'
-def : Pat <
- (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETO),
- (SETE $src0, $src1)
->;
-
-//SETE_DX10 - 'true if ordered'
-def : Pat <
- (selectcc f32:$src0, f32:$src1, -1, 0, SETO),
- (SETE_DX10 $src0, $src1)
->;
-
-//SNE - 'true if unordered'
-def : Pat <
- (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETUO),
- (SNE $src0, $src1)
->;
-
-//SETNE_DX10 - 'true if unordered'
-def : Pat <
- (selectcc f32:$src0, f32:$src1, -1, 0, SETUO),
- (SETNE_DX10 $src0, $src1)
->;
-
def : Extract_Element <f32, v4f32, 0, sub0>;
def : Extract_Element <f32, v4f32, 1, sub1>;
def : Extract_Element <f32, v4f32, 2, sub2>;
@@ -2378,3 +2386,11 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>;
def : DwordAddrPat <i32, R600_Reg32>;
} // End isR600toCayman Predicate
+
+def getLDSNoRetOp : InstrMapping {
+ let FilterClass = "R600_LDS_1A1D";
+ let RowFields = ["BaseOp"];
+ let ColFields = ["DisableEncoding"];
+ let KeyCol = ["$dst"];
+ let ValueCols = [[""""]];
+}
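// For reference: an InstrMapping like this typically makes TableGen emit a
// lookup named after the def (e.g. int AMDGPU::getLDSNoRetOp(uint16_t Opcode))
// that groups rows by BaseOp and maps the variant whose DisableEncoding is
// "$dst" (the *_RET form) to the variant with no disabled operands, e.g.
// LDS_ADD_RET -> LDS_ADD, LDS_SUB_RET -> LDS_SUB.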
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
index 58d86b6..9681747 100644
--- a/lib/Target/R600/R600Intrinsics.td
+++ b/lib/Target/R600/R600Intrinsics.td
@@ -43,6 +43,12 @@ let TargetPrefix = "R600", isTarget = 1 in {
Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
def int_R600_interp_input :
Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_R600_interp_const :
+ Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_R600_interp_xy :
+ Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+def int_R600_interp_zw :
+ Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_R600_load_texbuf :
Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
def int_R600_tex : TextureIntrinsicFloatInput;
@@ -52,6 +58,7 @@ let TargetPrefix = "R600", isTarget = 1 in {
def int_R600_txb : TextureIntrinsicFloatInput;
def int_R600_txbc : TextureIntrinsicFloatInput;
def int_R600_txf : TextureIntrinsicInt32Input;
+ def int_R600_ldptr : TextureIntrinsicInt32Input;
def int_R600_txq : TextureIntrinsicInt32Input;
def int_R600_ddx : TextureIntrinsicFloatInput;
def int_R600_ddy : TextureIntrinsicFloatInput;
diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp
index 018b403..01105c6 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.cpp
+++ b/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -12,7 +12,9 @@
using namespace llvm;
-R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
- : AMDGPUMachineFunction(MF) { }
+// Pin the vtable to this file.
+void R600MachineFunctionInfo::anchor() {}
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+ : AMDGPUMachineFunction(MF) { }
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index f23d9b7..c1bec0a 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -21,6 +21,7 @@
namespace llvm {
class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+ virtual void anchor();
public:
R600MachineFunctionInfo(const MachineFunction &MF);
SmallVector<unsigned, 4> LiveOuts;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 0dc0365..da2a4d8 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -9,7 +9,6 @@
//
/// \file
/// \brief R600 Machine Scheduler interface
-// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
//
//===----------------------------------------------------------------------===//
@@ -29,6 +28,7 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
DAG = dag;
TII = static_cast<const R600InstrInfo*>(DAG->TII);
TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+ VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
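  // Presumably: non-Cayman parts use the 5-wide VLIW encoding with a fifth
  // "Trans" slot, while Cayman is 4-wide, hence the !hasCaymanISA() test above.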
MRI = &DAG->MRI;
CurInstKind = IDOther;
CurEmitted = 0;
@@ -92,15 +92,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
AllowSwitchFromAlu = true;
}
-
- // We want to scheduled AR defs as soon as possible to make sure they aren't
- // put in a different ALU clause from their uses.
- if (!SU && !UnscheduledARDefs.empty()) {
- SU = UnscheduledARDefs[0];
- UnscheduledARDefs.erase(UnscheduledARDefs.begin());
- NextInstKind = IDAlu;
- }
-
if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
(!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
// try to pick ALU
@@ -130,15 +121,6 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
NextInstKind = IDOther;
}
- // We want to schedule the AR uses as late as possible to make sure that
- // the AR defs have been released.
- if (!SU && !UnscheduledARUses.empty()) {
- SU = UnscheduledARUses[0];
- UnscheduledARUses.erase(UnscheduledARUses.begin());
- NextInstKind = IDAlu;
- }
-
-
DEBUG(
if (SU) {
dbgs() << " ** Pick node **\n";
@@ -217,20 +199,6 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
int IK = getInstKind(SU);
- // Check for AR register defines
- for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(),
- E = SU->getInstr()->operands_end();
- I != E; ++I) {
- if (I->isReg() && I->getReg() == AMDGPU::AR_X) {
- if (I->isDef()) {
- UnscheduledARDefs.push_back(SU);
- } else {
- UnscheduledARUses.push_back(SU);
- }
- return;
- }
- }
-
// There is no export clause, we can schedule one as soon as its ready
if (IK == IDOther)
Available[IDOther].push_back(SU);
@@ -314,6 +282,10 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
return AluT_XYZW;
+ // LDS src registers cannot be used in the Trans slot.
+ if (TII->readsLDSSrcReg(MI))
+ return AluT_XYZW;
+
return AluAny;
}
@@ -342,14 +314,16 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
}
}
-SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
if (Q.empty())
return NULL;
for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
It != E; ++It) {
SUnit *SU = *It;
InstructionsGroupCandidate.push_back(SU->getInstr());
- if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)) {
+ if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
+ && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
+ ) {
InstructionsGroupCandidate.pop_back();
Q.erase((It + 1).base());
return SU;
@@ -373,6 +347,8 @@ void R600SchedStrategy::PrepareNextSlot() {
DEBUG(dbgs() << "New Slot\n");
assert (OccupedSlotsMask && "Slot wasn't filled");
OccupedSlotsMask = 0;
+// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+// OccupedSlotsMask |= 16;
InstructionsGroupCandidate.clear();
LoadAlu();
}
@@ -409,12 +385,12 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
}
}
-SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
- SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
+ SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
if (SlotedSU)
return SlotedSU;
- SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+ SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
if (UnslotedSU)
AssignSlot(UnslotedSU->getInstr(), Slot);
return UnslotedSU;
@@ -434,30 +410,35 @@ SUnit* R600SchedStrategy::pickAlu() {
// Bottom up scheduling : predX must comes first
if (!AvailableAlus[AluPredX].empty()) {
OccupedSlotsMask |= 31;
- return PopInst(AvailableAlus[AluPredX]);
+ return PopInst(AvailableAlus[AluPredX], false);
}
// Flush physical reg copies (RA will discard them)
if (!AvailableAlus[AluDiscarded].empty()) {
OccupedSlotsMask |= 31;
- return PopInst(AvailableAlus[AluDiscarded]);
+ return PopInst(AvailableAlus[AluDiscarded], false);
}
// If there is a T_XYZW alu available, use it
if (!AvailableAlus[AluT_XYZW].empty()) {
OccupedSlotsMask |= 15;
- return PopInst(AvailableAlus[AluT_XYZW]);
+ return PopInst(AvailableAlus[AluT_XYZW], false);
}
}
bool TransSlotOccuped = OccupedSlotsMask & 16;
- if (!TransSlotOccuped) {
+ if (!TransSlotOccuped && VLIW5) {
if (!AvailableAlus[AluTrans].empty()) {
OccupedSlotsMask |= 16;
- return PopInst(AvailableAlus[AluTrans]);
+ return PopInst(AvailableAlus[AluTrans], false);
+ }
+ SUnit *SU = AttemptFillSlot(3, true);
+ if (SU) {
+ OccupedSlotsMask |= 16;
+ return SU;
}
}
for (int Chan = 3; Chan > -1; --Chan) {
bool isOccupied = OccupedSlotsMask & (1 << Chan);
if (!isOccupied) {
- SUnit *SU = AttemptFillSlot(Chan);
+ SUnit *SU = AttemptFillSlot(Chan, false);
if (SU) {
OccupedSlotsMask |= (1 << Chan);
InstructionsGroupCandidate.push_back(SU->getInstr());
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index f8965d8..97c8cde 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -53,8 +53,6 @@ class R600SchedStrategy : public MachineSchedStrategy {
std::vector<SUnit *> Available[IDLast], Pending[IDLast];
std::vector<SUnit *> AvailableAlus[AluLast];
- std::vector<SUnit *> UnscheduledARDefs;
- std::vector<SUnit *> UnscheduledARUses;
std::vector<SUnit *> PhysicalRegCopy;
InstKind CurInstKind;
@@ -84,15 +82,16 @@ public:
private:
std::vector<MachineInstr *> InstructionsGroupCandidate;
+ bool VLIW5;
int getInstKind(SUnit *SU);
bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
AluKind getAluKind(SUnit *SU) const;
void LoadAlu();
unsigned AvailablesAluCount() const;
- SUnit *AttemptFillSlot (unsigned Slot);
+ SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu);
void PrepareNextSlot();
- SUnit *PopInst(std::vector<SUnit*> &Q);
+ SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU);
void AssignSlot(MachineInstr *MI, unsigned Slot);
SUnit* pickAlu();
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index acacffa..cf719c0 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -50,6 +50,9 @@ isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
E = MRI.def_end(); It != E; ++It) {
return (*It).isImplicitDef();
}
+ if (MRI.isReserved(Reg)) {
+ return false;
+ }
llvm_unreachable("Reg without a def");
return false;
}
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index 5cf1fd3..cd9b6ea 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -58,6 +58,8 @@ class R600PacketizerList : public VLIWPacketizerList {
private:
const R600InstrInfo *TII;
const R600RegisterInfo &TRI;
+ bool VLIW5;
+ bool ConsideredInstUsesAlreadyWrittenVectorElement;
unsigned getSlot(const MachineInstr *MI) const {
return TRI.getHWRegChan(MI->getOperand(0).getReg());
@@ -74,7 +76,13 @@ private:
MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
if (I->isBundle())
BI++;
+ int LastDstChan = -1;
do {
+ bool isTrans = false;
+ int BISlot = getSlot(BI);
+ if (LastDstChan >= BISlot)
+ isTrans = true;
+ LastDstChan = BISlot;
if (TII->isPredicated(BI))
continue;
int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
@@ -85,7 +93,7 @@ private:
continue;
}
unsigned Dst = BI->getOperand(DstIdx).getReg();
- if (TII->isTransOnly(BI)) {
+ if (isTrans || TII->isTransOnly(BI)) {
Result[Dst] = AMDGPU::PS;
continue;
}
@@ -142,10 +150,14 @@ public:
MachineDominatorTree &MDT)
: VLIWPacketizerList(MF, MLI, MDT, true),
TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
- TRI(TII->getRegisterInfo()) { }
+ TRI(TII->getRegisterInfo()) {
+ VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+ }
// initPacketizerState - initialize some internal flags.
- void initPacketizerState() { }
+ void initPacketizerState() {
+ ConsideredInstUsesAlreadyWrittenVectorElement = false;
+ }
// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
@@ -172,8 +184,8 @@ public:
// together.
bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
- if (getSlot(MII) <= getSlot(MIJ) && !TII->isTransOnly(MII))
- return false;
+ if (getSlot(MII) == getSlot(MIJ))
+ ConsideredInstUsesAlreadyWrittenVectorElement = true;
// Does MII and MIJ share the same pred_sel ?
int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
@@ -194,6 +206,14 @@ public:
return false;
}
}
+
+ bool ARDef = TII->definesAddressRegister(MII) ||
+ TII->definesAddressRegister(MIJ);
+ bool ARUse = TII->usesAddressRegister(MII) ||
+ TII->usesAddressRegister(MIJ);
+ if (ARDef && ARUse)
+ return false;
+
return true;
}
@@ -211,6 +231,20 @@ public:
std::vector<R600InstrInfo::BankSwizzle> &BS,
bool &isTransSlot) {
isTransSlot = TII->isTransOnly(MI);
+ assert (!isTransSlot || VLIW5);
+
+ // Is the dst reg sequence legal ?
+ if (!isTransSlot && !CurrentPacketMIs.empty()) {
+ if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
+ if (ConsideredInstUsesAlreadyWrittenVectorElement &&
+ !TII->isVectorOnly(MI) && VLIW5) {
+ isTransSlot = true;
+ DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
+ }
+ else
+ return false;
+ }
+ }
// Are the Constants limitations met ?
CurrentPacketMIs.push_back(MI);
@@ -246,6 +280,10 @@ public:
return false;
}
+  // We cannot read LDS source registers from the Trans slot.
+ if (isTransSlot && TII->readsLDSSrcReg(MI))
+ return false;
+
CurrentPacketMIs.pop_back();
return true;
}
@@ -278,6 +316,8 @@ public:
return It;
}
endPacket(MI->getParent(), MI);
+ if (TII->isTransOnly(MI))
+ return MI;
return VLIWPacketizerList::addToPacket(MI);
}
};
@@ -308,7 +348,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
MachineBasicBlock::iterator End = MBB->end();
MachineBasicBlock::iterator MI = MBB->begin();
while (MI != End) {
- if (MI->isKill() ||
+ if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
(MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
MachineBasicBlock::iterator DeleteMI = MI;
++MI;
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index a42043b..f3bb88b 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -28,6 +28,8 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
+ const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+
Reserved.set(AMDGPU::ZERO);
Reserved.set(AMDGPU::HALF);
Reserved.set(AMDGPU::ONE);
@@ -41,26 +43,15 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::PRED_SEL_OFF);
Reserved.set(AMDGPU::PRED_SEL_ZERO);
Reserved.set(AMDGPU::PRED_SEL_ONE);
+ Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
Reserved.set(*I);
}
- for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(),
- E = AMDGPU::TRegMemRegClass.end();
- I != E; ++I) {
- Reserved.set(*I);
- }
+ TII->reserveIndirectRegisters(Reserved, MF);
- const R600InstrInfo *RII =
- static_cast<const R600InstrInfo*>(TM.getInstrInfo());
- std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
- for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
- E = IndirectRegs.end();
- I != E; ++I) {
- Reserved.set(*I);
- }
return Reserved;
}
@@ -78,6 +69,10 @@ unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
+unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
+ return GET_REG_INDEX(getEncodingValue(Reg));
+}
+
const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
MVT VT) const {
switch(VT.SimpleTy) {
@@ -86,17 +81,20 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
}
}
-unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
- switch (Channel) {
- default: assert(!"Invalid channel index"); return 0;
- case 0: return AMDGPU::sub0;
- case 1: return AMDGPU::sub1;
- case 2: return AMDGPU::sub2;
- case 3: return AMDGPU::sub3;
- }
-}
-
const RegClassWeight &R600RegisterInfo::getRegClassWeight(
const TargetRegisterClass *RC) const {
return RCW;
}
+
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
+ assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+ switch (Reg) {
+ case AMDGPU::OQAP:
+ case AMDGPU::OQBP:
+ case AMDGPU::AR_X:
+ return false;
+ default:
+ return true;
+ }
+}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index 9b286ee..c74c49e 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -39,16 +39,16 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
+ virtual unsigned getHWRegIndex(unsigned Reg) const;
+
/// \brief get the register class of the specified type to use in the
/// CFGStructurizer
virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
- /// \returns the sub reg enum value for the given \p Channel
- /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
- unsigned getSubRegFromChannel(unsigned Channel) const;
-
virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+  // \returns true if \p Reg can be defined in one ALU clause and used in another.
+ virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index fa987cf..68bcd20 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -39,8 +39,6 @@ foreach Index = 0-127 in {
// Indirect addressing offset registers
def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
Index, Chan>;
- def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index,
- Chan>;
}
// 128-bit Temporary Registers
def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
@@ -95,6 +93,12 @@ foreach Index = 448-480 in {
// Special Registers
+def OQA : R600Reg<"OQA", 219>;
+def OQB : R600Reg<"OQB", 220>;
+def OQAP : R600Reg<"OQAP", 221>;
+def OQBP : R600Reg<"OQAP", 222>;
+def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
+def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
def ZERO : R600Reg<"0.0", 248>;
def ONE : R600Reg<"1.0", 249>;
def NEG_ONE : R600Reg<"-1.0", 249>;
@@ -115,7 +119,6 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
def AR_X : R600Reg<"AR.x", 0>;
-def OQAP : R600Reg<"OQAP", 221>;
def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
(add (sequence "ArrayBase%u", 448, 480))>;
@@ -130,7 +133,8 @@ let isAllocatable = 0 in {
// XXX: Only use the X channel, until we support wider stack widths
def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
-} // End isAllocatable = 0
+def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
+ (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
(add (sequence "KC0_%u_X", 128, 159))>;
@@ -164,6 +168,8 @@ def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
(interleave R600_KC1_X, R600_KC1_Y,
R600_KC1_Z, R600_KC1_W)>;
+} // End isAllocatable = 0
+
def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
(add (sequence "T%u_X", 0, 127), AR_X)>;
@@ -184,6 +190,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
R600_TReg32,
R600_ArrayBase,
R600_Addr,
+ R600_KC0, R600_KC1,
ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
ALU_CONST, ALU_PARAM, OQAP
)>;
@@ -201,33 +208,3 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
(add (sequence "T%u_XY", 0, 63))>;
-
-//===----------------------------------------------------------------------===//
-// Register classes for indirect addressing
-//===----------------------------------------------------------------------===//
-
-// Super register for all the Indirect Registers. This register class is used
-// by the REG_SEQUENCE instruction to specify the registers to use for direct
-// reads / writes which may be written / read by an indirect address.
-class IndirectSuper<string n, list<Register> subregs> :
- RegisterWithSubRegs<n, subregs> {
- let Namespace = "AMDGPU";
- let SubRegIndices =
- [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
- sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15];
-}
-
-def IndirectSuperReg : IndirectSuper<"Indirect",
- [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X,
- TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X,
- TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X]
->;
-
-def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>;
-
-// This register class defines the registers that are the storage units for
-// the "Indirect Addressing" pseudo memory space.
-// XXX: Only use the X channel, until we support wider stack widths
-def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32,
- (add (sequence "TRegMem%u_X", 0, 16))
->;
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
index 3768ba0..3258894 100644
--- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -35,9 +35,9 @@ class R600TextureIntrinsicsReplacer :
FunctionType *TexSign;
FunctionType *TexQSign;
- void getAdjustementFromTextureTarget(unsigned TextureType, bool hasLOD,
- unsigned SrcSelect[4], unsigned CT[4],
- bool &useShadowVariant) {
+ void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
+ unsigned SrcSelect[4], unsigned CT[4],
+ bool &useShadowVariant) {
enum TextureTypes {
TEXTURE_1D = 1,
TEXTURE_2D,
@@ -60,6 +60,7 @@ class R600TextureIntrinsicsReplacer :
switch (TextureType) {
case 0:
+ useShadowVariant = false;
return;
case TEXTURE_RECT:
case TEXTURE_1D:
@@ -93,9 +94,8 @@ class R600TextureIntrinsicsReplacer :
}
if (TextureType == TEXTURE_CUBE_ARRAY ||
- TextureType == TEXTURE_SHADOWCUBE_ARRAY) {
+ TextureType == TEXTURE_SHADOWCUBE_ARRAY)
CT[2] = 0;
- }
if (TextureType == TEXTURE_1D_ARRAY ||
TextureType == TEXTURE_SHADOW1D_ARRAY) {
@@ -114,9 +114,8 @@ class R600TextureIntrinsicsReplacer :
TextureType == TEXTURE_SHADOW2D ||
TextureType == TEXTURE_SHADOWRECT ||
TextureType == TEXTURE_SHADOW1D_ARRAY) &&
- !(hasLOD && useShadowVariant)) {
+ !(hasLOD && useShadowVariant))
SrcSelect[3] = 2;
- }
}
void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
@@ -174,8 +173,8 @@ class R600TextureIntrinsicsReplacer :
};
bool useShadowVariant;
- getAdjustementFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
- useShadowVariant);
+ getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
+ useShadowVariant);
ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
Offset, ResourceId, SamplerId, CT, Coord);
@@ -198,8 +197,8 @@ class R600TextureIntrinsicsReplacer :
};
bool useShadowVariant;
- getAdjustementFromTextureTarget(TextureType, false, SrcSelect, CT,
- useShadowVariant);
+ getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
+ useShadowVariant);
ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
Offset, ResourceId, SamplerId, CT, Coord);
@@ -259,6 +258,9 @@ public:
}
void visitCallInst(CallInst &I) {
+ if (!I.getCalledFunction())
+ return;
+
StringRef Name = I.getCalledFunction()->getName();
if (Name == "llvm.AMDGPU.tex") {
ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 147578c..2cbce28 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -11,6 +11,18 @@
#ifndef SIDEFINES_H_
#define SIDEFINES_H_
+namespace SIInstrFlags {
+enum {
+ MIMG = 1 << 3,
+ SMRD = 1 << 4,
+ VOP1 = 1 << 5,
+ VOP2 = 1 << 6,
+ VOP3 = 1 << 7,
+ VOPC = 1 << 8,
+ SALU = 1 << 9
+};
+}
+
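// These values presumably mirror TSFlags bits set by the SI instruction
// format classes, so callers would test them roughly like:
//   if (TII->get(MI.getOpcode()).TSFlags & SIInstrFlags::SMRD)
//     ; // treat MI as a scalar memory read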
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 435172a..3370c79 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -23,9 +23,9 @@
/// %vreg3 <vsrc> = COPY %vreg2 <vgpr>
/// BB2:
/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1>
-/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
///
-///
/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
/// code will look like this:
///
@@ -43,7 +43,7 @@
/// Now that the result of the PHI instruction is an SGPR, the register
/// allocator is now forced to constrain the register class of %vreg3 to
/// <sgpr> so we end up with final code like this:
-///
+///
/// BB0:
/// %vreg0 <sgpr> = SCALAR_INST
/// ...
@@ -55,7 +55,7 @@
/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
///
-/// Now this code contains an illegal copy from a VGPR to an SGPR.
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
///
/// In order to avoid this problem, this pass searches for PHI instructions
/// which define a <vsrc> register and constrains its definition class to
@@ -65,10 +65,14 @@
/// ultimately led to the creation of an illegal COPY.
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "sgpr-copies"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -79,9 +83,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
private:
static char ID;
- const TargetRegisterClass *inferRegClass(const TargetRegisterInfo *TRI,
+ const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI,
const MachineRegisterInfo &MRI,
- unsigned Reg) const;
+ unsigned Reg,
+ unsigned SubReg) const;
+ const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI,
+ const MachineRegisterInfo &MRI,
+ unsigned Reg,
+ unsigned SubReg) const;
+ bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo &MRI) const;
public:
SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
@@ -102,25 +113,41 @@ FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) {
return new SIFixSGPRCopies(tm);
}
-/// This functions walks the use/def chains starting with the definition of
-/// \p Reg until it finds an Instruction that isn't a COPY returns
-/// the register class of that instruction.
-const TargetRegisterClass *SIFixSGPRCopies::inferRegClass(
- const TargetRegisterInfo *TRI,
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+ continue;
+
+ if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+ return true;
+ }
+ return false;
+}
+
+/// This function walks the use list of \p Reg until it finds an instruction
+/// that isn't a COPY and returns the register class of that instruction.
+/// \return The register class inferred from the first non-COPY use of \p Reg.
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
+ const SIRegisterInfo *TRI,
const MachineRegisterInfo &MRI,
- unsigned Reg) const {
+ unsigned Reg,
+ unsigned SubReg) const {
// The Reg parameter to the function must always be defined by either a PHI
// or a COPY, therefore it cannot be a physical register.
assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
"Reg cannot be a physical register");
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ RC = TRI->getSubRegClass(RC, SubReg);
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
E = MRI.use_end(); I != E; ++I) {
switch (I->getOpcode()) {
case AMDGPU::COPY:
- RC = TRI->getCommonSubClass(RC, inferRegClass(TRI, MRI,
- I->getOperand(0).getReg()));
+ RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
+ I->getOperand(0).getReg(),
+ I->getOperand(0).getSubReg()));
break;
}
}
@@ -128,9 +155,48 @@ const TargetRegisterClass *SIFixSGPRCopies::inferRegClass(
return RC;
}
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef(
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo &MRI,
+ unsigned Reg,
+ unsigned SubReg) const {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg);
+ return TRI->getSubRegClass(RC, SubReg);
+ }
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (Def->getOpcode() != AMDGPU::COPY) {
+ return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg);
+ }
+
+ return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(),
+ Def->getOperand(1).getSubReg());
+}
+
+bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo &MRI) const {
+
+ unsigned DstReg = Copy.getOperand(0).getReg();
+ unsigned SrcReg = Copy.getOperand(1).getReg();
+ unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+ const TargetRegisterClass *SrcRC;
+
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ DstRC == &AMDGPU::M0RegRegClass)
+ return false;
+
+ SrcRC = inferRegClassFromDef(TRI, MRI, SrcReg, SrcSubReg);
+ return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
+}
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+ MF.getTarget().getRegisterInfo());
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+ MF.getTarget().getInstrInfo());
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -138,13 +204,58 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
MachineInstr &MI = *I;
- if (MI.getOpcode() != AMDGPU::PHI) {
- continue;
+ if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
+ DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
+ DEBUG(MI.print(dbgs()));
+ TII->moveToVALU(MI);
+
+ }
+
+ switch (MI.getOpcode()) {
+ default: continue;
+ case AMDGPU::PHI: {
+ DEBUG(dbgs() << " Fixing PHI:\n");
+ DEBUG(MI.print(dbgs()));
+
+ for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+ unsigned Reg = MI.getOperand(i).getReg();
+ const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
+ MI.getOperand(0).getSubReg());
+ MRI.constrainRegClass(Reg, RC);
+ }
+ unsigned Reg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
+ MI.getOperand(0).getSubReg());
+ if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
+ MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+ }
+
+ if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+ break;
+
+ // If a PHI node defines an SGPR and any of its operands are VGPRs,
+ // then we need to move it to the VALU.
+ for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+ unsigned Reg = MI.getOperand(i).getReg();
+ if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
+ TII->moveToVALU(MI);
+ break;
+ }
+ }
+
+ break;
+ }
+ case AMDGPU::REG_SEQUENCE: {
+ if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+ !hasVGPROperands(MI, TRI))
+ continue;
+
+ DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n");
+ DEBUG(MI.print(dbgs()));
+
+ TII->moveToVALU(MI);
+ break;
}
- unsigned Reg = MI.getOperand(0).getReg();
- const TargetRegisterClass *RC = inferRegClass(TRI, MRI, Reg);
- if (RC == &AMDGPU::VSrc_32RegClass) {
- MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
}
}
}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index c64027f..d5d2b68 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -34,18 +34,12 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
- addRegisterClass(MVT::v2i1, &AMDGPU::VReg_64RegClass);
- addRegisterClass(MVT::v4i1, &AMDGPU::VReg_128RegClass);
-
- addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
- addRegisterClass(MVT::v1i32, &AMDGPU::VSrc_32RegClass);
-
addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
@@ -62,6 +56,21 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
computeRegisterProperties();
+ // Condition Codes
+ setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+
+ setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
@@ -69,6 +78,32 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::ADD, MVT::i64, Legal);
setOperationAction(ISD::ADD, MVT::i32, Legal);
+ setOperationAction(ISD::ADDC, MVT::i32, Legal);
+ setOperationAction(ISD::ADDE, MVT::i32, Legal);
+
+ setOperationAction(ISD::BITCAST, MVT::i128, Legal);
+
+ // We need to custom lower vector stores from local memory
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v16i32, Custom);
+
+ // We need to custom lower loads/stores from private memory
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i128, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
@@ -78,14 +113,31 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+ setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
setTargetDAGCombine(ISD::SELECT_CC);
@@ -102,24 +154,28 @@ bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
bool *IsFast) const {
// XXX: This depends on the address space and also we may want to revisit
// the alignment values we specify in the DataLayout.
+ if (!VT.isSimple() || VT == MVT::Other)
+ return false;
return VT.bitsGT(MVT::i32);
}
+bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
+ return VT.bitsLE(MVT::i16);
+}
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT,
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDLoc DL, SDValue Chain,
unsigned Offset) const {
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
AMDGPUAS::CONSTANT_ADDRESS);
- EVT ArgVT = MVT::getIntegerVT(VT.getSizeInBits());
SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, MVT::i64));
- return DAG.getLoad(VT, DL, Chain, Ptr,
- MachinePointerInfo(UndefValue::get(PtrTy)),
- false, false, false, ArgVT.getSizeInBits() >> 3);
+ return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
+ MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
+ false, false, MemVT.getSizeInBits() >> 3);
}
@@ -146,7 +202,8 @@ SDValue SITargetLowering::LowerFormalArguments(
const ISD::InputArg &Arg = Ins[i];
// First check if it's a PS input addr
- if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
+ if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+ !Arg.Flags.isByVal()) {
assert((PSInputNum <= 15) && "Too many PS inputs!");
@@ -177,7 +234,7 @@ SDValue SITargetLowering::LowerFormalArguments(
NewArg.PartOffset += NewArg.VT.getStoreSize();
}
- } else {
+ } else if (Info->ShaderType != ShaderType::COMPUTE) {
Splits.push_back(Arg);
}
}
@@ -200,6 +257,11 @@ SDValue SITargetLowering::LowerFormalArguments(
MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
}
+ if (Info->ShaderType == ShaderType::COMPUTE) {
+ getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+ Splits);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
@@ -214,9 +276,11 @@ SDValue SITargetLowering::LowerFormalArguments(
EVT VT = VA.getLocVT();
if (VA.isMemLoc()) {
+ VT = Ins[i].VT;
+ EVT MemVT = Splits[i].VT;
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, DL, DAG.getRoot(),
+ SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
36 + VA.getLocMemOffset());
InVals.push_back(Arg);
continue;
@@ -320,6 +384,19 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
MI->eraseFromParent();
break;
}
+ case AMDGPU::SI_RegisterStorePseudo: {
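+    // Expand the pseudo into a real SI_RegisterStore: give it a fresh SReg_64
+    // def, copy the pseudo's operands across, then erase the pseudo.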
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
+ Reg);
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ MI->eraseFromParent();
+ }
}
return BB;
}
@@ -335,6 +412,24 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
return MVT::i32;
}
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+ VT = VT.getScalarType();
+
+ if (!VT.isSimple())
+ return false;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::f32:
+ return false; /* There is V_MAD_F32 for f32 */
+ case MVT::f64:
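+    // Unlike f32, f64 has no MAD instruction, so a fused multiply-add is the
+    // better choice.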
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
@@ -344,9 +439,27 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::LOAD: {
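+    // Vector loads from local or private memory are split into smaller pieces;
+    // the result is merged back with the original chain. Everything else is
+    // handled by LowerLOAD.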
+ LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+ if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ Op.getValueType().isVector()) {
+ SDValue MergedValues[2] = {
+ SplitVectorLoad(Op, DAG),
+ Load->getChain()
+ };
+ return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
+ } else {
+ return LowerLOAD(Op, DAG);
+ }
+ }
+
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::ANY_EXTEND: // Fall-through
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: {
@@ -359,23 +472,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (IntrinsicID) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case Intrinsic::r600_read_ngroups_x:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 0);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
case Intrinsic::r600_read_ngroups_y:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 4);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
case Intrinsic::r600_read_ngroups_z:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 8);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
case Intrinsic::r600_read_global_size_x:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 12);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
case Intrinsic::r600_read_global_size_y:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 16);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
case Intrinsic::r600_read_global_size_z:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 20);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
case Intrinsic::r600_read_local_size_x:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 24);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
case Intrinsic::r600_read_local_size_y:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 28);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
case Intrinsic::r600_read_local_size_z:
- return LowerParameter(DAG, VT, DL, DAG.getEntryNode(), 32);
+ return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
@@ -394,13 +507,102 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
AMDGPU::VGPR2, VT);
-
+ case AMDGPUIntrinsic::SI_load_const: {
+ SDValue Ops [] = {
+ ResourceDescriptorToi128(Op.getOperand(1), DAG),
+ Op.getOperand(2)
+ };
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+ VT.getSizeInBits() / 8, 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+ Op->getVTList(), Ops, 2, VT, MMO);
+ }
+ case AMDGPUIntrinsic::SI_sample:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+ case AMDGPUIntrinsic::SI_sampleb:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+ case AMDGPUIntrinsic::SI_sampled:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+ case AMDGPUIntrinsic::SI_samplel:
+ return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+ case AMDGPUIntrinsic::SI_vs_load_input:
+ return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+ ResourceDescriptorToi128(Op.getOperand(1), DAG),
+ Op.getOperand(2),
+ Op.getOperand(3));
}
}
+
+ case ISD::INTRINSIC_VOID:
+ SDValue Chain = Op.getOperand(0);
+ unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+ switch (IntrinsicID) {
+ case AMDGPUIntrinsic::SI_tbuffer_store: {
+ SDLoc DL(Op);
+ SDValue Ops [] = {
+ Chain,
+ ResourceDescriptorToi128(Op.getOperand(2), DAG),
+ Op.getOperand(3),
+ Op.getOperand(4),
+ Op.getOperand(5),
+ Op.getOperand(6),
+ Op.getOperand(7),
+ Op.getOperand(8),
+ Op.getOperand(9),
+ Op.getOperand(10),
+ Op.getOperand(11),
+ Op.getOperand(12),
+ Op.getOperand(13),
+ Op.getOperand(14)
+ };
+ EVT VT = Op.getOperand(3).getValueType();
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getSizeInBits() / 8, 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+ Op->getVTList(), Ops,
+ sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+ }
+ default:
+ break;
+ }
}
return SDValue();
}
+SDValue SITargetLowering::LowerADD(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Op.getValueType() != MVT::i64)
+ return SDValue();
+
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+
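+  // Split both operands into 32-bit halves, add the low halves with ADDC and
+  // the high halves with ADDE (consuming the carry), then reassemble the
+  // 64-bit result with BUILD_PAIR.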
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue One = DAG.getConstant(1, MVT::i32);
+
+ SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
+ SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
+
+ SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
+ SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);
+
+ SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
+
+ SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
+ SDValue Carry = AddLo.getValue(1);
+ SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);
+
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
+}
+
/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
@@ -495,6 +697,53 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+
+ if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ return SDValue();
+
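+  // Private (scratch) loads are lowered to REGISTER_LOAD: keep the low 32 bits
+  // of the pointer and convert the byte address into a dword index.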
+ SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+ DAG.getConstant(2, MVT::i32));
+
+ SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ Load->getChain(), Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+ SDValue MergedValues[2] = {
+ Ret,
+ Load->getChain()
+ };
+ return DAG.getMergeValues(MergedValues, 2, DL);
+
+}
+
+SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ if (Op.getValueType() == MVT::i128) {
+ return Op;
+ }
+
+ assert(Op.getOpcode() == ISD::UNDEF);
+
+ return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
+ DAG.getConstant(0, MVT::i64),
+ DAG.getConstant(0, MVT::i64));
+}
+
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+ const SDValue &Op,
+ SelectionDAG &DAG) const {
+ return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2),
+ ResourceDescriptorToi128(Op.getOperand(3), DAG),
+ Op.getOperand(4));
+}
+
SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -529,6 +778,56 @@ SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
}
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ EVT VT = Store->getMemoryVT();
+
+ SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+ if (Ret.getNode())
+ return Ret;
+
+ if (VT.isVector() && VT.getVectorNumElements() >= 8)
+ return SplitVectorStore(Op, DAG);
+
+ if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ return SDValue();
+
+ SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+ SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+ DAG.getConstant(2, MVT::i32));
+ SDValue Chain = Store->getChain();
+ SmallVector<SDValue, 8> Values;
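+  // Break the value into 32-bit pieces; each piece is written with a
+  // REGISTER_STORE at consecutive dword offsets from Ptr.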
+
+ if (VT == MVT::i64) {
+ for (unsigned i = 0; i < 2; ++i) {
+ Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ Store->getValue(), DAG.getConstant(i, MVT::i32)));
+ }
+ } else if (VT == MVT::i128) {
+ for (unsigned i = 0; i < 2; ++i) {
+ for (unsigned j = 0; j < 2; ++j) {
+ Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+ Store->getValue(), DAG.getConstant(i, MVT::i32)),
+ DAG.getConstant(j, MVT::i32)));
+ }
+ }
+ } else {
+ Values.push_back(Store->getValue());
+ }
+
+ for (unsigned i = 0; i < Values.size(); ++i) {
+ SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
+ Ptr, DAG.getConstant(i, MVT::i32));
+ Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+ Chain, Values[i], PartPtr,
+ DAG.getTargetConstant(0, MVT::i32));
+ }
+ return Chain;
+}
+
SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -555,7 +854,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::SELECT_CC: {
- N->dump();
ConstantSDNode *True, *False;
// i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
@@ -759,8 +1057,8 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
return;
}
- // This is a conservative aproach, it is possible that we can't determine
- // the correct register class and copy too often, but better save than sorry.
+  // This is a conservative approach. It is possible that we can't determine the
+ // correct register class and copy too often, but better safe than sorry.
SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
Operand.getValueType(), Operand, RC);
@@ -937,7 +1235,9 @@ static unsigned SubIdx2Lane(unsigned Idx) {
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
SDNode *Users[4] = { };
- unsigned Writemask = 0, Lane = 0;
+ unsigned Lane = 0;
+ unsigned OldDmask = Node->getConstantOperandVal(0);
+ unsigned NewDmask = 0;
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -948,29 +1248,42 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
return;
+ // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+ // Note that subregs are packed, i.e. Lane==0 is the first bit set
+ // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+ // set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+ // Set which texture component corresponds to the lane.
+ unsigned Comp;
+ for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+ assert(Dmask);
+ Comp = countTrailingZeros(Dmask);
+ Dmask &= ~(1 << Comp);
+ }
+
// Abort if we have more than one user per component
if (Users[Lane])
return;
Users[Lane] = *I;
- Writemask |= 1 << Lane;
+ NewDmask |= 1 << Comp;
}
- // Abort if all components are used
- if (Writemask == 0xf)
+ // Abort if there's no change
+ if (NewDmask == OldDmask)
return;
// Adjust the writemask in the node
std::vector<SDValue> Ops;
- Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+ Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
Ops.push_back(Node->getOperand(i));
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
// If we only got one lane, replace it with a copy
- if (Writemask == (1U << Lane)) {
+ // (if NewDmask has only one bit set...)
+ if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
SDLoc(), Users[Lane]->getValueType(0),
@@ -1001,9 +1314,11 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 /// \brief Fold the instructions after selecting them
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
Node = AdjustRegClass(Node, DAG);
- if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+ if (TII->isMIMG(Node->getMachineOpcode()))
adjustWritemask(Node, DAG);
return foldOperands(Node, DAG);
@@ -1013,7 +1328,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
SDNode *Node) const {
- if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+ if (!TII->isMIMG(MI->getOpcode()))
return;
unsigned VReg = MI->getOperand(0).getReg();
@@ -1030,6 +1347,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
case 3: RC = &AMDGPU::VReg_96RegClass; break;
}
+ unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+ MI->setDesc(TII->get(NewOpcode));
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
MRI.setRegClass(VReg, RC);
}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index b4202c4..9933ece 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -21,13 +21,19 @@
namespace llvm {
class SITargetLowering : public AMDGPUTargetLowering {
- SDValue LowerParameter(SelectionDAG &DAG, EVT VT, SDLoc DL,
+ SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
SDValue Chain, unsigned Offset) const;
+ SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
+ SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
bool foldImm(SDValue &Operand, int32_t &Immediate,
bool &ScalarSlotUsed) const;
const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
@@ -44,6 +50,7 @@ class SITargetLowering : public AMDGPUTargetLowering {
public:
SITargetLowering(TargetMachine &tm);
bool allowsUnalignedMemoryAccesses(EVT VT, bool *IsFast) const;
+ virtual bool shouldSplitVectorElementType(EVT VT) const;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -55,6 +62,7 @@ public:
MachineBasicBlock * BB) const;
virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
virtual MVT getScalarShiftAmountTy(EVT VT) const;
+ virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index ba202e3..7ef662e 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -134,14 +134,19 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
   // LGKM may use larger values
if (TSFlags & SIInstrFlags::LGKM_CNT) {
- MachineOperand &Op = MI.getOperand(0);
- if (!Op.isReg())
- Op = MI.getOperand(1);
- assert(Op.isReg() && "First LGKM operand must be a register!");
+ if (TII->isSMRD(MI.getOpcode())) {
- unsigned Reg = Op.getReg();
- unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
- Result.Named.LGKM = Size > 4 ? 2 : 1;
+ MachineOperand &Op = MI.getOperand(0);
+ assert(Op.isReg() && "First LGKM operand must be a register!");
+
+ unsigned Reg = Op.getReg();
+ unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+ Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+ } else {
+ // DS
+ Result.Named.LGKM = 1;
+ }
} else {
Result.Named.LGKM = 0;
@@ -181,7 +186,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
- if (!Op.isReg())
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
return std::make_pair(0, 0);
unsigned Reg = Op.getReg();
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index 434aa7e..53ebaaf 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,10 +17,24 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
field bits<1> LGKM_CNT = 0;
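+  // Instruction-format flags, mirrored into TSFlags below so that SIInstrInfo
+  // can classify opcodes (isMIMG, isSMRD, isVOP1/2/3/C, isSALUInstr).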
+ field bits<1> MIMG = 0;
+ field bits<1> SMRD = 0;
+ field bits<1> VOP1 = 0;
+ field bits<1> VOP2 = 0;
+ field bits<1> VOP3 = 0;
+ field bits<1> VOPC = 0;
+ field bits<1> SALU = 0;
let TSFlags{0} = VM_CNT;
let TSFlags{1} = EXP_CNT;
let TSFlags{2} = LGKM_CNT;
+ let TSFlags{3} = MIMG;
+ let TSFlags{4} = SMRD;
+ let TSFlags{5} = VOP1;
+ let TSFlags{6} = VOP2;
+ let TSFlags{7} = VOP3;
+ let TSFlags{8} = VOPC;
+ let TSFlags{9} = SALU;
}
class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -55,6 +69,7 @@ class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let SALU = 1;
}
class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -73,6 +88,7 @@ class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let SALU = 1;
}
class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -90,6 +106,7 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let SALU = 1;
}
class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -106,6 +123,7 @@ class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let SALU = 1;
}
class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
@@ -123,6 +141,7 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let SALU = 1;
}
class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
@@ -140,6 +159,7 @@ class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
let Inst{31-27} = 0x18; //encoding
let LGKM_CNT = 1;
+ let SMRD = 1;
}
//===----------------------------------------------------------------------===//
@@ -162,6 +182,8 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP1 = 1;
}
class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -180,6 +202,8 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP2 = 1;
}
class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -208,6 +232,8 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP3 = 1;
}
class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -234,6 +260,8 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let UseNamedOperandTable = 1;
+ let VOP3 = 1;
}
class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
@@ -251,6 +279,7 @@ class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
let mayLoad = 0;
let mayStore = 0;
let hasSideEffects = 0;
+ let VOPC = 1;
}
class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -414,6 +443,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
let VM_CNT = 1;
let EXP_CNT = 1;
+ let MIMG = 1;
}
def EXP : Enc64<
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 551ae86..ab55c1b 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -15,10 +15,10 @@
#include "SIInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIDefines.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCInstrDesc.h"
-#include <stdio.h>
using namespace llvm;
@@ -31,6 +31,10 @@ const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
return RI;
}
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo callbacks
+//===----------------------------------------------------------------------===//
+
void
SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -118,14 +122,14 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
- AMDGPU::SReg_32RegClass.contains(SrcReg));
+ AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
} else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
- AMDGPU::SReg_64RegClass.contains(SrcReg));
+ AMDGPU::SReg_64RegClass.contains(SrcReg));
Opcode = AMDGPU::V_MOV_B32_e32;
SubIndices = Sub0_1;
@@ -136,19 +140,19 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
- AMDGPU::SReg_128RegClass.contains(SrcReg));
+ AMDGPU::SReg_128RegClass.contains(SrcReg));
Opcode = AMDGPU::V_MOV_B32_e32;
SubIndices = Sub0_3;
} else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
- AMDGPU::SReg_256RegClass.contains(SrcReg));
+ AMDGPU::SReg_256RegClass.contains(SrcReg));
Opcode = AMDGPU::V_MOV_B32_e32;
SubIndices = Sub0_7;
} else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
- AMDGPU::SReg_512RegClass.contains(SrcReg));
+ AMDGPU::SReg_512RegClass.contains(SrcReg));
Opcode = AMDGPU::V_MOV_B32_e32;
SubIndices = Sub0_15;
@@ -168,7 +172,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
-
int NewOpc;
// Try to map original to commuted opcode
@@ -185,11 +188,36 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
bool NewMI) const {
- if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg() ||
- !MI->getOperand(2).isReg())
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
return 0;
- MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+ // Cannot commute VOP2 if src0 is SGPR.
+ if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
+ return 0;
+
+ if (!MI->getOperand(2).isReg()) {
+ // XXX: Commute instructions with FPImm operands
+ if (NewMI || MI->getOperand(2).isFPImm() ||
+ (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
+ return 0;
+ }
+
+ // XXX: Commute VOP3 instructions with abs and neg set.
+ if (isVOP3(MI->getOpcode()) &&
+ (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::abs)).getImm() ||
+ MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::neg)).getImm()))
+ return 0;
+
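+    // Operand 2 is an immediate: swap it with the register in operand 1 by
+    // hand, since the generic commuteInstruction only swaps register operands.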
+ unsigned Reg = MI->getOperand(1).getReg();
+ MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
+ MI->getOperand(2).ChangeToRegister(Reg, false);
+ } else {
+ MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+ }
if (MI)
MI->setDesc(get(commuteOpcode(MI->getOpcode())));
@@ -197,15 +225,12 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
return MI;
}
-MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
- int64_t Imm) const {
- MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc());
- MachineInstrBuilder MIB(*MF, MI);
- MIB.addReg(DstReg, RegState::Define);
- MIB.addImm(Imm);
-
- return MI;
-
+MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg,
+ unsigned SrcReg) const {
+ return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
+ DstReg) .addReg(SrcReg);
}
bool SIInstrInfo::isMov(unsigned Opcode) const {
@@ -224,32 +249,397 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
return RC != &AMDGPU::EXECRegRegClass;
}
-//===----------------------------------------------------------------------===//
-// Indirect addressing callbacks
-//===----------------------------------------------------------------------===//
+int SIInstrInfo::isMIMG(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+}
-unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
- unsigned Channel) const {
- assert(Channel == 0);
- return RegIndex;
+int SIInstrInfo::isSMRD(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+}
+
+bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+}
+
+bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+}
+
+bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+}
+
+bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+}
+
+bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
+ return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
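+  // Inline constants are encoded directly in the instruction: integers in
+  // [-16, 64] and the small set of floating-point values checked below.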
+  if (MO.isImm()) {
+ return MO.getImm() >= -16 && MO.getImm() <= 64;
+ }
+ if (MO.isFPImm()) {
+ return MO.getFPImm()->isExactlyValue(0.0) ||
+ MO.getFPImm()->isExactlyValue(0.5) ||
+ MO.getFPImm()->isExactlyValue(-0.5) ||
+ MO.getFPImm()->isExactlyValue(1.0) ||
+ MO.getFPImm()->isExactlyValue(-1.0) ||
+ MO.getFPImm()->isExactlyValue(2.0) ||
+ MO.getFPImm()->isExactlyValue(-2.0) ||
+ MO.getFPImm()->isExactlyValue(4.0) ||
+ MO.getFPImm()->isExactlyValue(-4.0);
+ }
+ return false;
+}
+
+bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
+ return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+ StringRef &ErrInfo) const {
+ uint16_t Opcode = MI->getOpcode();
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ // Verify VOP*
+ if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+ unsigned ConstantBusCount = 0;
+ unsigned SGPRUsed = AMDGPU::NoRegister;
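+    // A VALU instruction may read the constant bus (SGPRs, M0, VCC, EXEC or a
+    // literal) at most once; count all such readers.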
+ for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isUse() &&
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+
+ // EXEC register uses the constant bus.
+ if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
+ ++ConstantBusCount;
+
+ // SGPRs use the constant bus
+ if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
+ (!MO.isImplicit() &&
+ (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+ AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
+ if (SGPRUsed != MO.getReg()) {
+ ++ConstantBusCount;
+ SGPRUsed = MO.getReg();
+ }
+ }
+ }
+ // Literal constants use the constant bus.
+ if (isLiteralConstant(MO))
+ ++ConstantBusCount;
+ }
+ if (ConstantBusCount > 1) {
+ ErrInfo = "VOP* instruction uses the constant bus more than once";
+ return false;
+ }
+ }
+
+ // Verify SRC1 for VOP2 and VOPC
+ if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
+ const MachineOperand &Src1 = MI->getOperand(Src1Idx);
+ if (Src1.isImm() || Src1.isFPImm()) {
+ ErrInfo = "VOP[2C] src1 cannot be an immediate.";
+ return false;
+ }
+ }
+
+ // Verify VOP3
+ if (isVOP3(Opcode)) {
+ if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
+ ErrInfo = "VOP3 src0 cannot be a literal constant.";
+ return false;
+ }
+ if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
+ ErrInfo = "VOP3 src1 cannot be a literal constant.";
+ return false;
+ }
+ if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
+ ErrInfo = "VOP3 src2 cannot be a literal constant.";
+ return false;
+ }
+ }
+ return true;
+}
+
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default: return AMDGPU::INSTRUCTION_LIST_END;
+ case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
+ case AMDGPU::COPY: return AMDGPU::COPY;
+ case AMDGPU::PHI: return AMDGPU::PHI;
+ case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
+ case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+ case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
+ case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+ case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
+ case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+ case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
+ case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+ case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
+ case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+ }
+}
+
+bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
+ return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
+}
+
+const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
+ unsigned OpNo) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const MCInstrDesc &Desc = get(MI.getOpcode());
+ if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
+ Desc.OpInfo[OpNo].RegClass == -1)
+ return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+
+ unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+ return RI.getRegClass(RCID);
+}
+
+bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::REG_SEQUENCE:
+ return RI.hasVGPRs(getOpRegClass(MI, 0));
+ default:
+ return RI.hasVGPRs(getOpRegClass(MI, OpNo));
+ }
}
+void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+ MachineBasicBlock::iterator I = MI;
+ MachineOperand &MO = MI->getOperand(OpIdx);
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+ const TargetRegisterClass *RC = RI.getRegClass(RCID);
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (MO.isReg()) {
+ Opcode = AMDGPU::COPY;
+ } else if (RI.isSGPRClass(RC)) {
+ Opcode = AMDGPU::S_MOV_B32;
+ }
+
+ const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+ unsigned Reg = MRI.createVirtualRegister(VRC);
+ BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode),
+ Reg).addOperand(MO);
+ MO.ChangeToRegister(Reg, false);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+ MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src1);
+ int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+ AMDGPU::OpName::src2);
+
+ // Legalize VOP2
+ if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
+ MachineOperand &Src0 = MI->getOperand(Src0Idx);
+ MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+ // If the instruction implicitly reads VCC, we can't have any SGPR operands,
+    // so move any SGPR operand to a VGPR.
+ bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI);
+ if (ReadsVCC && Src0.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) {
+ legalizeOpWithMove(MI, Src0Idx);
+ return;
+ }
+
+ if (ReadsVCC && Src1.isReg() &&
+ RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+ legalizeOpWithMove(MI, Src1Idx);
+ return;
+ }
+
+ // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must
+ // be the first operand, and there can only be one.
+ if (Src1.isImm() || Src1.isFPImm() ||
+ (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) {
+ if (MI->isCommutable()) {
+ if (commuteInstruction(MI))
+ return;
+ }
+ legalizeOpWithMove(MI, Src1Idx);
+ }
+ }
+
+ // XXX - Do any VOP3 instructions read VCC?
+ // Legalize VOP3
+ if (isVOP3(MI->getOpcode())) {
+ int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
+ unsigned SGPRReg = AMDGPU::NoRegister;
+ for (unsigned i = 0; i < 3; ++i) {
+ int Idx = VOP3Idx[i];
+ if (Idx == -1)
+ continue;
+ MachineOperand &MO = MI->getOperand(Idx);
+
+ if (MO.isReg()) {
+ if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+ continue; // VGPRs are legal
+
+ assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+
+ if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+ SGPRReg = MO.getReg();
+ // We can use one SGPR in each VOP3 instruction.
+ continue;
+ }
+ } else if (!isLiteralConstant(MO)) {
+ // If it is not a register and not a literal constant, then it must be
+ // an inline constant which is always legal.
+ continue;
+ }
+ // If we make it this far, then the operand is not legal and we must
+ // legalize it.
+ legalizeOpWithMove(MI, Idx);
+ }
+ }
+
+ // Legalize REG_SEQUENCE
+  // The register class of the operands must be the same type as the register
+ // class of the output.
+ if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+ const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL;
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+ if (!MI->getOperand(i).isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+ continue;
+ const TargetRegisterClass *OpRC =
+ MRI.getRegClass(MI->getOperand(i).getReg());
+ if (RI.hasVGPRs(OpRC)) {
+ VRC = OpRC;
+ } else {
+ SRC = OpRC;
+ }
+ }
+
+    // If any of the operands are VGPR registers, then they all must be
+    // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
+    // legalizing them.
+ if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+ if (!VRC) {
+ assert(SRC);
+ VRC = RI.getEquivalentVGPRClass(SRC);
+ }
+ RC = VRC;
+ } else {
+ RC = SRC;
+ }
-int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
- llvm_unreachable("Unimplemented");
+ // Update all the operands so they have the same type.
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+ if (!MI->getOperand(i).isReg() ||
+ !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+ continue;
+ unsigned DstReg = MRI.createVirtualRegister(RC);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(AMDGPU::COPY), DstReg)
+ .addOperand(MI->getOperand(i));
+ MI->getOperand(i).setReg(DstReg);
+ }
+ }
}
-int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
- llvm_unreachable("Unimplemented");
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+ SmallVector<MachineInstr *, 128> Worklist;
+ Worklist.push_back(&TopInst);
+
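+  // Worklist-driven conversion: rewrite each instruction to its VALU
+  // equivalent, then queue any user that cannot read the new VGPR result so
+  // it gets moved as well.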
+ while (!Worklist.empty()) {
+ MachineInstr *Inst = Worklist.pop_back_val();
+ unsigned NewOpcode = getVALUOp(*Inst);
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+ continue;
+
+ MachineRegisterInfo &MRI = Inst->getParent()->getParent()->getRegInfo();
+
+ // Use the new VALU Opcode.
+ const MCInstrDesc &NewDesc = get(NewOpcode);
+ Inst->setDesc(NewDesc);
+
+ // Remove any references to SCC. Vector instructions can't read from it, and
+    // we're just about to add the implicit use / defs of VCC, and we don't want
+ // both.
+ for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
+ MachineOperand &Op = Inst->getOperand(i);
+ if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
+ Inst->RemoveOperand(i);
+ }
+
+    // Add the implicit register uses and defs required by the new opcode.
+ if (NewDesc.ImplicitUses) {
+ for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
+ unsigned Reg = NewDesc.ImplicitUses[i];
+ Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+ }
+ }
+
+ if (NewDesc.ImplicitDefs) {
+ for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
+ unsigned Reg = NewDesc.ImplicitDefs[i];
+ Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
+ }
+ }
+
+ legalizeOperands(Inst);
+
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
+
+ switch (Inst->getOpcode()) {
+ // For target instructions, getOpRegClass just returns the virtual
+ // register class associated with the operand, so we need to find an
+ // equivalent VGPR register class in order to move the instruction to the
+ // VALU.
+ case AMDGPU::COPY:
+ case AMDGPU::PHI:
+ case AMDGPU::REG_SEQUENCE:
+ if (RI.hasVGPRs(NewDstRC))
+ continue;
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ if (!NewDstRC)
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ unsigned DstReg = Inst->getOperand(0).getReg();
+ unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+ E = MRI.use_end(); I != E; ++I) {
+ MachineInstr &UseMI = *I;
+ if (!canReadVGPR(UseMI, I.getOperandNo())) {
+ Worklist.push_back(&UseMI);
+ }
+ }
+ }
}
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass(
- unsigned SourceReg) const {
- llvm_unreachable("Unimplemented");
+//===----------------------------------------------------------------------===//
+// Indirect addressing callbacks
+//===----------------------------------------------------------------------===//
+
+unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
+ unsigned Channel) const {
+ assert(Channel == 0);
+ return RegIndex;
}
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const {
- llvm_unreachable("Unimplemented");
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
+ return &AMDGPU::VReg_32RegClass;
}
MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
@@ -257,7 +647,17 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
MachineBasicBlock::iterator I,
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
- llvm_unreachable("Unimplemented");
+ const DebugLoc &DL = MBB->findDebugLoc(I);
+ unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+ getIndirectIndexBegin(*MBB->getParent()));
+
+ return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
+ .addReg(IndirectBaseReg, RegState::Define)
+ .addOperand(I->getOperand(0))
+ .addReg(IndirectBaseReg)
+ .addReg(OffsetReg)
+ .addImm(0)
+ .addReg(ValueReg);
}
MachineInstrBuilder SIInstrInfo::buildIndirectRead(
@@ -265,9 +665,43 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
MachineBasicBlock::iterator I,
unsigned ValueReg,
unsigned Address, unsigned OffsetReg) const {
- llvm_unreachable("Unimplemented");
+ const DebugLoc &DL = MBB->findDebugLoc(I);
+ unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+ getIndirectIndexBegin(*MBB->getParent()));
+
+ return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
+ .addOperand(I->getOperand(0))
+ .addOperand(I->getOperand(1))
+ .addReg(IndirectBaseReg)
+ .addReg(OffsetReg)
+ .addImm(0);
+
}
-const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const {
- llvm_unreachable("Unimplemented");
+void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+ const MachineFunction &MF) const {
+ int End = getIndirectIndexEnd(MF);
+ int Begin = getIndirectIndexBegin(MF);
+
+ if (End == -1)
+ return;
+
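+  // Reserve the VReg_32 range [Begin, End] used for indirect addressing, plus
+  // every wider register tuple that overlaps it.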
+ for (int Index = Begin; Index <= End; ++Index)
+ Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+
+ for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
+ Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
+
+ for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
+ Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
+
+ for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
+ Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
+
+ for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
+ Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
+
+ for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
+ Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 87eff4d..4af6348 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
+//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -25,6 +25,14 @@ class SIInstrInfo : public AMDGPUInstrInfo {
private:
const SIRegisterInfo RI;
+ MachineInstrBuilder buildIndirectIndexLoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned OffsetVGPR,
+ unsigned MovRelOp,
+ unsigned Dst,
+ unsigned Src0) const;
+ // If you add or remove instructions from this function, you will
+
public:
explicit SIInstrInfo(AMDGPUTargetMachine &tm);
@@ -40,25 +48,65 @@ public:
virtual MachineInstr *commuteInstruction(MachineInstr *MI,
bool NewMI=false) const;
- virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
- int64_t Imm) const;
-
virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
+ MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DstReg, unsigned SrcReg) const;
virtual bool isMov(unsigned Opcode) const;
virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
-
- virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
-
- virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+ int isMIMG(uint16_t Opcode) const;
+ int isSMRD(uint16_t Opcode) const;
+ bool isVOP1(uint16_t Opcode) const;
+ bool isVOP2(uint16_t Opcode) const;
+ bool isVOP3(uint16_t Opcode) const;
+ bool isVOPC(uint16_t Opcode) const;
+ bool isInlineConstant(const MachineOperand &MO) const;
+ bool isLiteralConstant(const MachineOperand &MO) const;
+
+ virtual bool verifyInstruction(const MachineInstr *MI,
+ StringRef &ErrInfo) const;
+
+ bool isSALUInstr(const MachineInstr &MI) const;
+ static unsigned getVALUOp(const MachineInstr &MI);
+ bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+ /// \brief Return the correct register class for \p OpNo. For target-specific
+ /// instructions, this will return the register class that has been defined
+ /// in tablegen. For generic instructions, like REG_SEQUENCE it will return
+  /// the register class of its machine operand, which can be used to infer
+  /// the correct register class based on the other operands.
+ const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+                                           unsigned OpNo) const;
+
+ /// \returns true if it is legal for the operand at index \p OpNo
+ /// to read a VGPR.
+ bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+ /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+ /// a MOV. For example:
+ /// ADD_I32_e32 VGPR0, 15
+ /// to
+ /// MOV VGPR1, 15
+ /// ADD_I32_e32 VGPR0, VGPR1
+ ///
+ /// If the operand being legalized is a register, then a COPY will be used
+ /// instead of MOV.
+ void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+
+ /// \brief Legalize all operands in this instruction. This function may
+  /// create new instructions and insert them before \p MI.
+ void legalizeOperands(MachineInstr *MI) const;
+
+ /// \brief Replace this instruction's opcode with the equivalent VALU
+ /// opcode. This function will also move the users of \p MI to the
+ /// VALU if necessary.
+ void moveToVALU(MachineInstr &MI) const;
virtual unsigned calculateIndirectAddress(unsigned RegIndex,
unsigned Channel) const;
- virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
- unsigned SourceReg) const;
-
- virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+ virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
@@ -71,16 +119,18 @@ public:
unsigned ValueReg,
unsigned Address,
unsigned OffsetReg) const;
+ void reserveIndirectRegisters(BitVector &Reserved,
+ const MachineFunction &MF) const;
- virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
- };
+ void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
+ unsigned SavReg, unsigned IndexReg) const;
+};
namespace AMDGPU {
int getVOPe64(uint16_t Opcode);
int getCommuteRev(uint16_t Opcode);
int getCommuteOrig(uint16_t Opcode);
- int isMIMG(uint16_t Opcode);
} // End namespace AMDGPU
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index 52af79c..4cd0daa 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -16,6 +16,45 @@ def SIadd64bit32bit : SDNode<"ISD::ADD",
SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
>;
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>,
+ [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+ SDTypeProfile<0, 13,
+ [SDTCisVT<0, i128>, // rsrc(SGPR)
+ SDTCisVT<1, iAny>, // vdata(VGPR)
+ SDTCisVT<2, i32>, // num_channels(imm)
+ SDTCisVT<3, i32>, // vaddr(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // inst_offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // offen(imm)
+ SDTCisVT<9, i32>, // idxen(imm)
+ SDTCisVT<10, i32>, // glc(imm)
+ SDTCisVT<11, i32>, // slc(imm)
+ SDTCisVT<12, i32> // tfe(imm)
+ ]>,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+ SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>,
+ SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+ SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+ SDTCisVT<3, i128>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
// Transformation function, extract the lower 32bit of a 64bit immediate
def LO32 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
@@ -45,6 +84,14 @@ def IMM8bitDWORD : ImmLeaf <
}]>
>;
+def as_i1imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i1);
+}]>;
+
+def as_i8imm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i8);
+}]>;
+
def as_i16imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
}]>;
@@ -58,6 +105,26 @@ class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
(*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0;
}]>;
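+// An immediate that has at least one use in an SGPR operand, and is therefore
+// worth materializing in a scalar register.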
+class SGPRImm <dag frag> : PatLeaf<frag, [{
+ if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
+ AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ return false;
+ }
+ const SIRegisterInfo *SIRI =
+ static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+ for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+ U != E; ++U) {
+ if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
+ return true;
+ }
+ }
+ return false;
+}]>;
+
+def FRAMEri64 : Operand<iPTR> {
+ let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
+}
+
//===----------------------------------------------------------------------===//
// SI assembler operands
//===----------------------------------------------------------------------===//
@@ -109,6 +176,11 @@ class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
opName#" $dst, $src0, $src1", pattern
>;
+class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+ op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+ opName#" $dst, $src0, $src1", pattern
+>;
+
class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
opName#" $dst, $src0, $src1", pattern
@@ -184,6 +256,12 @@ multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
: VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern>
+ : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>;
+
+multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern>
+ : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>;
+
multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
string opName, list<dag> pattern, string revOp> {
def _e32 : VOP2 <
@@ -320,6 +398,18 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
let vdst = 0;
}
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS <
+ op,
+ (outs rc:$vdst),
+ (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i8imm:$offset0,
+ i8imm:$offset1),
+ asm#" $gds, $vdst, $addr, $data0, $offset0, $offset1, [M0]",
+ []> {
+ let mayStore = 1;
+ let mayLoad = 1;
+ let data1 = 0;
+}
+
class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
op,
(outs),
@@ -358,9 +448,9 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
}
}
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
- ValueType VT> :
- MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+ MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+ i16imm:$offset),
name#" $vdata, $srsrc + $vaddr + $offset",
[]> {
@@ -391,11 +481,18 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
let mayStore = 0;
}
-class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
+class MIMG_Mask <string op, int channels> {
+ string Op = op;
+ int Channels = channels;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass src_rc> : MIMG <
op,
- (outs VReg_128:$vdata),
+ (outs dst_rc:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
SReg_256:$srsrc),
asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
#" $tfe, $lwe, $slc, $vaddr, $srsrc",
@@ -406,11 +503,31 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
let hasPostISelHook = 1;
}
-class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels> {
+ def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+ defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+ defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ RegisterClass src_rc> : MIMG <
op,
- (outs VReg_128:$vdata),
+ (outs dst_rc:$vdata),
(ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
- i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+ i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
SReg_256:$srsrc, SReg_128:$ssamp),
asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
#" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
@@ -420,6 +537,28 @@ class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
let hasPostISelHook = 1;
}
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+ RegisterClass dst_rc,
+ int channels> {
+ def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+ MIMG_Mask<asm#"_V1", channels>;
+ def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
+ MIMG_Mask<asm#"_V2", channels>;
+ def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>,
+ MIMG_Mask<asm#"_V4", channels>;
+ def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>,
+ MIMG_Mask<asm#"_V8", channels>;
+ def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>,
+ MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm> {
+ defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
+ defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
+ defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
+ defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
+}
+
//===----------------------------------------------------------------------===//
// Vector instruction mappings
//===----------------------------------------------------------------------===//
@@ -442,6 +581,14 @@ def getCommuteRev : InstrMapping {
let ValueCols = [["0"]];
}
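+// Maps a MIMG opcode that writes all four channels (dmask key "4") to the
+// equivalent opcode writing only 1, 2 or 3 channels.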
+def getMaskedMIMGOp : InstrMapping {
+ let FilterClass = "MIMG_Mask";
+ let RowFields = ["Op"];
+ let ColFields = ["Channels"];
+ let KeyCol = ["4"];
+ let ValueCols = [["1"], ["2"], ["3"] ];
+}
+
 // Maps a commuted opcode to its original version
def getCommuteOrig : InstrMapping {
let FilterClass = "VOP2_REV";
@@ -451,13 +598,4 @@ def getCommuteOrig : InstrMapping {
let ValueCols = [["1"]];
}
-// Test if the supplied opcode is an MIMG instruction
-def isMIMG : InstrMapping {
- let FilterClass = "MIMG";
- let RowFields = ["Inst"];
- let ColFields = ["Size"];
- let KeyCol = ["8"];
- let ValueCols = [["8"]];
-}
-
include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 500d15e..76f05eb 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -23,7 +23,9 @@ def InterpSlot : Operand<i32> {
}
def isSI : Predicate<"Subtarget.getGeneration() "
- "== AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+ ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+
+def WAIT_FLAG : InstFlag<"printWaitFlag">;
let Predicates = [isSI] in {
@@ -126,8 +128,11 @@ def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
} // End isCompare = 1
-def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
-def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+let Defs = [SCC], isCommutable = 1 in {
+ def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
+ def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+}
+
//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
@@ -138,19 +143,19 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
let isCompare = 1 in {
defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
-defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_LT>;
-defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_EQ>;
-defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_LE>;
-defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_GT>;
-defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", f32, COND_NE>;
-defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_GE>;
-defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32">;
-defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32">;
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">;
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>;
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>;
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>;
defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
-defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_NE>;
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
@@ -176,19 +181,19 @@ defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
} // End hasSideEffects = 1, Defs = [EXEC]
defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
-defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_LT>;
-defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_EQ>;
-defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_LE>;
-defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_GT>;
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>;
defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
-defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_GE>;
-defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64">;
-defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64">;
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>;
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>;
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>;
defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
-defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_NE>;
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
@@ -290,12 +295,12 @@ defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
} // End hasSideEffects = 1, Defs = [EXEC]
defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
-defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_LT>;
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>;
defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_LE>;
-defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_GT>;
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>;
defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
-defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_GE>;
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
let hasSideEffects = 1, Defs = [EXEC] in {
@@ -312,12 +317,12 @@ defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
} // End hasSideEffects = 1, Defs = [EXEC]
defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
-defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64">;
-defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64">;
-defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64">;
-defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64">;
-defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64">;
-defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64">;
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
let hasSideEffects = 1, Defs = [EXEC] in {
@@ -334,12 +339,12 @@ defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
} // End hasSideEffects = 1, Defs = [EXEC]
defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
-defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32">;
-defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32">;
-defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32">;
-defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32">;
-defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32">;
-defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32">;
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
let hasSideEffects = 1, Defs = [EXEC] in {
@@ -356,12 +361,12 @@ defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
} // End hasSideEffects = 1, Defs = [EXEC]
defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
-defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64">;
-defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64">;
-defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64">;
-defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64">;
-defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64">;
-defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64">;
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
let hasSideEffects = 1, Defs = [EXEC] in {
@@ -391,8 +396,16 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
} // End isCompare = 1
+def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
+def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
+def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
+def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
+def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
+def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
+def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
+def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
@@ -409,19 +422,25 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", V
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
-//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
-//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
+
+def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
+ 0x00000018, "BUFFER_STORE_BYTE", VReg_32
+>;
+
+def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
+ 0x0000001a, "BUFFER_STORE_SHORT", VReg_32
+>;
def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
- 0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32
+ 0x0000001c, "BUFFER_STORE_DWORD", VReg_32
>;
def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
- 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
+ 0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64
>;
def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
- 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128, v4i32
+ 0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128
>;
//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
@@ -463,21 +482,24 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
-//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
-//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
-//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
+def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>;
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
let mayLoad = 1 in {
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
+ 0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
@@ -500,8 +522,8 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
-//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
-def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
@@ -510,7 +532,7 @@ def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">;
//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
@@ -528,20 +550,20 @@ def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">;
+defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">;
-def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
-def IMAGE_SAMPLE_C_D : MIMG_Sampler_Helper <0x0000002a, "IMAGE_SAMPLE_C_D">;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
-def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
@@ -603,15 +625,21 @@ defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
} // End neverHasSideEffects = 1, isMoveImm = 1
defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
-//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
-//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
+defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
+ [(set i32:$dst, (fp_to_sint f64:$src0))]
+>;
+defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32",
+ [(set f64:$dst, (sint_to_fp i32:$src0))]
+>;
defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
[(set f32:$dst, (sint_to_fp i32:$src0))]
>;
defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
[(set f32:$dst, (uint_to_fp i32:$src0))]
>;
-defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
+defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32",
+ [(set i32:$dst, (fp_to_uint f32:$src0))]
+>;
defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
[(set i32:$dst, (fp_to_sint f32:$src0))]
>;
@@ -621,8 +649,12 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
-//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
-//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
+defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
+ [(set f32:$dst, (fround f64:$src0))]
+>;
+defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
+ [(set f64:$dst, (fextend f32:$src0))]
+>;
//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
@@ -791,7 +823,7 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
let mayStore = 1;
}
-def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
[]
>;
} // End hasSideEffects
@@ -827,6 +859,11 @@ def : Pat <
(V_CNDMASK_B32_e64 $src0, $src1, $src2)
>;
+def : Pat <
+ (i32 (trunc i64:$val)),
+ (EXTRACT_SUBREG $val, sub0)
+>;
+
//use two V_CNDMASK_B32_e64 instructions for f64
def : Pat <
(f64 (select i1:$src2, f64:$src1, f64:$src0)),
@@ -910,9 +947,13 @@ defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
>;
defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
+let hasPostISelHook = 1 in {
+
defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
[(set i32:$dst, (shl i32:$src0, i32:$src1))]
>;
+
+}
defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
@@ -936,16 +977,13 @@ defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
- [(set i32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
->;
-
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
- [(set i32:$dst, (sub i32:$src0, i32:$src1))]
->;
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector ones when needed later.
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", []>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", []>;
defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;
-let Uses = [VCC] in { // Carry-out comes from VCC
+let Uses = [VCC] in { // Carry-in comes from VCC
defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], "V_SUBB_U32">;
@@ -999,8 +1037,12 @@ def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
defm : BFIPatterns <V_BFI_B32>;
-def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
-def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
+def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
+ [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
+ [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+>;
//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -1087,12 +1129,31 @@ def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+ [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+>;
+} // End isCommutable = 1
+
def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+ [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+ [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+ [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End Uses = [SCC]
+} // End Defs = [SCC]
+
def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
@@ -1124,7 +1185,9 @@ def : Pat <
(S_OR_B64 $src0, $src1)
>;
def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+ [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+>;
def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
@@ -1135,12 +1198,31 @@ def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
-def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
-def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
-def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+ [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+ [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+ [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+ [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+ [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+ [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+} // End AddedComplexity = 1
+
def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
@@ -1160,7 +1242,7 @@ def LOAD_CONST : AMDGPUShaderInst <
[(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
>;
-// SI Psuedo instructions. These are used by the CFG structurizer pass
+// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
@@ -1233,6 +1315,36 @@ def SI_KILL : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri64, ADDRIndirect>;
+
+let UseNamedOperandTable = 1 in {
+
+def SI_RegisterLoad : AMDGPUShaderInst <
+ (outs VReg_32:$dst, SReg_64:$temp),
+ (ins FRAMEri64:$addr, i32imm:$chan),
+ "", []
+> {
+ let isRegisterLoad = 1;
+ let mayLoad = 1;
+}
+
+class SIRegStore<dag outs> : AMDGPUShaderInst <
+ outs,
+ (ins VReg_32:$val, FRAMEri64:$addr, i32imm:$chan),
+ "", []
+> {
+ let isRegisterStore = 1;
+ let mayStore = 1;
+}
+
+let usesCustomInserter = 1 in {
+def SI_RegisterStorePseudo : SIRegStore<(outs)>;
+} // End usesCustomInserter = 1
+def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
+
+
+} // End UseNamedOperandTable = 1
+
def SI_INDIRECT_SRC : InstSI <
(outs VReg_32:$dst, SReg_64:$temp),
(ins unknown:$src, VSrc_32:$idx, i32imm:$off),
@@ -1249,6 +1361,7 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
let Constraints = "$src = $dst";
}
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@@ -1258,7 +1371,7 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
let usesCustomInserter = 1 in {
-// This psuedo instruction takes a pointer as input and outputs a resource
+// This pseudo instruction takes a pointer as input and outputs a resource
// constant that can be used with the ADDR64 MUBUF instructions.
def SI_ADDR64_RSRC : InstSI <
(outs SReg_128:$srsrc),
@@ -1289,7 +1402,7 @@ def : Pat <
/* int_SI_vs_load_input */
def : Pat<
- (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+ (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
(BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
>;
@@ -1310,67 +1423,85 @@ def : Pat <
/********** Image sampling patterns **********/
/********** ======================= **********/
-/* int_SI_sample for simple 1D texture lookup */
+/* SIsample for simple 1D texture lookup */
def : Pat <
- (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
- (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+ (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm),
+ (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm),
(opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT),
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT),
(opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY),
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+ (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY),
(opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SampleShadowPattern<Intrinsic name, MIMG opcode,
+class SampleShadowPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW),
+ (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW),
(opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
ValueType vt> : Pat <
- (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY),
+ (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
(opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
>;
-/* int_SI_sample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<ValueType addr_type> {
- def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
- def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
- def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
- def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
- def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-
- def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
- def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
- def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
- def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-
- def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
- def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
- def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
- def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
-
- def : SamplePattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
- def : SampleArrayPattern <int_SI_sampled, IMAGE_SAMPLE_D, addr_type>;
- def : SampleShadowPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
- def : SampleShadowArrayPattern <int_SI_sampled, IMAGE_SAMPLE_C_D, addr_type>;
+/* SIsample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+                          MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+                          MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+ def : SamplePattern <SIsample, sample, addr_type>;
+ def : SampleRectPattern <SIsample, sample, addr_type>;
+ def : SampleArrayPattern <SIsample, sample, addr_type>;
+ def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+ def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+ def : SamplePattern <SIsamplel, sample_l, addr_type>;
+ def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+ def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+ def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+ def : SamplePattern <SIsampleb, sample_b, addr_type>;
+ def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+ def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+ def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+ def : SamplePattern <SIsampled, sample_d, addr_type>;
+ def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+ def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+ def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
}
-defm : SamplePatterns<v2i32>;
-defm : SamplePatterns<v4i32>;
-defm : SamplePatterns<v8i32>;
-defm : SamplePatterns<v16i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+ IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+ IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+ IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+ v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+ IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+ IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+ IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+ v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+ IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+ IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+ IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+ v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+ IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+ IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+ IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+ v16i32>;
/* int_SI_imageload for texture fetches consuming varying address parameters */
class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
@@ -1383,23 +1514,46 @@ class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> :
(opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
>;
-multiclass ImageLoadPatterns<ValueType addr_type> {
- def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
- def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+ (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA),
+ (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+ (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA),
+ (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
+ def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
+ def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
+}
+
+multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> {
+ def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>;
+ def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>;
}
-defm : ImageLoadPatterns<v2i32>;
-defm : ImageLoadPatterns<v4i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>;
+
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>;
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>;
/* Image resource information */
def : Pat <
(int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
- (IMAGE_GET_RESINFO 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+ (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
>;
def : Pat <
(int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
- (IMAGE_GET_RESINFO 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+ (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+ (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA),
+ (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
>;
/********** ============================================ **********/
@@ -1470,16 +1624,6 @@ foreach Index = 0-15 in {
>;
}
-def : Vector1_Build <v1i32, i32, VReg_32>;
-def : Vector2_Build <v2i32, i32>;
-def : Vector2_Build <v2f32, f32>;
-def : Vector4_Build <v4i32, i32>;
-def : Vector4_Build <v4f32, f32>;
-def : Vector8_Build <v8i32, i32>;
-def : Vector8_Build <v8f32, f32>;
-def : Vector16_Build <v16i32, i32>;
-def : Vector16_Build <v16f32, f32>;
-
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <i32, f32, VReg_32>;
@@ -1492,9 +1636,17 @@ def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <v4f32, v4i32, VReg_128>;
def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v4i32, i128, VReg_128>;
+def : BitConvert <i128, v4i32, VReg_128>;
+
+def : BitConvert <v8i32, v32i8, SReg_256>;
+def : BitConvert <v32i8, v8i32, SReg_256>;
+def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v32i8, v8i32, VReg_256>;
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1523,6 +1675,16 @@ def : Pat <
/********** ================== **********/
def : Pat <
+ (SGPRImm<(i32 imm)>:$imm),
+ (S_MOV_B32 imm:$imm)
+>;
+
+def : Pat <
+ (SGPRImm<(f32 fpimm)>:$imm),
+ (S_MOV_B32 fpimm:$imm)
+>;
+
+def : Pat <
(i32 imm:$imm),
(V_MOV_B32_e32 imm:$imm)
>;
@@ -1634,19 +1796,19 @@ def : Pat <
// 1. Offset as 8bit DWORD immediate
def : Pat <
- (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset),
+ (SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
(S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
>;
// 2. Offset loaded in an 32bit SGPR
def : Pat <
- (int_SI_load_const v16i8:$sbase, imm:$offset),
+ (SIload_constant i128:$sbase, imm:$offset),
(S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
>;
// 3. Offset in an 32Bit VGPR
def : Pat <
- (int_SI_load_const v16i8:$sbase, i32:$voff),
+ (SIload_constant i128:$sbase, i32:$voff),
(BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
>;
@@ -1677,17 +1839,36 @@ def : Pat <
/********** Load/Store Patterns **********/
/********** ======================= **********/
-def : Pat <
- (local_load i64:$src0),
- (i32 (DS_READ_B32 0, (EXTRACT_SUBREG $src0, sub0),
- (EXTRACT_SUBREG $src0, sub0), (EXTRACT_SUBREG $src0, sub0), 0, 0))
+class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
+ (frag i32:$src0),
+ (vt (inst 0, $src0, $src0, $src0, 0, 0))
>;
+def : DSReadPat <DS_READ_I8, i32, sextloadi8_local>;
+def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local>;
+def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+def : DSReadPat <DS_READ_B32, i32, local_load>;
def : Pat <
- (local_store i32:$src1, i64:$src0),
- (DS_WRITE_B32 0, (EXTRACT_SUBREG $src0, sub0), $src1, $src1, 0, 0)
+ (local_load i32:$src0),
+ (i32 (DS_READ_B32 0, $src0, $src0, $src0, 0, 0))
+>;
+
+class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
+ (frag i32:$src1, i32:$src0),
+ (inst 0, $src0, $src1, $src1, 0, 0)
>;
+def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+
+def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
+ (DS_ADD_U32_RTN 0, $ptr, $val, 0, 0)>;
+
+def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
+ (DS_SUB_U32_RTN 0, $ptr, $val, 0, 0)>;
+
/********** ================== **********/
/********** SMRD Patterns **********/
/********** ================== **********/
@@ -1717,8 +1898,11 @@ defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
//===----------------------------------------------------------------------===//
// MUBUF Patterns
@@ -1766,23 +1950,46 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
global_load, constant_load>;
-multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt> {
+multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
def : Pat <
- (global_store vt:$value, i64:$ptr),
+ (st vt:$value, i64:$ptr),
(Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
>;
def : Pat <
- (global_store vt:$value, (add i64:$ptr, i64:$offset)),
+ (st vt:$value, (add i64:$ptr, i64:$offset)),
(Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
>;
}
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
+ (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+ i32:$soffset, imm:$inst_offset, imm:$dfmt,
+ imm:$nfmt, imm:$offen, imm:$idxen,
+ imm:$glc, imm:$slc, imm:$tfe),
+ (opcode
+ $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+ (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+ (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
/********** ====================== **********/
/**********   Indirect addressing   **********/
@@ -1834,6 +2041,37 @@ def : Pat<
(V_CMP_U_F32_e64 $src0, $src1)
>;
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+ (i64 (trunc i128:$x)),
+ (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 (EXTRACT_SUBREG $x, sub0)), sub0),
+ (i32 (EXTRACT_SUBREG $x, sub1)), sub1)
+>;
+
+def : Pat <
+ (i32 (trunc i64:$a)),
+ (EXTRACT_SUBREG $a, sub0)
+>;
+
+// V_ADD_I32_e32/S_ADD_I32 produce carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+ (i32 (addc i32:$src0, i32:$src1)),
+ (S_ADD_I32 $src0, $src1)
+>;
+
+def : Pat <
+ (or i64:$a, i64:$b),
+ (INSERT_SUBREG
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub0), (EXTRACT_SUBREG $b, sub0)), sub0),
+ (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub1), (EXTRACT_SUBREG $b, sub1)), sub1)
+>;
+
//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 2fa073e..7fcc964 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -17,10 +17,28 @@ let TargetPrefix = "SI", isTarget = 1 in {
def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
- def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+ def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+ def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+ // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
+ def int_SI_tbuffer_store : Intrinsic <
+ [],
+ [llvm_anyint_ty, // rsrc(SGPR)
+ llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+ llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+ llvm_i32_ty, // vaddr(VGPR)
+ llvm_i32_ty, // soffset(SGPR)
+ llvm_i32_ty, // inst_offset(imm)
+ llvm_i32_ty, // dfmt(imm)
+ llvm_i32_ty, // nfmt(imm)
+ llvm_i32_ty, // offen(imm)
+ llvm_i32_ty, // idxen(imm)
+ llvm_i32_ty, // glc(imm)
+ llvm_i32_ty, // slc(imm)
+ llvm_i32_ty], // tfe(imm)
+ []>;
+
+ class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
def int_SI_sample : Sample;
def int_SI_sampleb : Sample;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index c2e8f02..958763d 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -377,10 +377,13 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
unsigned Dst = MI.getOperand(0).getReg();
unsigned Vec = MI.getOperand(2).getReg();
unsigned Off = MI.getOperand(4).getImm();
+ unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
+ if (!SubReg)
+ SubReg = Vec;
- MachineInstr *MovRel =
+ MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+ .addReg(SubReg + Off)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Vec, RegState::Implicit);
@@ -395,10 +398,13 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
unsigned Dst = MI.getOperand(0).getReg();
unsigned Off = MI.getOperand(4).getImm();
unsigned Val = MI.getOperand(5).getReg();
+ unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
+ if (!SubReg)
+ SubReg = Dst;
MachineInstr *MovRel =
BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
- .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+ .addReg(SubReg + Off, RegState::Define)
.addReg(Val)
.addReg(AMDGPU::M0, RegState::Implicit)
.addReg(Dst, RegState::Implicit);
@@ -409,6 +415,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getTarget().getInstrInfo();
TRI = MF.getTarget().getRegisterInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
bool HaveKill = false;
bool NeedM0 = false;
@@ -476,6 +483,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
IndirectSrc(MI);
break;
+ case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
@@ -487,6 +495,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
NeedWQM = true;
// Fall through
case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_ADD_U32_RTN:
NeedM0 = true;
break;
@@ -508,7 +517,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
AMDGPU::M0).addImm(0xffffffff);
}
- if (NeedWQM) {
+ if (NeedWQM && MFI->ShaderType != ShaderType::COMPUTE) {
MachineBasicBlock &MBB = MF.front();
BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index ee0e307..071f9fa 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -13,6 +13,10 @@
using namespace llvm;
+
+// Pin the vtable to this file.
+void SIMachineFunctionInfo::anchor() {}
+
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
PSInputAddr(0) { }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 6da9f7f..2f1961c 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -22,6 +22,7 @@ namespace llvm {
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+ virtual void anchor();
public:
SIMachineFunctionInfo(const MachineFunction &MF);
unsigned PSInputAddr;
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 50fd4c7..ed0bbaf 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -15,6 +15,7 @@
#include "SIRegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIInstrInfo.h"
using namespace llvm;
@@ -25,6 +26,10 @@ SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
+ Reserved.set(AMDGPU::EXEC);
+ Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+ TII->reserveIndirectRegisters(Reserved, MF);
return Reserved;
}
@@ -50,6 +55,10 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
}
}
+unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
+ return getEncodingValue(Reg);
+}
+
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
@@ -70,3 +79,53 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
}
return NULL;
}
+
+bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
+ if (!RC) {
+ return false;
+ }
+ return !hasVGPRs(RC);
+}
+
+bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
+ return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+ getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
+ getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
+ getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
+ getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
+ getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
+ const TargetRegisterClass *SRC) const {
+ if (hasVGPRs(SRC)) {
+ return SRC;
+ } else if (SRC == &AMDGPU::SCCRegRegClass) {
+ return &AMDGPU::VCCRegRegClass;
+ } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
+ return &AMDGPU::VReg_32RegClass;
+ } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
+ return &AMDGPU::VReg_64RegClass;
+ } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
+ return &AMDGPU::VReg_128RegClass;
+ } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
+ return &AMDGPU::VReg_256RegClass;
+ } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
+ return &AMDGPU::VReg_512RegClass;
+ }
+ return NULL;
+}
+
+const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
+ const TargetRegisterClass *RC, unsigned SubIdx) const {
+ if (SubIdx == AMDGPU::NoSubRegister)
+ return RC;
+
+ // If this register has a sub-register, we can safely assume it is a 32-bit
+  // register, because all of SI's sub-registers are 32-bit.
+ if (isSGPRClass(RC)) {
+ return &AMDGPU::SGPR_32RegClass;
+ } else {
+ return &AMDGPU::VGPR_32RegClass;
+ }
+}
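The new isSGPRClass and getEquivalentVGPRClass helpers give later passes a uniform way to move a value from scalar to vector registers. A minimal usage sketch follows, under the assumption that the caller has the function's MachineRegisterInfo at hand; the helper below is illustrative, not part of this patch.

#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// Illustrative only: create a VGPR-class twin for a virtual register that is
// currently constrained to SGPRs, leaving VGPR-class registers untouched.
static unsigned makeVGPRTwin(llvm::MachineRegisterInfo &MRI,
                             const llvm::SIRegisterInfo &TRI, unsigned Reg) {
  const llvm::TargetRegisterClass *RC = MRI.getRegClass(Reg);
  if (!TRI.isSGPRClass(RC))
    return Reg;                       // Already lives in vector registers.
  const llvm::TargetRegisterClass *VRC = TRI.getEquivalentVGPRClass(RC);
  return VRC ? MRI.createVirtualRegister(VRC) : Reg;
}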
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index d0df4f9..8148f7f 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -42,9 +42,27 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
/// CFGStructurizer
virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+ virtual unsigned getHWRegIndex(unsigned Reg) const;
+
/// \brief Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR0 => VReg_32, SGPR0_SGPR1 => SReg_64, etc.
const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+
+ /// \returns true if this class contains only SGPR registers
+ bool isSGPRClass(const TargetRegisterClass *RC) const;
+
+ /// \returns true if this class contains VGPR registers.
+ bool hasVGPRs(const TargetRegisterClass *RC) const;
+
+ /// \returns A VGPR reg class with the same width as \p SRC
+ const TargetRegisterClass *getEquivalentVGPRClass(
+ const TargetRegisterClass *SRC) const;
+
+ /// \returns The register class that is used for a sub-register of \p RC for
+ /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
+ /// be returned.
+ const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
+ unsigned SubIdx) const;
};
} // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 292b9d2..49bdbc9 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -43,7 +43,7 @@ def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
(add (sequence "SGPR%u", 0, 101))>;
// SGPR 64-bit registers
-def SGPR_64 : RegisterTuples<[sub0, sub1],
+def SGPR_64Regs : RegisterTuples<[sub0, sub1],
[(add (decimate (trunc SGPR_32, 101), 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
@@ -153,15 +153,17 @@ def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
(add SGPR_32, M0Reg)
>;
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
- (add SGPR_64, VCCReg, EXECReg)
+ (add SGPR_64Regs, VCCReg, EXECReg)
>;
-def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>;
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
// Register class for all vector registers (VGPRs + Interpolation Registers)
def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
@@ -172,9 +174,9 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
let Size = 96;
}
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>;
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
new file mode 100644
index 0000000..f194d8b
--- /dev/null
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -0,0 +1,162 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+/// - v16i8 is used for constant memory resource descriptors. This type is
+/// legal for some compute APIs, and we don't want to declare it as legal
+/// in the backend, because we want the legalizer to expand all v16i8
+/// operations.
+/// v1* => *
+/// - Having v1* types complicates the legalizer and we can easily replace
+///   them with the element type.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+class SITypeRewriter : public FunctionPass,
+ public InstVisitor<SITypeRewriter> {
+
+ static char ID;
+ Module *Mod;
+ Type *v16i8;
+ Type *i128;
+
+public:
+ SITypeRewriter() : FunctionPass(ID) { }
+ virtual bool doInitialization(Module &M);
+ virtual bool runOnFunction(Function &F);
+ virtual const char *getPassName() const {
+ return "SI Type Rewriter";
+ }
+ void visitLoadInst(LoadInst &I);
+ void visitCallInst(CallInst &I);
+ void visitBitCast(BitCastInst &I);
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+ Mod = &M;
+ v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
+ i128 = Type::getIntNTy(M.getContext(), 128);
+ return false;
+}
+
+bool SITypeRewriter::runOnFunction(Function &F) {
+ AttributeSet Set = F.getAttributes();
+ Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+
+ unsigned ShaderType = ShaderType::COMPUTE;
+ if (A.isStringAttribute()) {
+ StringRef Str = A.getValueAsString();
+ Str.getAsInteger(0, ShaderType);
+ }
+  if (ShaderType != ShaderType::COMPUTE) {
+    visit(F);
+  }
+
+ return false;
+}
+
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+ Value *Ptr = I.getPointerOperand();
+ Type *PtrTy = Ptr->getType();
+ Type *ElemTy = PtrTy->getPointerElementType();
+ IRBuilder<> Builder(&I);
+ if (ElemTy == v16i8) {
+ Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2));
+ LoadInst *Load = Builder.CreateLoad(BitCast);
+ SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+ I.getAllMetadataOtherThanDebugLoc(MD);
+ for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+ Load->setMetadata(MD[i].first, MD[i].second);
+ }
+ Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
+ I.replaceAllUsesWith(BitCastLoad);
+ I.eraseFromParent();
+ }
+}
+
+void SITypeRewriter::visitCallInst(CallInst &I) {
+ IRBuilder<> Builder(&I);
+ SmallVector <Value*, 8> Args;
+ SmallVector <Type*, 8> Types;
+ bool NeedToReplace = false;
+ Function *F = I.getCalledFunction();
+ std::string Name = F->getName().str();
+ for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+ Value *Arg = I.getArgOperand(i);
+ if (Arg->getType() == v16i8) {
+ Args.push_back(Builder.CreateBitCast(Arg, i128));
+ Types.push_back(i128);
+ NeedToReplace = true;
+ Name = Name + ".i128";
+ } else if (Arg->getType()->isVectorTy() &&
+ Arg->getType()->getVectorNumElements() == 1 &&
+ Arg->getType()->getVectorElementType() ==
+ Type::getInt32Ty(I.getContext())){
+ Type *ElementTy = Arg->getType()->getVectorElementType();
+ std::string TypeName = "i32";
+ InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg);
+ assert(Def);
+ Args.push_back(Def->getOperand(1));
+ Types.push_back(ElementTy);
+ std::string VecTypeName = "v1" + TypeName;
+ Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
+ NeedToReplace = true;
+ } else {
+ Args.push_back(Arg);
+ Types.push_back(Arg->getType());
+ }
+ }
+
+ if (!NeedToReplace) {
+ return;
+ }
+ Function *NewF = Mod->getFunction(Name);
+ if (!NewF) {
+ NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
+ NewF->setAttributes(F->getAttributes());
+ }
+ I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+ I.eraseFromParent();
+}
+
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+ IRBuilder<> Builder(&I);
+ if (I.getDestTy() != i128) {
+ return;
+ }
+
+ if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
+ if (Op->getSrcTy() == i128) {
+ I.replaceAllUsesWith(Op->getOperand(0));
+ I.eraseFromParent();
+ }
+ }
+}
+
+FunctionPass *llvm::createSITypeRewriter() {
+ return new SITypeRewriter();
+}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index acf7496..6339394 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_TARGET_DEFINITIONS Sparc.td)
tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM SparcGenCodeEmitter.inc -gen-emitter)
tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM SparcGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM SparcGenSubtargetInfo.inc -gen-subtarget)
@@ -20,6 +21,8 @@ add_llvm_target(SparcCodeGen
SparcSubtarget.cpp
SparcTargetMachine.cpp
SparcSelectionDAGInfo.cpp
+ SparcJITInfo.cpp
+ SparcCodeEmitter.cpp
)
add_dependencies(LLVMSparcCodeGen SparcCommonTableGen intrinsics_gen)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index b101751..9a0466a 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -14,6 +14,7 @@
#define DEBUG_TYPE "delay-slot-filler"
#include "Sparc.h"
+#include "SparcSubtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -39,10 +40,13 @@ namespace {
/// layout, etc.
///
TargetMachine &TM;
+ const SparcSubtarget *Subtarget;
static char ID;
Filler(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm) { }
+ : MachineFunctionPass(ID), TM(tm),
+ Subtarget(&TM.getSubtarget<SparcSubtarget>()) {
+ }
virtual const char *getPassName() const {
return "SPARC Delay Slot Filler";
@@ -102,6 +106,8 @@ FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
MachineBasicBlock::iterator MI = I;
++I;
@@ -114,6 +120,14 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
continue;
}
+ if (!Subtarget->isV9() &&
+ (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
+ || MI->getOpcode() == SP::FCMPQ)) {
+ BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+ Changed = true;
+ continue;
+ }
+
// If MI has no delay slot, skip.
if (!MI->hasDelaySlot())
continue;
@@ -126,7 +140,6 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
++FilledSlots;
Changed = true;
- const TargetInstrInfo *TII = TM.getInstrInfo();
if (D == MBB.end())
BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
else
@@ -156,7 +169,7 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
if (slot == MBB.begin())
return MBB.end();
- if (slot->getOpcode() == SP::RET)
+ if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL)
return MBB.end();
if (slot->getOpcode() == SP::RETL) {
@@ -342,6 +355,7 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
case SP::CALL: structSizeOpNum = 1; break;
case SP::JMPLrr:
case SP::JMPLri: structSizeOpNum = 2; break;
+ case SP::TLS_CALL: return false;
}
const MachineOperand &MO = I->getOperand(structSizeOpNum);
diff --git a/lib/Target/Sparc/LLVMBuild.txt b/lib/Target/Sparc/LLVMBuild.txt
index 7d54d32..fd8e5d9 100644
--- a/lib/Target/Sparc/LLVMBuild.txt
+++ b/lib/Target/Sparc/LLVMBuild.txt
@@ -23,6 +23,7 @@ type = TargetGroup
name = Sparc
parent = Target
has_asmprinter = 1
+has_jit = 1
[component_1]
type = Library
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
index aac0e8d..f3caeaa 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
@@ -53,7 +53,27 @@ enum TOF {
// Extract bits 41-32 of an address.
// Assembler: %hm(addr)
- MO_HM
+ MO_HM,
+
+ // TargetFlags for Thread Local Storage.
+ MO_TLS_GD_HI22,
+ MO_TLS_GD_LO10,
+ MO_TLS_GD_ADD,
+ MO_TLS_GD_CALL,
+ MO_TLS_LDM_HI22,
+ MO_TLS_LDM_LO10,
+ MO_TLS_LDM_ADD,
+ MO_TLS_LDM_CALL,
+ MO_TLS_LDO_HIX22,
+ MO_TLS_LDO_LOX10,
+ MO_TLS_LDO_ADD,
+ MO_TLS_IE_HI22,
+ MO_TLS_IE_LO10,
+ MO_TLS_IE_LD,
+ MO_TLS_IE_LDX,
+ MO_TLS_IE_ADD,
+ MO_TLS_LE_HIX22,
+ MO_TLS_LE_LOX10
};
} // end namespace SPII
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 5a52abe..baac36b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -21,23 +21,26 @@ void SparcELFMCAsmInfo::anchor() { }
SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
IsLittleEndian = false;
Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::sparcv9) {
+ bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
+
+ if (isV9) {
PointerSize = CalleeSaveStackSlotSize = 8;
}
Data16bitsDirective = "\t.half\t";
Data32bitsDirective = "\t.word\t";
- Data64bitsDirective = 0; // .xword is only supported by V9.
+ // .xword is only supported by V9.
+ Data64bitsDirective = (isV9) ? "\t.xword\t" : 0;
ZeroDirective = "\t.skip\t";
CommentString = "!";
HasLEB128 = true;
SupportsDebugInformation = true;
-
+
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
- WeakRefDirective = "\t.weak\t";
-
PrivateGlobalPrefix = ".L";
}
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 621e8ff..1e58e37 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -14,12 +14,12 @@
#ifndef SPARCTARGETASMINFO_H
#define SPARCTARGETASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class StringRef;
- class SparcELFMCAsmInfo : public MCAsmInfo {
+ class SparcELFMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit SparcELFMCAsmInfo(StringRef TT);
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
index 4b81ada..c171db7 100644
--- a/lib/Target/Sparc/Makefile
+++ b/lib/Target/Sparc/Makefile
@@ -14,7 +14,8 @@ TARGET = Sparc
# Make sure that tblgen is run, first thing.
BUILT_SOURCES = SparcGenRegisterInfo.inc SparcGenInstrInfo.inc \
SparcGenAsmWriter.inc SparcGenDAGISel.inc \
- SparcGenSubtargetInfo.inc SparcGenCallingConv.inc
+ SparcGenSubtargetInfo.inc SparcGenCallingConv.inc \
+ SparcGenCodeEmitter.inc
DIRS = TargetInfo MCTargetDesc
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index 98563db..f44b604 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -26,6 +26,8 @@ namespace llvm {
FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+ FunctionPass *createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+ JITCodeEmitter &JCE);
} // end namespace llvm;
@@ -103,5 +105,22 @@ namespace llvm {
}
llvm_unreachable("Invalid cond code");
}
+
+ inline static unsigned HI22(int64_t imm) {
+ return (unsigned)((imm >> 10) & ((1 << 22)-1));
+ }
+
+ inline static unsigned LO10(int64_t imm) {
+ return (unsigned)(imm & 0x3FF);
+ }
+
+ inline static unsigned HIX22(int64_t imm) {
+ return HI22(~imm);
+ }
+
+ inline static unsigned LOX10(int64_t imm) {
+ return ~LO10(~imm);
+ }
+
} // end namespace llvm
#endif
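The four helpers above encode the usual SPARC immediate split: HI22/LO10 feed the sethi+or sequence, and HIX22/LOX10 are their ones'-complement counterparts for the sethi+xor sequence used on negative values (see the frame lowering changes below). A quick sanity check, written as a sketch that assumes it lives in a backend file where Sparc.h is reachable:

    #include "Sparc.h"
    #include <cassert>
    #include <cstdint>

    // Sketch only: sethi %hi(Imm), %g1 ; or %g1, %lo(Imm), %g1 reassembles Imm.
    static void checkHiLoSplit() {
      int64_t Imm = 0x12345678;
      assert(((llvm::HI22(Imm) << 10) | llvm::LO10(Imm)) == (uint32_t)Imm);
    }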
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index d42c40f..0df48f6 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -30,6 +30,10 @@ def FeatureVIS
: SubtargetFeature<"vis", "IsVIS", "true",
"Enable UltraSPARC Visual Instruction Set extensions">;
+def FeatureHardQuad
+ : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+ "Enable quad-word floating point instructions">;
+
//===----------------------------------------------------------------------===//
// Register File, Calling Conv, Instruction Descriptions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 3fe2b44..d06c894 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -43,6 +44,7 @@ namespace {
const char *Modifier = 0);
void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+ virtual void EmitFunctionBodyStart();
virtual void EmitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream OS(Str);
@@ -63,11 +65,35 @@ namespace {
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
const;
+ void EmitGlobalRegisterDecl(unsigned reg) {
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "\t.register "
+ << "%" << StringRef(getRegisterName(reg)).lower()
+ << ", "
+ << ((reg == SP::G6 || reg == SP::G7)? "#ignore" : "#scratch");
+ OutStreamer.EmitRawText(OS.str());
+ }
+
};
} // end of anonymous namespace
#include "SparcGenAsmWriter.inc"
+void SparcAsmPrinter::EmitFunctionBodyStart() {
+ if (!TM.getSubtarget<SparcSubtarget>().is64Bit())
+ return;
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned globalRegs[] = { SP::G2, SP::G3, SP::G6, SP::G7, 0 };
+ for (unsigned i = 0; globalRegs[i] != 0; ++i) {
+ unsigned reg = globalRegs[i];
+ if (MRI.use_empty(reg))
+ continue;
+ EmitGlobalRegisterDecl(reg);
+ }
+}
+
void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand (opNum);
@@ -79,11 +105,37 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
assert(TF == SPII::MO_NO_FLAG &&
"Cannot handle target flags on call address");
else if (MI->getOpcode() == SP::SETHIi)
- assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH) &&
+ assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH
+ || TF == SPII::MO_TLS_GD_HI22
+ || TF == SPII::MO_TLS_LDM_HI22
+ || TF == SPII::MO_TLS_LDO_HIX22
+ || TF == SPII::MO_TLS_IE_HI22
+ || TF == SPII::MO_TLS_LE_HIX22) &&
"Invalid target flags for address operand on sethi");
+ else if (MI->getOpcode() == SP::TLS_CALL)
+ assert((TF == SPII::MO_NO_FLAG
+ || TF == SPII::MO_TLS_GD_CALL
+ || TF == SPII::MO_TLS_LDM_CALL) &&
+ "Cannot handle target flags on tls call address");
+ else if (MI->getOpcode() == SP::TLS_ADDrr)
+ assert((TF == SPII::MO_TLS_GD_ADD || TF == SPII::MO_TLS_LDM_ADD
+ || TF == SPII::MO_TLS_LDO_ADD || TF == SPII::MO_TLS_IE_ADD) &&
+ "Cannot handle target flags on add for TLS");
+ else if (MI->getOpcode() == SP::TLS_LDrr)
+ assert(TF == SPII::MO_TLS_IE_LD &&
+ "Cannot handle target flags on ld for TLS");
+ else if (MI->getOpcode() == SP::TLS_LDXrr)
+ assert(TF == SPII::MO_TLS_IE_LDX &&
+ "Cannot handle target flags on ldx for TLS");
+ else if (MI->getOpcode() == SP::XORri)
+ assert((TF == SPII::MO_TLS_LDO_LOX10 || TF == SPII::MO_TLS_LE_LOX10) &&
+ "Cannot handle target flags on xor for TLS");
else
- assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44 ||
- TF == SPII::MO_HM) &&
+ assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44
+ || TF == SPII::MO_HM
+ || TF == SPII::MO_TLS_GD_LO10
+ || TF == SPII::MO_TLS_LDM_LO10
+ || TF == SPII::MO_TLS_IE_LO10 ) &&
"Invalid target flags for small address operand");
}
#endif
@@ -102,6 +154,24 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case SPII::MO_L44: O << "%l44("; break;
case SPII::MO_HH: O << "%hh("; break;
case SPII::MO_HM: O << "%hm("; break;
+ case SPII::MO_TLS_GD_HI22: O << "%tgd_hi22("; break;
+ case SPII::MO_TLS_GD_LO10: O << "%tgd_lo10("; break;
+ case SPII::MO_TLS_GD_ADD: O << "%tgd_add("; break;
+ case SPII::MO_TLS_GD_CALL: O << "%tgd_call("; break;
+ case SPII::MO_TLS_LDM_HI22: O << "%tldm_hi22("; break;
+ case SPII::MO_TLS_LDM_LO10: O << "%tldm_lo10("; break;
+ case SPII::MO_TLS_LDM_ADD: O << "%tldm_add("; break;
+ case SPII::MO_TLS_LDM_CALL: O << "%tldm_call("; break;
+ case SPII::MO_TLS_LDO_HIX22: O << "%tldo_hix22("; break;
+ case SPII::MO_TLS_LDO_LOX10: O << "%tldo_lox10("; break;
+ case SPII::MO_TLS_LDO_ADD: O << "%tldo_add("; break;
+ case SPII::MO_TLS_IE_HI22: O << "%tie_hi22("; break;
+ case SPII::MO_TLS_IE_LO10: O << "%tie_lo10("; break;
+ case SPII::MO_TLS_IE_LD: O << "%tie_ld("; break;
+ case SPII::MO_TLS_IE_LDX: O << "%tie_ldx("; break;
+ case SPII::MO_TLS_IE_ADD: O << "%tie_add("; break;
+ case SPII::MO_TLS_LE_HIX22: O << "%tle_hix22("; break;
+ case SPII::MO_TLS_LE_LOX10: O << "%tle_lox10("; break;
}
switch (MO.getType()) {
@@ -116,7 +186,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
O << *MO.getMBB()->getSymbol();
return;
case MachineOperand::MO_GlobalAddress:
- O << *Mang->getSymbol(MO.getGlobal());
+ O << *getSymbol(MO.getGlobal());
break;
case MachineOperand::MO_BlockAddress:
O << GetBlockAddressSymbol(MO.getBlockAddress())->getName();
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index a181bcf..acd4ec2 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -117,3 +117,14 @@ def CC_Sparc64 : CallingConv<[
// arguments whether they are passed in registers or not.
CCCustom<"CC_Sparc64_Full">
]>;
+
+// Callee-saved registers are handled by the register window mechanism.
+def CSR : CalleeSavedRegs<(add)> {
+ let OtherPreserved = (add (sequence "I%u", 0, 7),
+ (sequence "L%u", 0, 7));
+}
+
+// Callee-saved registers for calls with ReturnsTwice attribute.
+def RTCSR : CalleeSavedRegs<(add)> {
+ let OtherPreserved = (add I6, I7);
+}
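Neither definition lists registers to spill (the register window already preserves the in and local registers); what TableGen generates from them is a pair of call-preserved register masks. How the companion SparcRegisterInfo change exposes those masks is not shown in this hunk; the sketch below is an assumption based on the usual pattern, with CSR_RegMask and RTCSR_RegMask being the names TableGen derives from the defs above:

    // Assumed wiring in SparcRegisterInfo.cpp (illustrative only):
    const uint32_t *
    SparcRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
      return CSR_RegMask;      // from "def CSR" above
    }

    const uint32_t *
    SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
      return RTCSR_RegMask;    // from "def RTCSR", for returns_twice callees
    }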
diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp
new file mode 100644
index 0000000..9bfe31f
--- /dev/null
+++ b/lib/Target/Sparc/SparcCodeEmitter.cpp
@@ -0,0 +1,245 @@
+//===-- Sparc/SparcCodeEmitter.cpp - Convert Sparc Code to Machine Code ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Sparc machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "Sparc.h"
+#include "MCTargetDesc/SparcBaseInfo.h"
+#include "SparcRelocations.h"
+#include "SparcTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class SparcCodeEmitter : public MachineFunctionPass {
+ SparcJITInfo *JTI;
+ const SparcInstrInfo *II;
+ const DataLayout *TD;
+ const SparcSubtarget *Subtarget;
+ TargetMachine &TM;
+ JITCodeEmitter &MCE;
+ const std::vector<MachineConstantPoolEntry> *MCPEs;
+ bool IsPIC;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo> ();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ static char ID;
+
+public:
+ SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+ : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
+ TM(tm), MCE(mce), MCPEs(0),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "Sparc Machine Code Emitter";
+ }
+
+ /// getBinaryCodeForInstr - This function, generated by the
+ /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+ /// machine instructions.
+ uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+ void emitInstruction(MachineBasicBlock::instr_iterator MI,
+ MachineBasicBlock &MBB);
+
+private:
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) const;
+
+ void emitWord(unsigned Word);
+
+ unsigned getRelocation(const MachineInstr &MI,
+ const MachineOperand &MO) const;
+
+ void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc) const;
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+ void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+};
+} // end anonymous namespace.
+
+char SparcCodeEmitter::ID = 0;
+
+bool SparcCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+ SparcTargetMachine &Target = static_cast<SparcTargetMachine &>(
+ const_cast<TargetMachine &>(MF.getTarget()));
+
+ JTI = Target.getJITInfo();
+ II = Target.getInstrInfo();
+ TD = Target.getDataLayout();
+ Subtarget = &TM.getSubtarget<SparcSubtarget> ();
+ MCPEs = &MF.getConstantPool()->getConstants();
+ JTI->Initialize(MF, IsPIC);
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+ do {
+ DEBUG(errs() << "JITTing function '"
+ << MF.getName() << "'\n");
+ MCE.startFunction(MF);
+
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB){
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+ E = MBB->instr_end(); I != E;)
+ emitInstruction(*I++, *MBB);
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+void SparcCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
+ MachineBasicBlock &MBB) {
+ DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
+
+ MCE.processDebugLoc(MI->getDebugLoc(), true);
+
+ ++NumEmitted;
+
+ switch (MI->getOpcode()) {
+ default: {
+ emitWord(getBinaryCodeForInstr(*MI));
+ break;
+ }
+ case TargetOpcode::INLINEASM: {
+ // We allow inline assembler nodes with empty bodies - they can
+ // implicitly define registers, which is ok for JIT.
+ if (MI->getOperand(0).getSymbolName()[0]) {
+ report_fatal_error("JIT does not support inline asm!");
+ }
+ break;
+ }
+ case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::EH_LABEL: {
+ MCE.emitLabel(MI->getOperand(0).getMCSymbol());
+ break;
+ }
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL: {
+ // Do nothing.
+ break;
+ }
+ case SP::GETPCX: {
+ report_fatal_error("JIT does not support pseudo instruction GETPCX yet!");
+ break;
+ }
+ }
+
+ MCE.processDebugLoc(MI->getDebugLoc(), false);
+}
+
+void SparcCodeEmitter::emitWord(unsigned Word) {
+ DEBUG(errs() << " 0x";
+ errs().write_hex(Word) << "\n");
+ MCE.emitWordBE(Word);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned SparcCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) const {
+ if (MO.isReg())
+ return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+ else if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
+ else if (MO.isGlobal())
+ emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO));
+ else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+ else if (MO.isMBB())
+ emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+ else
+ llvm_unreachable("Unable to encode MachineOperand!");
+ return 0;
+}
+unsigned SparcCodeEmitter::getRelocation(const MachineInstr &MI,
+ const MachineOperand &MO) const {
+
+ unsigned TF = MO.getTargetFlags();
+ switch (TF) {
+ default:
+ case SPII::MO_NO_FLAG: break;
+ case SPII::MO_LO: return SP::reloc_sparc_lo;
+ case SPII::MO_HI: return SP::reloc_sparc_hi;
+ case SPII::MO_H44:
+ case SPII::MO_M44:
+ case SPII::MO_L44:
+ case SPII::MO_HH:
+ case SPII::MO_HM: assert(0 && "FIXME: Implement Medium/Large code model.");
+ }
+
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default: break;
+ case SP::CALL: return SP::reloc_sparc_pc30;
+ case SP::BA:
+ case SP::BCOND:
+ case SP::FBCOND: return SP::reloc_sparc_pc22;
+ case SP::BPXCC: return SP::reloc_sparc_pc19;
+ }
+ llvm_unreachable("unknown reloc!");
+}
+
+void SparcCodeEmitter::emitGlobalAddress(const GlobalValue *GV,
+ unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV), 0,
+ true));
+}
+
+void SparcCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, 0, 0));
+}
+
+void SparcCodeEmitter::
+emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, 0, false));
+}
+
+void SparcCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+ unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ Reloc, BB));
+}
+
+
+/// createSparcJITCodeEmitterPass - Return a pass that emits the collected Sparc
+/// code to the specified MCE object.
+FunctionPass *llvm::createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new SparcCodeEmitter(TM, JCE);
+}
+
+#include "SparcGenCodeEmitter.inc"
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 536e466..c75998a 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -33,6 +33,51 @@ DisableLeafProc("disable-sparc-leaf-proc",
cl::Hidden);
+void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ int NumBytes,
+ unsigned ADDrr,
+ unsigned ADDri) const {
+
+ DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
+ const SparcInstrInfo &TII =
+ *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
+ if (NumBytes >= -4096 && NumBytes < 4096) {
+ BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
+ .addReg(SP::O6).addImm(NumBytes);
+ return;
+ }
+
+  // Emit this the hard way. This clobbers G1, which we always know is
+  // available here.
+ if (NumBytes >= 0) {
+ // Emit nonnegative numbers with sethi + or.
+ // sethi %hi(NumBytes), %g1
+ // or %g1, %lo(NumBytes), %g1
+ // add %sp, %g1, %sp
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+ .addImm(HI22(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+ .addReg(SP::G1).addImm(LO10(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+ .addReg(SP::O6).addReg(SP::G1);
+ return ;
+ }
+
+ // Emit negative numbers with sethi + xor.
+ // sethi %hix(NumBytes), %g1
+ // xor %g1, %lox(NumBytes), %g1
+ // add %sp, %g1, %sp
+ BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+ .addImm(HIX22(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
+ .addReg(SP::G1).addImm(LOX10(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+ .addReg(SP::O6).addReg(SP::G1);
+}
+
void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
@@ -55,21 +100,27 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
SAVErr = SP::ADDrr;
}
NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes);
-
- if (NumBytes >= -4096) {
- BuildMI(MBB, MBBI, dl, TII.get(SAVEri), SP::O6)
- .addReg(SP::O6).addImm(NumBytes);
- } else {
- // Emit this the hard way. This clobbers G1 which we always know is
- // available here.
- unsigned OffHi = (unsigned)NumBytes >> 10U;
- BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
- // Emit G1 = G1 + I6
- BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
- .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
- BuildMI(MBB, MBBI, dl, TII.get(SAVErr), SP::O6)
- .addReg(SP::O6).addReg(SP::G1);
- }
+ emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
+
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+ BuildMI(MBB, MBBI, dl, TII.get(SP::PROLOG_LABEL)).addSym(FrameLabel);
+
+ unsigned regFP = MRI->getDwarfRegNum(SP::I6, true);
+
+ // Emit ".cfi_def_cfa_register 30".
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
+ regFP));
+ // Emit ".cfi_window_save".
+ MMI.addFrameInst(MCCFIInstruction::createWindowSave(FrameLabel));
+
+ unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
+ unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
+ // Emit ".cfi_register 15, 31".
+ MMI.addFrameInst(MCCFIInstruction::createRegister(FrameLabel,
+ regOutRA,
+ regInRA));
}
void SparcFrameLowering::
@@ -77,15 +128,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
if (!hasReservedCallFrame(MF)) {
MachineInstr &MI = *I;
- DebugLoc DL = MI.getDebugLoc();
int Size = MI.getOperand(0).getImm();
if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
Size = -Size;
- const SparcInstrInfo &TII =
- *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
if (Size)
- BuildMI(MBB, I, DL, TII.get(SP::ADDri), SP::O6).addReg(SP::O6)
- .addImm(Size);
+ emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
}
MBB.erase(I);
}
@@ -112,21 +160,7 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
return;
NumBytes = SubTarget.getAdjustedFrameSize(NumBytes);
-
- if (NumBytes < 4096) {
- BuildMI(MBB, MBBI, dl, TII.get(SP::ADDri), SP::O6)
- .addReg(SP::O6).addImm(NumBytes);
- } else {
- // Emit this the hard way. This clobbers G1 which we always know is
- // available here.
- unsigned OffHi = (unsigned)NumBytes >> 10U;
- BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
- // Emit G1 = G1 + I6
- BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
- .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
- BuildMI(MBB, MBBI, dl, TII.get(SP::ADDrr), SP::O6)
- .addReg(SP::O6).addReg(SP::G1);
- }
+ emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
}
bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
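The comments inside emitSPAdjustment spell out the two slow-path sequences: sethi+or for non-negative adjustments and sethi+xor for negative ones. As a worked example, assume an adjustment of -8200 bytes, which misses the [-4096, 4096) fast path; the Sparc.h helpers then produce immediates that xor back to the full 32-bit value:

    #include "Sparc.h"
    #include <cstdint>
    #include <cstdio>

    // Sketch: the -8200 value is only an example, not taken from the patch.
    int main() {
      int64_t NumBytes = -8200;
      unsigned Hi = llvm::HIX22(NumBytes);   // sethi %hix(-8200), %g1
      unsigned Lo = llvm::LOX10(NumBytes);   // xor   %g1, %lox(-8200), %g1
      std::printf("0x%x ^ 0x%x = 0x%x\n",    // then add/save %sp, %g1, %sp
                  Hi << 10, Lo, (Hi << 10) ^ Lo);  // 0x2000 ^ 0xfffffff8 = 0xffffdff8
      return 0;
    }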
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 8eaef59..072fde3 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -49,6 +49,14 @@ private:
// Returns true if MF is a leaf procedure.
bool isLeafProc(MachineFunction &MF) const;
+
+
+ // Emits code for adjusting SP in function prologue/epilogue.
+ void emitSPAdjustment(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ int NumBytes, unsigned ADDrr, unsigned ADDri) const;
+
};
} // End llvm namespace
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index db62151..b012bfd 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -80,7 +80,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
return true;
}
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress)
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
return false; // direct calls.
if (Addr.getOpcode() == ISD::ADD) {
@@ -117,7 +118,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
if (Addr.getOpcode() == ISD::FrameIndex) return false;
if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
- Addr.getOpcode() == ISD::TargetGlobalAddress)
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
return false; // direct calls.
if (Addr.getOpcode() == ISD::ADD) {
@@ -139,8 +141,10 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
- if (N->isMachineOpcode())
+ if (N->isMachineOpcode()) {
+ N->setNodeId(-1);
return NULL; // Already selected.
+ }
switch (N->getOpcode()) {
default: break;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 4b0fa67..64625f7 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -14,6 +14,7 @@
#include "SparcISelLowering.h"
#include "SparcMachineFunctionInfo.h"
+#include "SparcRegisterInfo.h"
#include "SparcTargetMachine.h"
#include "MCTargetDesc/SparcBaseInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -648,6 +649,27 @@ SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
return LowerCall_32(CLI, InVals);
}
+static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
+ ImmutableCallSite *CS) {
+ if (CS)
+ return CS->hasFnAttr(Attribute::ReturnsTwice);
+
+ const Function *CalleeFn = 0;
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ CalleeFn = dyn_cast<Function>(G->getGlobal());
+ } else if (ExternalSymbolSDNode *E =
+ dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ const Function *Fn = DAG.getMachineFunction().getFunction();
+ const Module *M = Fn->getParent();
+ const char *CalleeName = E->getSymbol();
+ CalleeFn = M->getFunction(CalleeName);
+ }
+
+ if (!CalleeFn)
+ return false;
+ return CalleeFn->hasFnAttribute(Attribute::ReturnsTwice);
+}
+
// Lower a call for the 32-bit ABI.
SDValue
SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
@@ -861,6 +883,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+ bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
@@ -880,6 +903,16 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const SparcRegisterInfo *TRI =
+ ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+ const uint32_t *Mask = ((hasReturnsTwice)
+ ? TRI->getRTCallPreservedMask(CallConv)
+ : TRI->getCallPreservedMask(CallConv));
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
if (InFlag.getNode())
Ops.push_back(InFlag);
@@ -908,6 +941,23 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
return Chain;
}
+// This function returns true if CalleeName is an ABI function that returns
+// a long double (fp128).
+static bool isFP128ABICall(const char *CalleeName)
+{
+ static const char *const ABICalls[] =
+ { "_Q_add", "_Q_sub", "_Q_mul", "_Q_div",
+ "_Q_sqrt", "_Q_neg",
+ "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
+ "_Q_lltoq", "_Q_ulltoq",
+ 0
+ };
+ for (const char * const *I = ABICalls; *I != 0; ++I)
+ if (strcmp(CalleeName, *I) == 0)
+ return true;
+ return false;
+}
+
unsigned
SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
{
@@ -918,7 +968,10 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
dyn_cast<ExternalSymbolSDNode>(Callee)) {
const Function *Fn = DAG.getMachineFunction().getFunction();
const Module *M = Fn->getParent();
- CalleeFn = M->getFunction(E->getSymbol());
+ const char *CalleeName = E->getSymbol();
+ CalleeFn = M->getFunction(CalleeName);
+ if (!CalleeFn && isFP128ABICall(CalleeName))
+ return 16; // Return sizeof(fp128)
}
if (!CalleeFn)
@@ -983,6 +1036,9 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
SDLoc DL = CLI.DL;
SDValue Chain = CLI.Chain;
+ // Sparc target does not yet support tail call optimization.
+ CLI.IsTailCall = false;
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
@@ -1099,6 +1155,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
SDValue Callee = CLI.Callee;
+ bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy());
else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -1112,6 +1169,15 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
+ // Add a register mask operand representing the call-preserved registers.
+ const SparcRegisterInfo *TRI =
+ ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+ const uint32_t *Mask = ((hasReturnsTwice)
+ ? TRI->getRTCallPreservedMask(CLI.CallConv)
+ : TRI->getCallPreservedMask(CLI.CallConv));
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
// Make sure the CopyToReg nodes are glued to the call instruction which
// consumes the registers.
if (InGlue.getNode())
@@ -1244,15 +1310,21 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+ addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
if (Subtarget->is64Bit())
addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
// Turn FP extload into load/fextend
setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+
// Sparc doesn't have i1 sign extending load
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
// Turn FP truncstore into trunc + store.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
// Custom legalize GlobalAddress nodes into LO/HI parts.
setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom);
@@ -1271,13 +1343,25 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+ // ... nor does SparcV9.
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+ }
+
// Custom expand fp<->sint
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
- // Expand fp<->uint
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+ // Custom Expand fp<->uint
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -1286,9 +1370,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SELECT, MVT::i32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::SETCC, MVT::f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::f128, Expand);
// Sparc doesn't have BRCOND either, it has BR_CC.
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -1297,18 +1384,34 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ADDC, MVT::i64, Custom);
+ setOperationAction(ISD::ADDE, MVT::i64, Custom);
+ setOperationAction(ISD::SUBC, MVT::i64, Custom);
+ setOperationAction(ISD::SUBE, MVT::i64, Custom);
setOperationAction(ISD::BITCAST, MVT::f64, Expand);
setOperationAction(ISD::BITCAST, MVT::i64, Expand);
setOperationAction(ISD::SELECT, MVT::i64, Expand);
setOperationAction(ISD::SETCC, MVT::i64, Expand);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+ setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+ setOperationAction(ISD::ROTL , MVT::i64, Expand);
+ setOperationAction(ISD::ROTR , MVT::i64, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
}
// FIXME: There are instructions available for ATOMIC_FENCE
@@ -1321,6 +1424,11 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FABS, MVT::f64, Custom);
}
+ setOperationAction(ISD::FSIN , MVT::f128, Expand);
+ setOperationAction(ISD::FCOS , MVT::f128, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FREM , MVT::f128, Expand);
+ setOperationAction(ISD::FMA , MVT::f128, Expand);
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
@@ -1339,8 +1447,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW , MVT::f128, Expand);
setOperationAction(ISD::FPOW , MVT::f64, Expand);
setOperationAction(ISD::FPOW , MVT::f32, Expand);
@@ -1352,7 +1462,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
- setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::MULHU, MVT::i64, Expand);
+ setOperationAction(ISD::MULHS, MVT::i64, Expand);
+ }
// VASTART needs to be custom lowered to use the VarArgsFrameIndex.
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -1366,14 +1481,100 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom);
- // No debug info support yet.
- setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+ setExceptionPointerRegister(SP::I0);
+ setExceptionSelectorRegister(SP::I1);
setStackPointerRegisterToSaveRestore(SP::O6);
if (Subtarget->isV9())
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+ if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
+ setOperationAction(ISD::LOAD, MVT::f128, Legal);
+ setOperationAction(ISD::STORE, MVT::f128, Legal);
+ } else {
+ setOperationAction(ISD::LOAD, MVT::f128, Custom);
+ setOperationAction(ISD::STORE, MVT::f128, Custom);
+ }
+
+ if (Subtarget->hasHardQuad()) {
+ setOperationAction(ISD::FADD, MVT::f128, Legal);
+ setOperationAction(ISD::FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::FSQRT, MVT::f128, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+ if (Subtarget->isV9()) {
+ setOperationAction(ISD::FNEG, MVT::f128, Legal);
+ setOperationAction(ISD::FABS, MVT::f128, Legal);
+ } else {
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+ }
+
+ if (!Subtarget->is64Bit()) {
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+ }
+
+ } else {
+ // Custom legalize f128 operations.
+
+ setOperationAction(ISD::FADD, MVT::f128, Custom);
+ setOperationAction(ISD::FSUB, MVT::f128, Custom);
+ setOperationAction(ISD::FMUL, MVT::f128, Custom);
+ setOperationAction(ISD::FDIV, MVT::f128, Custom);
+ setOperationAction(ISD::FSQRT, MVT::f128, Custom);
+ setOperationAction(ISD::FNEG, MVT::f128, Custom);
+ setOperationAction(ISD::FABS, MVT::f128, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+
+ // Setup Runtime library names.
+ if (Subtarget->is64Bit()) {
+ setLibcallName(RTLIB::ADD_F128, "_Qp_add");
+ setLibcallName(RTLIB::SUB_F128, "_Qp_sub");
+ setLibcallName(RTLIB::MUL_F128, "_Qp_mul");
+ setLibcallName(RTLIB::DIV_F128, "_Qp_div");
+ setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
+ setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
+ setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
+ setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
+ setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
+ setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
+ setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
+ setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
+ } else {
+ setLibcallName(RTLIB::ADD_F128, "_Q_add");
+ setLibcallName(RTLIB::SUB_F128, "_Q_sub");
+ setLibcallName(RTLIB::MUL_F128, "_Q_mul");
+ setLibcallName(RTLIB::DIV_F128, "_Q_div");
+ setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
+ setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
+ setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
+ setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
+ setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
+ setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+ setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+ setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+ setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+ setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
+ setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
+ setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
+ setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
+ }
+ }
+
setMinFunctionAlignment(2);
computeRegisterProperties();
@@ -1394,13 +1595,24 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
case SPISD::Lo: return "SPISD::Lo";
case SPISD::FTOI: return "SPISD::FTOI";
case SPISD::ITOF: return "SPISD::ITOF";
+ case SPISD::FTOX: return "SPISD::FTOX";
+ case SPISD::XTOF: return "SPISD::XTOF";
case SPISD::CALL: return "SPISD::CALL";
case SPISD::RET_FLAG: return "SPISD::RET_FLAG";
case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
case SPISD::FLUSHW: return "SPISD::FLUSHW";
+ case SPISD::TLS_ADD: return "SPISD::TLS_ADD";
+ case SPISD::TLS_LD: return "SPISD::TLS_LD";
+ case SPISD::TLS_CALL: return "SPISD::TLS_CALL";
}
}
+EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+ if (!VT.isVector())
+ return MVT::i32;
+ return VT.changeVectorElementTypeToInteger();
+}
+
/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
/// be zero. Op is expected to be a target specific node. Used by DAG
/// combiner.
@@ -1505,6 +1717,10 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue HiLo = makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+  // GLOBAL_BASE_REG is codegen'ed with a call; inform MFI that this
+  // function has calls.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setHasCalls(true);
return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
MachinePointerInfo::getGOT(), false, false, false, 0);
}
@@ -1513,6 +1729,7 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
switch(getTargetMachine().getCodeModel()) {
default:
llvm_unreachable("Unsupported absolute code model");
+ case CodeModel::JITDefault:
case CodeModel::Small:
// abs32.
return makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
@@ -1549,23 +1766,430 @@ SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
return makeAddress(Op, DAG);
}
-static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+ SDLoc DL(GA);
+ const GlobalValue *GV = GA->getGlobal();
+ EVT PtrVT = getPointerTy();
+
+ TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+ if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+ unsigned HiTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_HI22
+ : SPII::MO_TLS_LDM_HI22);
+ unsigned LoTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_LO10
+ : SPII::MO_TLS_LDM_LO10);
+ unsigned addTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_ADD
+ : SPII::MO_TLS_LDM_ADD);
+ unsigned callTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_CALL
+ : SPII::MO_TLS_LDM_CALL);
+
+ SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
+ SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+ SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
+ withTargetFlags(Op, addTF, DAG));
+
+ SDValue Chain = DAG.getEntryNode();
+ SDValue InFlag;
+
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, true), DL);
+ Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
+ InFlag = Chain.getValue(1);
+ SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
+ SDValue Symbol = withTargetFlags(Op, callTF, DAG);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ Ops.push_back(Symbol);
+ Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
+ const uint32_t *Mask = getTargetMachine()
+ .getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, true),
+ DAG.getIntPtrConstant(0, true), InFlag, DL);
+ InFlag = Chain.getValue(1);
+ SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
+
+ if (model != TLSModel::LocalDynamic)
+ return Ret;
+
+ SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+ withTargetFlags(Op, SPII::MO_TLS_LDO_HIX22, DAG));
+ SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+ withTargetFlags(Op, SPII::MO_TLS_LDO_LOX10, DAG));
+ HiLo = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+ return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
+ withTargetFlags(Op, SPII::MO_TLS_LDO_ADD, DAG));
+ }
+
+ if (model == TLSModel::InitialExec) {
+ unsigned ldTF = ((PtrVT == MVT::i64)? SPII::MO_TLS_IE_LDX
+ : SPII::MO_TLS_IE_LD);
+
+ SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+
+    // GLOBAL_BASE_REG is codegen'ed with a call; inform MFI that this
+    // function has calls.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setHasCalls(true);
+
+ SDValue TGA = makeHiLoPair(Op,
+ SPII::MO_TLS_IE_HI22, SPII::MO_TLS_IE_LO10, DAG);
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
+ SDValue Offset = DAG.getNode(SPISD::TLS_LD,
+ DL, PtrVT, Ptr,
+ withTargetFlags(Op, ldTF, DAG));
+ return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
+ DAG.getRegister(SP::G7, PtrVT), Offset,
+ withTargetFlags(Op, SPII::MO_TLS_IE_ADD, DAG));
+ }
+
+ assert(model == TLSModel::LocalExec);
+ SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+ withTargetFlags(Op, SPII::MO_TLS_LE_HIX22, DAG));
+ SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+ withTargetFlags(Op, SPII::MO_TLS_LE_LOX10, DAG));
+ SDValue Offset = DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+
+ return DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getRegister(SP::G7, PtrVT), Offset);
+}
+
+SDValue
+SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+ SDValue Arg, SDLoc DL,
+ SelectionDAG &DAG) const {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ ArgListEntry Entry;
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+
+ if (ArgTy->isFP128Ty()) {
+ // Create a stack object and pass the pointer to the library function.
+ int FI = MFI->CreateStackObject(16, 8, false);
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+ Chain = DAG.getStore(Chain,
+ DL,
+ Entry.Node,
+ FIPtr,
+ MachinePointerInfo(),
+ false,
+ false,
+ 8);
+
+ Entry.Node = FIPtr;
+ Entry.Ty = PointerType::getUnqual(ArgTy);
+ }
+ Args.push_back(Entry);
+ return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
+ const char *LibFuncName,
+ unsigned numArgs) const {
+
+ ArgListTy Args;
+
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+ SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy());
+ Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+ Type *RetTyABI = RetTy;
+ SDValue Chain = DAG.getEntryNode();
+ SDValue RetPtr;
+
+ if (RetTy->isFP128Ty()) {
+ // Create a Stack Object to receive the return value of type f128.
+ ArgListEntry Entry;
+ int RetFI = MFI->CreateStackObject(16, 8, false);
+ RetPtr = DAG.getFrameIndex(RetFI, getPointerTy());
+ Entry.Node = RetPtr;
+ Entry.Ty = PointerType::getUnqual(RetTy);
+ if (!Subtarget->is64Bit())
+ Entry.isSRet = true;
+ Entry.isReturned = false;
+ Args.push_back(Entry);
+ RetTyABI = Type::getVoidTy(*DAG.getContext());
+ }
+
+ assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
+ for (unsigned i = 0, e = numArgs; i != e; ++i) {
+ Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
+ }
+ TargetLowering::
+ CallLoweringInfo CLI(Chain,
+ RetTyABI,
+ false, false, false, false,
+ 0, CallingConv::C,
+ false, false, true,
+ Callee, Args, DAG, SDLoc(Op));
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // The chain is the second result.
+ if (RetTyABI == RetTy)
+ return CallInfo.first;
+
+ assert (RetTy->isFP128Ty() && "Unexpected return type!");
+
+ Chain = CallInfo.second;
+
+ // Load RetPtr to get the return value.
+ return DAG.getLoad(Op.getValueType(),
+ SDLoc(Op),
+ Chain,
+ RetPtr,
+ MachinePointerInfo(),
+ false, false, false, 8);
+}
+
+SDValue
+SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+ unsigned &SPCC,
+ SDLoc DL,
+ SelectionDAG &DAG) const {
+
+ const char *LibCall = 0;
+ bool is64Bit = Subtarget->is64Bit();
+ switch(SPCC) {
+ default: llvm_unreachable("Unhandled conditional code!");
+ case SPCC::FCC_E : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
+ case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
+ case SPCC::FCC_L : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
+ case SPCC::FCC_G : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
+ case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
+ case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
+ case SPCC::FCC_UL :
+ case SPCC::FCC_ULE:
+ case SPCC::FCC_UG :
+ case SPCC::FCC_UGE:
+ case SPCC::FCC_U :
+ case SPCC::FCC_O :
+ case SPCC::FCC_LG :
+ case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
+ }
+
+ SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy());
+ Type *RetTy = Type::getInt32Ty(*DAG.getContext());
+ ArgListTy Args;
+ SDValue Chain = DAG.getEntryNode();
+ Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
+ Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
+
+ TargetLowering::
+ CallLoweringInfo CLI(Chain,
+ RetTy,
+ false, false, false, false,
+ 0, CallingConv::C,
+ false, false, true,
+ Callee, Args, DAG, DL);
+
+ std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // The comparison result is the first value; the chain is the second.
+ SDValue Result = CallInfo.first;
+
+ switch(SPCC) {
+ default: {
+ SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+ SPCC = SPCC::ICC_NE;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_UL : {
+ SDValue Mask = DAG.getTargetConstant(1, Result.getValueType());
+ Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+ SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+ SPCC = SPCC::ICC_NE;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_ULE: {
+ SDValue RHS = DAG.getTargetConstant(2, Result.getValueType());
+ SPCC = SPCC::ICC_NE;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_UG : {
+ SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+ SPCC = SPCC::ICC_G;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_UGE: {
+ SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+ SPCC = SPCC::ICC_NE;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+
+ case SPCC::FCC_U : {
+ SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+ SPCC = SPCC::ICC_E;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_O : {
+ SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+ SPCC = SPCC::ICC_NE;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_LG : {
+ SDValue Mask = DAG.getTargetConstant(3, Result.getValueType());
+ Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+ SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+ SPCC = SPCC::ICC_NE;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ case SPCC::FCC_UE : {
+ SDValue Mask = DAG.getTargetConstant(3, Result.getValueType());
+ Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+ SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+ SPCC = SPCC::ICC_E;
+ return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+ }
+ }
+}
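For the unordered conditions the lowering above calls _Q_cmp/_Qp_cmp and then re-compares the integer result. The constants used as masks and compare operands suggest an encoding of 0 = equal, 1 = less, 2 = greater, 3 = unordered; assuming that encoding (it is not spelled out in the patch), the re-comparisons for a subset of the cases correspond to the predicates below.

    // Hedged decoding sketch; covers only the cases that go through
    // _Q_cmp/_Qp_cmp, under the assumed 0/1/2/3 result encoding.
    #include <cassert>

    bool decodeUL (int r) { return (r & 1) != 0; } // FCC_UL : less or unordered
    bool decodeULE(int r) { return r != 2; }       // FCC_ULE: anything but greater
    bool decodeUG (int r) { return r > 1; }        // FCC_UG : greater or unordered
    bool decodeUGE(int r) { return r != 1; }       // FCC_UGE: anything but less
    bool decodeU  (int r) { return r == 3; }       // FCC_U  : unordered only
    bool decodeO  (int r) { return r != 3; }       // FCC_O  : ordered only

    int main() {
      assert(decodeUL(1) && decodeUL(3) && !decodeUL(2));
      assert(decodeU(3) && !decodeU(0) && decodeO(0));
      return 0;
    }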
+
+static SDValue
+LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) {
+
+ if (Op.getOperand(0).getValueType() == MVT::f64)
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(RTLIB::FPEXT_F64_F128), 1);
+
+ if (Op.getOperand(0).getValueType() == MVT::f32)
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1);
+
+ llvm_unreachable("fpextend with non-float operand!");
+ return SDValue(0, 0);
+}
+
+static SDValue
+LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI) {
+ // FP_ROUND on f64 and f32 are legal.
+ if (Op.getOperand(0).getValueType() != MVT::f128)
+ return Op;
+
+ if (Op.getValueType() == MVT::f64)
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(RTLIB::FPROUND_F128_F64), 1);
+ if (Op.getValueType() == MVT::f32)
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1);
+
+ llvm_unreachable("fpround to non-float!");
+ return SDValue(0, 0);
+}
+
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::i32 || VT == MVT::i64);
+
+  // Expand f128 operations to fp128 ABI calls.
+ if (Op.getOperand(0).getValueType() == MVT::f128
+ && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
+ const char *libName = TLI.getLibcallName(VT == MVT::i32
+ ? RTLIB::FPTOSINT_F128_I32
+ : RTLIB::FPTOSINT_F128_I64);
+ return TLI.LowerF128Op(Op, DAG, libName, 1);
+ }
+
+ // Expand if the resulting type is illegal.
+ if (!TLI.isTypeLegal(VT))
+ return SDValue(0, 0);
+
+  // Otherwise, convert the FP value to an integer in an FP register.
+ if (VT == MVT::i32)
+ Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+ else
+ Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
+ SDLoc dl(Op);
+ EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+ EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+ // Expand f128 operations to fp128 ABI calls.
+ if (Op.getValueType() == MVT::f128
+ && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
+ const char *libName = TLI.getLibcallName(OpVT == MVT::i32
+ ? RTLIB::SINTTOFP_I32_F128
+ : RTLIB::SINTTOFP_I64_F128);
+ return TLI.LowerF128Op(Op, DAG, libName, 1);
+ }
+
+ // Expand if the operand type is illegal.
+ if (!TLI.isTypeLegal(OpVT))
+ return SDValue(0, 0);
+
+  // Otherwise, convert the integer value to FP in an FP register.
+ SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+ unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
+ return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
SDLoc dl(Op);
- // Convert the fp value to integer in an FP register.
- assert(Op.getValueType() == MVT::i32);
- Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
- return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+ EVT VT = Op.getValueType();
+
+  // Expand if this does not involve f128, or if the target supports quad
+  // floating-point instructions and the result type is legal.
+ if (Op.getOperand(0).getValueType() != MVT::f128 ||
+ (hasHardQuad && TLI.isTypeLegal(VT)))
+ return SDValue(0, 0);
+
+ assert(VT == MVT::i32 || VT == MVT::i64);
+
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(VT == MVT::i32
+ ? RTLIB::FPTOUINT_F128_I32
+ : RTLIB::FPTOUINT_F128_I64),
+ 1);
}
-static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
SDLoc dl(Op);
- assert(Op.getOperand(0).getValueType() == MVT::i32);
- SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
- // Convert the int value to FP in an FP register.
- return DAG.getNode(SPISD::ITOF, dl, Op.getValueType(), Tmp);
+ EVT OpVT = Op.getOperand(0).getValueType();
+ assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+  // Expand if this does not involve f128, or if the target supports quad
+  // floating-point instructions and the operand type is legal.
+ if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
+ return SDValue(0, 0);
+
+ return TLI.LowerF128Op(Op, DAG,
+ TLI.getLibcallName(OpVT == MVT::i32
+ ? RTLIB::UINTTOFP_I32_F128
+ : RTLIB::UINTTOFP_I64_F128),
+ 1);
}
-static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
@@ -1586,15 +2210,23 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
// 32-bit compares use the icc flags, 64-bit uses the xcc flags.
Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
} else {
- CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
- if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
- Opc = SPISD::BRFCC;
+ if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+ Opc = SPISD::BRICC;
+ } else {
+ CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ Opc = SPISD::BRFCC;
+ }
}
return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
DAG.getConstant(SPCC, MVT::i32), CompareFlag);
}
-static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool hasHardQuad) {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -1614,9 +2246,15 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
SPISD::SELECT_ICC : SPISD::SELECT_XCC;
if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
} else {
- CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
- Opc = SPISD::SELECT_FCC;
- if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+ Opc = SPISD::SELECT_ICC;
+ } else {
+ CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+ Opc = SPISD::SELECT_FCC;
+ if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+ }
}
return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
DAG.getConstant(SPCC, MVT::i32), CompareFlag);
@@ -1665,20 +2303,25 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
std::min(PtrVT.getSizeInBits(), VT.getSizeInBits())/8);
}
-static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+ const SparcSubtarget *Subtarget) {
SDValue Chain = Op.getOperand(0); // Legalize the chain.
SDValue Size = Op.getOperand(1); // Legalize the size.
+ EVT VT = Size->getValueType(0);
SDLoc dl(Op);
unsigned SPReg = SP::O6;
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
- SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size); // Value
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP); // Output chain
// The resultant pointer is actually 16 words from the bottom of the stack,
// to provide a register spill area.
- SDValue NewVal = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
- DAG.getConstant(96, MVT::i32));
+ unsigned regSpillArea = Subtarget->is64Bit() ? 128 : 96;
+ regSpillArea += Subtarget->getStackPointerBias();
+
+ SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
+ DAG.getConstant(regSpillArea, VT));
SDValue Ops[2] = { NewVal, Chain };
return DAG.getMergeValues(Ops, 2, dl);
}
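The returned pointer is therefore offset from the new stack pointer by the reserved register spill area plus the stack-pointer bias. Written out as plain arithmetic below; the 2047 value is an assumption based on the usual SPARC V9 bias, since the patch only calls getStackPointerBias() without showing the constant.

    // Hedged sketch of the offset applied to the DYNAMIC_STACKALLOC result.
    #include <cassert>

    unsigned allocaResultOffset(bool is64Bit, unsigned stackPointerBias) {
      unsigned regSpillArea = is64Bit ? 128 : 96;   // reserved area above %sp
      return regSpillArea + stackPointerBias;
    }

    int main() {
      assert(allocaResultOffset(false, 0) == 96);      // 32-bit: no bias
      assert(allocaResultOffset(true, 2047) == 2175);  // assumed V9 bias
      return 0;
    }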
@@ -1759,12 +2402,12 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
return RetAddr;
}
-static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
+static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG, unsigned opcode)
{
SDLoc dl(Op);
assert(Op.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
- assert(Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS);
+ assert(opcode == ISD::FNEG || opcode == ISD::FABS);
// Lower fneg/fabs on f64 to fneg/fabs on f32.
// fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
@@ -1776,7 +2419,7 @@ static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
SrcReg64);
- Hi32 = DAG.getNode(Op.getOpcode(), dl, MVT::f32, Hi32);
+ Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
dl, MVT::f64), 0);
@@ -1787,28 +2430,244 @@ static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG)
return DstReg64;
}
+// Lower an f128 load into two f64 loads.
+static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
+{
+ SDLoc dl(Op);
+ LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+ assert(LdNode && LdNode->getOffset().getOpcode() == ISD::UNDEF
+ && "Unexpected node type");
+
+ unsigned alignment = LdNode->getAlignment();
+ if (alignment > 8)
+ alignment = 8;
+
+ SDValue Hi64 = DAG.getLoad(MVT::f64,
+ dl,
+ LdNode->getChain(),
+ LdNode->getBasePtr(),
+ LdNode->getPointerInfo(),
+ false, false, false, alignment);
+ EVT addrVT = LdNode->getBasePtr().getValueType();
+ SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+ LdNode->getBasePtr(),
+ DAG.getConstant(8, addrVT));
+ SDValue Lo64 = DAG.getLoad(MVT::f64,
+ dl,
+ LdNode->getChain(),
+ LoPtr,
+ LdNode->getPointerInfo(),
+ false, false, false, alignment);
+
+ SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+ SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+ SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, MVT::f128);
+ InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+ MVT::f128,
+ SDValue(InFP128, 0),
+ Hi64,
+ SubRegEven);
+ InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+ MVT::f128,
+ SDValue(InFP128, 0),
+ Lo64,
+ SubRegOdd);
+ SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
+ SDValue(Lo64.getNode(), 1) };
+ SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], 2);
+ SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
+ return DAG.getMergeValues(Ops, 2, dl);
+}
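LowerF128Load (and LowerF128Store below) split one 16-byte access into two 8-byte accesses: the half the code names Hi64 at the base address and Lo64 at base+8, with the original alignment clamped to 8. A small host-side sketch of the same address arithmetic, with nothing SPARC-specific assumed:

    #include <cassert>
    #include <cstring>

    // Read a 16-byte value as two 8-byte halves at offsets 0 and 8,
    // mirroring the Hi64/Lo64 loads built above.
    void load_as_two_halves(const void *base, double &hi, double &lo) {
      std::memcpy(&hi, base, 8);                                  // offset 0
      std::memcpy(&lo, static_cast<const char *>(base) + 8, 8);   // offset 8
    }

    int main() {
      alignas(8) unsigned char buf[16];
      double h = 1.5, l = -2.25;
      std::memcpy(buf, &h, 8);
      std::memcpy(buf + 8, &l, 8);
      double hi, lo;
      load_as_two_halves(buf, hi, lo);
      assert(hi == 1.5 && lo == -2.25);
      return 0;
    }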
+
+// Lower an f128 store into two f64 stores.
+static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+ assert(StNode && StNode->getOffset().getOpcode() == ISD::UNDEF
+ && "Unexpected node type");
+ SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+ SDValue SubRegOdd = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+ SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ dl,
+ MVT::f64,
+ StNode->getValue(),
+ SubRegEven);
+ SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ dl,
+ MVT::f64,
+ StNode->getValue(),
+ SubRegOdd);
+
+ unsigned alignment = StNode->getAlignment();
+ if (alignment > 8)
+ alignment = 8;
+
+ SDValue OutChains[2];
+ OutChains[0] = DAG.getStore(StNode->getChain(),
+ dl,
+ SDValue(Hi64, 0),
+ StNode->getBasePtr(),
+ MachinePointerInfo(),
+ false, false, alignment);
+ EVT addrVT = StNode->getBasePtr().getValueType();
+ SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+ StNode->getBasePtr(),
+ DAG.getConstant(8, addrVT));
+ OutChains[1] = DAG.getStore(StNode->getChain(),
+ dl,
+ SDValue(Lo64, 0),
+ LoPtr,
+ MachinePointerInfo(),
+ false, false, alignment);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &OutChains[0], 2);
+}
+
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG,
+ const SparcTargetLowering &TLI,
+ bool is64Bit) {
+ if (Op.getValueType() == MVT::f64)
+ return LowerF64Op(Op, DAG, ISD::FNEG);
+ if (Op.getValueType() == MVT::f128)
+ return TLI.LowerF128Op(Op, DAG, ((is64Bit) ? "_Qp_neg" : "_Q_neg"), 1);
+ return Op;
+}
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+ if (Op.getValueType() == MVT::f64)
+ return LowerF64Op(Op, DAG, ISD::FABS);
+ if (Op.getValueType() != MVT::f128)
+ return Op;
+
+ // Lower fabs on f128 to fabs on f64
+ // fabs f128 => fabs f64:sub_even64, fmov f64:sub_odd64
+
+ SDLoc dl(Op);
+ SDValue SrcReg128 = Op.getOperand(0);
+ SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
+ SrcReg128);
+ SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
+ SrcReg128);
+ if (isV9)
+ Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+ else
+ Hi64 = LowerF64Op(Hi64, DAG, ISD::FABS);
+
+ SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ dl, MVT::f128), 0);
+ DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
+ DstReg128, Hi64);
+ DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
+ DstReg128, Lo64);
+ return DstReg128;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+
+ if (Op.getValueType() != MVT::i64)
+ return Op;
+
+ SDLoc dl(Op);
+ SDValue Src1 = Op.getOperand(0);
+ SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
+ SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
+ DAG.getConstant(32, MVT::i64));
+ Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
+
+ SDValue Src2 = Op.getOperand(1);
+ SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
+ SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
+ DAG.getConstant(32, MVT::i64));
+ Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
+
+
+ bool hasChain = false;
+ unsigned hiOpc = Op.getOpcode();
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case ISD::ADDC: hiOpc = ISD::ADDE; break;
+ case ISD::ADDE: hasChain = true; break;
+ case ISD::SUBC: hiOpc = ISD::SUBE; break;
+ case ISD::SUBE: hasChain = true; break;
+ }
+ SDValue Lo;
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
+ if (hasChain) {
+ Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
+ Op.getOperand(2));
+ } else {
+ Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
+ }
+ SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
+ SDValue Carry = Hi.getValue(1);
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
+ Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
+ DAG.getConstant(32, MVT::i64));
+
+ SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
+ SDValue Ops[2] = { Dst, Carry };
+ return DAG.getMergeValues(Ops, 2, dl);
+}
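LowerADDC_ADDE_SUBC_SUBE splits a 64-bit carry-producing add or subtract into two 32-bit halves: the low halves go through the original opcode, the resulting carry feeds the high halves, and the pieces are zero-extended, shifted, and OR-ed back together. A plain C++ illustration of the same decomposition for an add (host code, not target code):

    #include <cassert>
    #include <cstdint>

    // Add the low words first, feed the carry into the high words,
    // then reassemble the 64-bit result.
    uint64_t add64_via_32(uint64_t a, uint64_t b) {
      uint32_t aLo = uint32_t(a), aHi = uint32_t(a >> 32);
      uint32_t bLo = uint32_t(b), bHi = uint32_t(b >> 32);

      uint32_t lo = aLo + bLo;                 // ADDCC: produces the carry
      uint32_t carry = lo < aLo ? 1 : 0;       // carry out of the low word
      uint32_t hi = aHi + bHi + carry;         // ADDX/ADDE: consumes the carry

      return (uint64_t(hi) << 32) | lo;        // zext, shl, or
    }

    int main() {
      assert(add64_via_32(0xFFFFFFFFull, 1) == 0x100000000ull);
      assert(add64_via_32(0x1234567890ull, 0xABCDEFull) ==
             0x1234567890ull + 0xABCDEFull);
      return 0;
    }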
+
SDValue SparcTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+
+ bool hasHardQuad = Subtarget->hasHardQuad();
+ bool is64Bit = Subtarget->is64Bit();
+ bool isV9 = Subtarget->isV9();
+
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
- case ISD::FNEG:
- case ISD::FABS: return LowerF64Op(Op, DAG);
-
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG, *this);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
- case ISD::GlobalTLSAddress:
- llvm_unreachable("TLS not implemented for Sparc.");
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
- case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
- case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
- case ISD::BR_CC: return LowerBR_CC(Op, DAG);
- case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG, *this,
+ hasHardQuad);
+ case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG, *this,
+ hasHardQuad);
case ISD::VASTART: return LowerVASTART(Op, DAG, *this);
case ISD::VAARG: return LowerVAARG(Op, DAG);
- case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
+ Subtarget);
+
+ case ISD::LOAD: return LowerF128Load(Op, DAG);
+ case ISD::STORE: return LowerF128Store(Op, DAG);
+ case ISD::FADD: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::ADD_F128), 2);
+ case ISD::FSUB: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::SUB_F128), 2);
+ case ISD::FMUL: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::MUL_F128), 2);
+ case ISD::FDIV: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::DIV_F128), 2);
+ case ISD::FSQRT: return LowerF128Op(Op, DAG,
+ getLibcallName(RTLIB::SQRT_F128),1);
+ case ISD::FNEG: return LowerFNEG(Op, DAG, *this, is64Bit);
+ case ISD::FABS: return LowerFABS(Op, DAG, isV9);
+ case ISD::FP_EXTEND: return LowerF128_FPEXTEND(Op, DAG, *this);
+ case ISD::FP_ROUND: return LowerF128_FPROUND(Op, DAG, *this);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
}
}
@@ -1825,11 +2684,13 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case SP::SELECT_CC_Int_ICC:
case SP::SELECT_CC_FP_ICC:
case SP::SELECT_CC_DFP_ICC:
+ case SP::SELECT_CC_QFP_ICC:
BROpcode = SP::BCOND;
break;
case SP::SELECT_CC_Int_FCC:
case SP::SELECT_CC_FP_FCC:
case SP::SELECT_CC_DFP_FCC:
+ case SP::SELECT_CC_QFP_FCC:
BROpcode = SP::FBCOND;
break;
}
@@ -1924,3 +2785,50 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// The Sparc target isn't yet aware of offsets.
return false;
}
+
+void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>& Results,
+ SelectionDAG &DAG) const {
+
+ SDLoc dl(N);
+
+ RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;
+
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to custom type legalize this operation!");
+
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+    // Custom lower only if the operand is f128 and the result is i64.
+ if (N->getOperand(0).getValueType() != MVT::f128
+ || N->getValueType(0) != MVT::i64)
+ return;
+ libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
+ ? RTLIB::FPTOSINT_F128_I64
+ : RTLIB::FPTOUINT_F128_I64);
+
+ Results.push_back(LowerF128Op(SDValue(N, 0),
+ DAG,
+ getLibcallName(libCall),
+ 1));
+ return;
+
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+    // Custom lower only if the result is f128 and the operand is i64.
+ if (N->getValueType(0) != MVT::f128
+ || N->getOperand(0).getValueType() != MVT::i64)
+ return;
+
+ libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
+ ? RTLIB::SINTTOFP_I64_F128
+ : RTLIB::UINTTOFP_I64_F128);
+
+ Results.push_back(LowerF128Op(SDValue(N, 0),
+ DAG,
+ getLibcallName(libCall),
+ 1));
+ return;
+ }
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index 261c25a..2659fc8 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -37,11 +37,17 @@ namespace llvm {
FTOI, // FP to Int within a FP register.
ITOF, // Int to FP within a FP register.
+ FTOX, // FP to Int64 within a FP register.
+ XTOF, // Int64 to FP within a FP register.
CALL, // A call instruction.
RET_FLAG, // Return with a flag operand.
- GLOBAL_BASE_REG, // Global base reg for PIC
- FLUSHW // FLUSH register windows to stack
+ GLOBAL_BASE_REG, // Global base reg for PIC.
+ FLUSHW, // FLUSH register windows to stack.
+
+ TLS_ADD, // For Thread Local Storage (TLS).
+ TLS_LD,
+ TLS_CALL
};
}
@@ -73,6 +79,9 @@ namespace llvm {
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+ /// getSetCCResultType - Return the ISD::SETCC ValueType
+ virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+
virtual SDValue
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv,
@@ -119,6 +128,7 @@ namespace llvm {
SDLoc DL, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -127,6 +137,27 @@ namespace llvm {
SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
SelectionDAG &DAG) const;
SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+ SDValue Arg, SDLoc DL,
+ SelectionDAG &DAG) const;
+ SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
+ const char *LibFuncName,
+ unsigned numArgs) const;
+ SDValue LowerF128Compare(SDValue LHS, SDValue RHS,
+ unsigned &SPCC,
+ SDLoc DL,
+ SelectionDAG &DAG) const;
+
+ bool ShouldShrinkFPConstant(EVT VT) const {
+      // Do not shrink an FP constant pool entry when VT == MVT::f128:
+      // (ldd, call _Q_fdtoq) is more expensive than two ldds.
+ return VT != MVT::f128;
+ }
+
+ virtual void ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue>& Results,
+ SelectionDAG &DAG) const;
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 47658ee..8656de5 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -153,15 +153,11 @@ def : Pat<(xor i64:$a, (not i64:$b)), (XNORrr $a, $b)>;
def : Pat<(add i64:$a, i64:$b), (ADDrr $a, $b)>;
def : Pat<(sub i64:$a, i64:$b), (SUBrr $a, $b)>;
-// Add/sub with carry were renamed to addc/subc in SPARC v9.
-def : Pat<(adde i64:$a, i64:$b), (ADDXrr $a, $b)>;
-def : Pat<(sube i64:$a, i64:$b), (SUBXrr $a, $b)>;
-
-def : Pat<(addc i64:$a, i64:$b), (ADDCCrr $a, $b)>;
-def : Pat<(subc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
-
def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
+def : Pat<(tlsadd i64:$a, i64:$b, tglobaltlsaddr:$sym),
+ (TLS_ADDrr $a, $b, $sym)>;
+
// Register-immediate instructions.
def : Pat<(and i64:$a, (i64 simm13:$b)), (ANDri $a, (as_i32imm $b))>;
@@ -173,6 +169,14 @@ def : Pat<(sub i64:$a, (i64 simm13:$b)), (SUBri $a, (as_i32imm $b))>;
def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+
+// "LEA" form of add
+def LEAX_ADDri : F3_2<2, 0b000000,
+ (outs I64Regs:$dst), (ins MEMri:$addr),
+ "add ${addr:arith}, $dst",
+ [(set iPTR:$dst, ADDRri:$addr)]>;
+
} // Predicates = [Is64Bit]
@@ -237,6 +241,12 @@ def LDXri : F3_2<3, 0b001011,
(outs I64Regs:$dst), (ins MEMri:$addr),
"ldx [$addr], $dst",
[(set i64:$dst, (load ADDRri:$addr))]>;
+let mayLoad = 1 in
+ def TLS_LDXrr : F3_1<3, 0b001011,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+ "ldx [$addr], $dst, $sym",
+ [(set i64:$dst,
+ (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
// Extending loads to i64.
def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
@@ -312,9 +322,9 @@ def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
let Predicates = [Is64Bit] in {
let Uses = [ICC] in
-def BPXCC : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
- "b$cc %xcc, $dst",
- [(SPbrxcc bb:$dst, imm:$cc)]>;
+def BPXCC : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "b$cond %xcc, $imm22",
+ [(SPbrxcc bb:$imm22, imm:$cond)]>;
// Conditional moves on %xcc.
let Uses = [ICC], Constraints = "$f = $rd" in {
@@ -340,6 +350,42 @@ def FMOVD_XCC : Pseudo<(outs DFPRegs:$rd),
(SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
} // Uses, Constraints
+//===----------------------------------------------------------------------===//
+// 64-bit Floating Point Conversions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def FXTOS : F3_3u<2, 0b110100, 0b010000100,
+ (outs FPRegs:$dst), (ins DFPRegs:$src),
+ "fxtos $src, $dst",
+ [(set FPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+def FXTOD : F3_3u<2, 0b110100, 0b010001000,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "fxtod $src, $dst",
+ [(set DFPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+def FXTOQ : F3_3u<2, 0b110100, 0b010001100,
+ (outs QFPRegs:$dst), (ins DFPRegs:$src),
+ "fxtoq $src, $dst",
+ [(set QFPRegs:$dst, (SPxtof DFPRegs:$src))]>,
+ Requires<[HasHardQuad]>;
+
+def FSTOX : F3_3u<2, 0b110100, 0b010000001,
+ (outs DFPRegs:$dst), (ins FPRegs:$src),
+ "fstox $src, $dst",
+ [(set DFPRegs:$dst, (SPftox FPRegs:$src))]>;
+def FDTOX : F3_3u<2, 0b110100, 0b010000010,
+ (outs DFPRegs:$dst), (ins DFPRegs:$src),
+ "fdtox $src, $dst",
+ [(set DFPRegs:$dst, (SPftox DFPRegs:$src))]>;
+def FQTOX : F3_3u<2, 0b110100, 0b010000011,
+ (outs DFPRegs:$dst), (ins QFPRegs:$src),
+ "fqtox $src, $dst",
+ [(set DFPRegs:$dst, (SPftox QFPRegs:$src))]>,
+ Requires<[HasHardQuad]>;
+
+} // Predicates = [Is64Bit]
+
def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
(MOVXCCrr $t, $f, imm:$cond)>;
def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index 6cdf6bc..afa2874 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -47,12 +47,11 @@ class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{29-25} = rd;
}
-class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr,
+class F2_2<bits<3> op2Val, dag outs, dag ins, string asmstr,
list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
bits<4> cond;
bit annul = 0; // currently unused
- let cond = condVal;
let op2 = op2Val;
let Inst{29} = annul;
@@ -112,6 +111,32 @@ class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
let Inst{4-0} = rs2;
}
+// floating-point unary operations.
+class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+ string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+ let rs1 = 0;
+
+ let Inst{13-5} = opfval; // fp opcode
+ let Inst{4-0} = rs2;
+}
+
+// floating-point compares.
+class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+ string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+ bits<5> rs2;
+
+ let op = opVal;
+ let op3 = op3val;
+ let rd = 0;
+
+ let Inst{13-5} = opfval; // fp opcode
+ let Inst{4-0} = rs2;
+}
+
// Shift by register rs2.
class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
@@ -150,3 +175,59 @@ multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
!strconcat(OpcStr, " $rs, $shcnt, $rd"),
[(set VT:$rd, (OpNode VT:$rs, (i32 imm:$shcnt)))]>;
}
+
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSP<outs, ins, asmstr, pattern> {
+ bits<5> rd;
+
+ let op = 2;
+ let Inst{29-25} = rd;
+ let Inst{24-19} = op3;
+}
+
+
+class F4_1<bits<6> op3, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : F4<op3, outs, ins, asmstr, pattern> {
+
+ bits<3> cc;
+ bits<4> cond;
+ bits<5> rs2;
+
+ let Inst{4-0} = rs2;
+ let Inst{11} = cc{0};
+ let Inst{12} = cc{1};
+ let Inst{13} = 0;
+ let Inst{17-14} = cond;
+ let Inst{18} = cc{2};
+
+}
+
+class F4_2<bits<6> op3, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : F4<op3, outs, ins, asmstr, pattern> {
+ bits<3> cc;
+ bits<4> cond;
+ bits<11> simm11;
+
+ let Inst{10-0} = simm11;
+ let Inst{11} = cc{0};
+ let Inst{12} = cc{1};
+ let Inst{13} = 1;
+ let Inst{17-14} = cond;
+ let Inst{18} = cc{2};
+}
+
+class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
+ string asmstr, list<dag> pattern>
+ : F4<op3, outs, ins, asmstr, pattern> {
+ bits<4> cond;
+ bits<3> opf_cc;
+ bits<5> rs2;
+
+ let Inst{18} = 0;
+ let Inst{17-14} = cond;
+ let Inst{13-11} = opf_cc;
+ let Inst{10-5} = opf_low;
+ let Inst{4-0} = rs2;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 6c14bc9..c10b5b3 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -24,11 +24,15 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "SparcGenInstrInfo.inc"
using namespace llvm;
+
+// Pin the vtable to this file.
+void SparcInstrInfo::anchor() {}
+
SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
: SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
RI(ST), Subtarget(ST) {
@@ -44,7 +48,8 @@ unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
if (MI->getOpcode() == SP::LDri ||
MI->getOpcode() == SP::LDXri ||
MI->getOpcode() == SP::LDFri ||
- MI->getOpcode() == SP::LDDFri) {
+ MI->getOpcode() == SP::LDDFri ||
+ MI->getOpcode() == SP::LDQFri) {
if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
MI->getOperand(2).getImm() == 0) {
FrameIndex = MI->getOperand(1).getIndex();
@@ -64,7 +69,8 @@ unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
if (MI->getOpcode() == SP::STri ||
MI->getOpcode() == SP::STXri ||
MI->getOpcode() == SP::STFri ||
- MI->getOpcode() == SP::STDFri) {
+ MI->getOpcode() == SP::STDFri ||
+ MI->getOpcode() == SP::STQFri) {
if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
MI->getOperand(1).getImm() == 0) {
FrameIndex = MI->getOperand(0).getIndex();
@@ -100,14 +106,14 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
case SPCC::FCC_U: return SPCC::FCC_O;
case SPCC::FCC_O: return SPCC::FCC_U;
- case SPCC::FCC_G: return SPCC::FCC_LE;
- case SPCC::FCC_LE: return SPCC::FCC_G;
- case SPCC::FCC_UG: return SPCC::FCC_ULE;
- case SPCC::FCC_ULE: return SPCC::FCC_UG;
- case SPCC::FCC_L: return SPCC::FCC_GE;
- case SPCC::FCC_GE: return SPCC::FCC_L;
- case SPCC::FCC_UL: return SPCC::FCC_UGE;
- case SPCC::FCC_UGE: return SPCC::FCC_UL;
+  case SPCC::FCC_G:     return SPCC::FCC_ULE;
+ case SPCC::FCC_LE: return SPCC::FCC_UG;
+ case SPCC::FCC_UG: return SPCC::FCC_LE;
+ case SPCC::FCC_ULE: return SPCC::FCC_G;
+ case SPCC::FCC_L: return SPCC::FCC_UGE;
+ case SPCC::FCC_GE: return SPCC::FCC_UL;
+ case SPCC::FCC_UL: return SPCC::FCC_GE;
+ case SPCC::FCC_UGE: return SPCC::FCC_L;
case SPCC::FCC_LG: return SPCC::FCC_UE;
case SPCC::FCC_UE: return SPCC::FCC_LG;
case SPCC::FCC_NE: return SPCC::FCC_E;
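The corrected table above pairs each ordered condition with its unordered counterpart (G with ULE, L with UGE, and so on), because negating an ordered comparison has to accept the unordered (NaN) case. A quick host-side check of that reasoning:

    #include <cassert>
    #include <cmath>

    int main() {
      double nan = std::nan("");
      assert(!(nan > 1.0));   // ordered greater-than is false when unordered
      assert(!(nan <= 1.0));  // ...but so is ordered less-or-equal, so the
                              // negation of FCC_G must be FCC_ULE, which also
                              // accepts the unordered case.
      return 0;
    }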
@@ -273,6 +279,16 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
+ unsigned numSubRegs = 0;
+ unsigned movOpc = 0;
+ const unsigned *subRegIdx = 0;
+
+ const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd };
+ const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
+ const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd,
+ SP::sub_odd64_then_sub_even,
+ SP::sub_odd64_then_sub_odd };
+
if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
.addReg(SrcReg, getKillRegState(KillSrc));
@@ -285,23 +301,47 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
// Use two FMOVS instructions.
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- MachineInstr *MovMI = 0;
- unsigned subRegIdx[] = {SP::sub_even, SP::sub_odd};
- for (unsigned i = 0; i != 2; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
- unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
- assert(Dst && Src && "Bad sub-register");
-
- MovMI = BuildMI(MBB, I, DL, get(SP::FMOVS), Dst).addReg(Src);
+ subRegIdx = DFP_FP_SubRegsIdx;
+ numSubRegs = 2;
+ movOpc = SP::FMOVS;
+ }
+ } else if (SP::QFPRegsRegClass.contains(DestReg, SrcReg)) {
+ if (Subtarget.isV9()) {
+ if (Subtarget.hasHardQuad()) {
+ BuildMI(MBB, I, DL, get(SP::FMOVQ), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ } else {
+ // Use two FMOVD instructions.
+ subRegIdx = QFP_DFP_SubRegsIdx;
+ numSubRegs = 2;
+ movOpc = SP::FMOVD;
}
- // Add implicit super-register defs and kills to the last MovMI.
- MovMI->addRegisterDefined(DestReg, TRI);
- if (KillSrc)
- MovMI->addRegisterKilled(SrcReg, TRI);
+ } else {
+ // Use four FMOVS instructions.
+ subRegIdx = QFP_FP_SubRegsIdx;
+ numSubRegs = 4;
+ movOpc = SP::FMOVS;
}
} else
llvm_unreachable("Impossible reg-to-reg copy");
+
+ if (numSubRegs == 0 || subRegIdx == 0 || movOpc == 0)
+ return;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineInstr *MovMI = 0;
+
+ for (unsigned i = 0; i != numSubRegs; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+ unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]);
+ assert(Dst && Src && "Bad sub-register");
+
+ MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src);
+ }
+ // Add implicit super-register defs and kills to the last MovMI.
+ MovMI->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ MovMI->addRegisterKilled(SrcReg, TRI);
}
void SparcInstrInfo::
@@ -321,7 +361,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectAlignment(FI));
// On the order of operands here: think "[FrameIdx + 0] = SrcReg".
- if (RC == &SP::I64RegsRegClass)
+ if (RC == &SP::I64RegsRegClass)
BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else if (RC == &SP::IntRegsRegClass)
@@ -330,9 +370,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
- else if (RC == &SP::DFPRegsRegClass)
+ else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
.addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+ else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+ // Use STQFri irrespective of its legality. If STQ is not legal, it will be
+ // lowered into two STDs in eliminateFrameIndex.
+ BuildMI(MBB, I, DL, get(SP::STQFri)).addFrameIndex(FI).addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
else
llvm_unreachable("Can't store this register to stack slot");
}
@@ -362,9 +407,14 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
else if (RC == &SP::FPRegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);
- else if (RC == &SP::DFPRegsRegClass)
+ else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0)
.addMemOperand(MMO);
+ else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+ // Use LDQFri irrespective of its legality. If LDQ is not legal, it will be
+ // lowered into two LDDs in eliminateFrameIndex.
+ BuildMI(MBB, I, DL, get(SP::LDQFri), DestReg).addFrameIndex(FI).addImm(0)
+ .addMemOperand(MMO);
else
llvm_unreachable("Can't load this register from stack slot");
}
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index d0b220b..a86cbcb 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -37,6 +37,7 @@ namespace SPII {
class SparcInstrInfo : public SparcGenInstrInfo {
const SparcRegisterInfo RI;
const SparcSubtarget& Subtarget;
+ virtual void anchor();
public:
explicit SparcInstrInfo(SparcSubtarget &ST);
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index d4cac4d..ef7a114 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -39,6 +39,10 @@ def HasNoV9 : Predicate<"!Subtarget.isV9()">;
// HasVIS - This is true when the target processor has VIS extensions.
def HasVIS : Predicate<"Subtarget.isVIS()">;
+// HasHardQuad - This is true when the target processor supports quad floating
+// point instructions.
+def HasHardQuad : Predicate<"Subtarget.hasHardQuad()">;
+
// UseDeprecatedInsts - This predicate is true when the target processor is a
// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
// to use when appropriate. In either of these cases, the instruction selector
@@ -81,6 +85,8 @@ def MEMri : Operand<iPTR> {
let MIOperandInfo = (ops ptr_rc, i32imm);
}
+def TLSSym : Operand<iPTR>;
+
// Branch targets have OtherVT type.
def brtarget : Operand<OtherVT>;
def calltarget : Operand<i32>;
@@ -101,6 +107,15 @@ def SDTSPFTOI :
SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
def SDTSPITOF :
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDTSPFTOX :
+SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisFP<1>]>;
+def SDTSPXTOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f64>]>;
+
+def SDTSPtlsadd :
+SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDTSPtlsld :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
@@ -113,6 +128,8 @@ def SPlo : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
def SPftoi : SDNode<"SPISD::FTOI", SDTSPFTOI>;
def SPitof : SDNode<"SPISD::ITOF", SDTSPITOF>;
+def SPftox : SDNode<"SPISD::FTOX", SDTSPFTOX>;
+def SPxtof : SDNode<"SPISD::XTOF", SDTSPXTOF>;
def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
@@ -140,6 +157,12 @@ def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
def flushw : SDNode<"SPISD::FLUSHW", SDTNone,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+def tlsadd : SDNode<"SPISD::TLS_ADD", SDTSPtlsadd>;
+def tlsld : SDNode<"SPISD::TLS_LD", SDTSPtlsld>;
+def tlscall : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
def getPCX : Operand<i32> {
let PrintMethod = "printGetPCX";
}
@@ -242,8 +265,9 @@ let hasSideEffects = 1, mayStore = 1 in {
[(flushw)]>;
}
-def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
- "unimp $val", []>;
+let rd = 0 in
+ def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
+ "unimp $val", []>;
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence. This has to handle all
@@ -263,6 +287,11 @@ let Uses = [ICC], usesCustomInserter = 1 in {
: Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
"; SELECT_CC_DFP_ICC PSEUDO!",
[(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+
+ def SELECT_CC_QFP_ICC
+ : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_QFP_ICC PSEUDO!",
+ [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
}
let usesCustomInserter = 1, Uses = [FCC] in {
@@ -280,17 +309,21 @@ let usesCustomInserter = 1, Uses = [FCC] in {
: Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
"; SELECT_CC_DFP_FCC PSEUDO!",
[(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+ def SELECT_CC_QFP_FCC
+ : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+ "; SELECT_CC_QFP_FCC PSEUDO!",
+ [(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
}
// Section A.3 - Synthetic Instructions, p. 85
// special cases of JMPL:
let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
- let rd = O7.Num, rs1 = G0.Num in
+ let rd = 0, rs1 = 15 in
def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
"jmp %o7+$val", [(retflag simm13:$val)]>;
- let rd = I7.Num, rs1 = G0.Num in
+ let rd = 0, rs1 = 31 in
def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
"jmp %i7+$val", []>;
}
@@ -354,56 +387,76 @@ def LDDFri : F3_2<3, 0b100011,
(outs DFPRegs:$dst), (ins MEMri:$addr),
"ldd [$addr], $dst",
[(set f64:$dst, (load ADDRri:$addr))]>;
+def LDQFrr : F3_1<3, 0b100010,
+ (outs QFPRegs:$dst), (ins MEMrr:$addr),
+ "ldq [$addr], $dst",
+ [(set f128:$dst, (load ADDRrr:$addr))]>,
+ Requires<[HasV9, HasHardQuad]>;
+def LDQFri : F3_2<3, 0b100010,
+ (outs QFPRegs:$dst), (ins MEMri:$addr),
+ "ldq [$addr], $dst",
+ [(set f128:$dst, (load ADDRri:$addr))]>,
+ Requires<[HasV9, HasHardQuad]>;
// Section B.4 - Store Integer Instructions, p. 95
def STBrr : F3_1<3, 0b000101,
- (outs), (ins MEMrr:$addr, IntRegs:$src),
- "stb $src, [$addr]",
- [(truncstorei8 i32:$src, ADDRrr:$addr)]>;
+ (outs), (ins MEMrr:$addr, IntRegs:$rd),
+ "stb $rd, [$addr]",
+ [(truncstorei8 i32:$rd, ADDRrr:$addr)]>;
def STBri : F3_2<3, 0b000101,
- (outs), (ins MEMri:$addr, IntRegs:$src),
- "stb $src, [$addr]",
- [(truncstorei8 i32:$src, ADDRri:$addr)]>;
+ (outs), (ins MEMri:$addr, IntRegs:$rd),
+ "stb $rd, [$addr]",
+ [(truncstorei8 i32:$rd, ADDRri:$addr)]>;
def STHrr : F3_1<3, 0b000110,
- (outs), (ins MEMrr:$addr, IntRegs:$src),
- "sth $src, [$addr]",
- [(truncstorei16 i32:$src, ADDRrr:$addr)]>;
+ (outs), (ins MEMrr:$addr, IntRegs:$rd),
+ "sth $rd, [$addr]",
+ [(truncstorei16 i32:$rd, ADDRrr:$addr)]>;
def STHri : F3_2<3, 0b000110,
- (outs), (ins MEMri:$addr, IntRegs:$src),
- "sth $src, [$addr]",
- [(truncstorei16 i32:$src, ADDRri:$addr)]>;
+ (outs), (ins MEMri:$addr, IntRegs:$rd),
+ "sth $rd, [$addr]",
+ [(truncstorei16 i32:$rd, ADDRri:$addr)]>;
def STrr : F3_1<3, 0b000100,
- (outs), (ins MEMrr:$addr, IntRegs:$src),
- "st $src, [$addr]",
- [(store i32:$src, ADDRrr:$addr)]>;
+ (outs), (ins MEMrr:$addr, IntRegs:$rd),
+ "st $rd, [$addr]",
+ [(store i32:$rd, ADDRrr:$addr)]>;
def STri : F3_2<3, 0b000100,
- (outs), (ins MEMri:$addr, IntRegs:$src),
- "st $src, [$addr]",
- [(store i32:$src, ADDRri:$addr)]>;
+ (outs), (ins MEMri:$addr, IntRegs:$rd),
+ "st $rd, [$addr]",
+ [(store i32:$rd, ADDRri:$addr)]>;
// Section B.5 - Store Floating-point Instructions, p. 97
def STFrr : F3_1<3, 0b100100,
- (outs), (ins MEMrr:$addr, FPRegs:$src),
- "st $src, [$addr]",
- [(store f32:$src, ADDRrr:$addr)]>;
+ (outs), (ins MEMrr:$addr, FPRegs:$rd),
+ "st $rd, [$addr]",
+ [(store f32:$rd, ADDRrr:$addr)]>;
def STFri : F3_2<3, 0b100100,
- (outs), (ins MEMri:$addr, FPRegs:$src),
- "st $src, [$addr]",
- [(store f32:$src, ADDRri:$addr)]>;
+ (outs), (ins MEMri:$addr, FPRegs:$rd),
+ "st $rd, [$addr]",
+ [(store f32:$rd, ADDRri:$addr)]>;
def STDFrr : F3_1<3, 0b100111,
- (outs), (ins MEMrr:$addr, DFPRegs:$src),
- "std $src, [$addr]",
- [(store f64:$src, ADDRrr:$addr)]>;
+ (outs), (ins MEMrr:$addr, DFPRegs:$rd),
+ "std $rd, [$addr]",
+ [(store f64:$rd, ADDRrr:$addr)]>;
def STDFri : F3_2<3, 0b100111,
- (outs), (ins MEMri:$addr, DFPRegs:$src),
- "std $src, [$addr]",
- [(store f64:$src, ADDRri:$addr)]>;
+ (outs), (ins MEMri:$addr, DFPRegs:$rd),
+ "std $rd, [$addr]",
+ [(store f64:$rd, ADDRri:$addr)]>;
+def STQFrr : F3_1<3, 0b100110,
+ (outs), (ins MEMrr:$addr, QFPRegs:$rd),
+ "stq $rd, [$addr]",
+ [(store f128:$rd, ADDRrr:$addr)]>,
+ Requires<[HasV9, HasHardQuad]>;
+def STQFri : F3_2<3, 0b100110,
+ (outs), (ins MEMri:$addr, QFPRegs:$rd),
+ "stq $rd, [$addr]",
+ [(store f128:$rd, ADDRri:$addr)]>,
+ Requires<[HasV9, HasHardQuad]>;
// Section B.9 - SETHI Instruction, p. 104
def SETHIi: F2_1<0b100,
- (outs IntRegs:$dst), (ins i32imm:$src),
- "sethi $src, $dst",
- [(set i32:$dst, SETHIimm:$src)]>;
+ (outs IntRegs:$rd), (ins i32imm:$imm22),
+ "sethi $imm22, $rd",
+ [(set i32:$rd, SETHIimm:$imm22)]>;
// Section B.10 - NOP Instruction, p. 105
// (It's a special case of SETHI)
@@ -449,30 +502,32 @@ defm SRA : F3_12<"sra", 0b100111, sra>;
defm ADD : F3_12<"add", 0b000000, add>;
// "LEA" forms of add (patterns to make tblgen happy)
-def LEA_ADDri : F3_2<2, 0b000000,
- (outs IntRegs:$dst), (ins MEMri:$addr),
- "add ${addr:arith}, $dst",
- [(set iPTR:$dst, ADDRri:$addr)]>;
+let Predicates = [Is32Bit] in
+ def LEA_ADDri : F3_2<2, 0b000000,
+ (outs IntRegs:$dst), (ins MEMri:$addr),
+ "add ${addr:arith}, $dst",
+ [(set iPTR:$dst, ADDRri:$addr)]>;
let Defs = [ICC] in
defm ADDCC : F3_12<"addcc", 0b010000, addc>;
-let Uses = [ICC] in
- defm ADDX : F3_12<"addx", 0b001000, adde>;
+let Uses = [ICC], Defs = [ICC] in
+ defm ADDX : F3_12<"addxcc", 0b011000, adde>;
// Section B.15 - Subtract Instructions, p. 110
defm SUB : F3_12 <"sub" , 0b000100, sub>;
-let Uses = [ICC] in
- defm SUBX : F3_12 <"subx" , 0b001100, sube>;
+let Uses = [ICC], Defs = [ICC] in
+ defm SUBX : F3_12 <"subxcc" , 0b011100, sube>;
-let Defs = [ICC] in {
+let Defs = [ICC] in
defm SUBCC : F3_12 <"subcc", 0b010100, subc>;
+let Defs = [ICC], rd = 0 in {
def CMPrr : F3_1<2, 0b010100,
(outs), (ins IntRegs:$b, IntRegs:$c),
"cmp $b, $c",
[(SPcmpicc i32:$b, i32:$c)]>;
- def CMPri : F3_1<2, 0b010100,
+ def CMPri : F3_2<2, 0b010100,
(outs), (ins IntRegs:$b, i32imm:$c),
"cmp $b, $c",
[(SPcmpicc i32:$b, (i32 simm13:$c))]>;
@@ -502,23 +557,30 @@ defm RESTORE : F3_12np<"restore", 0b111101>;
// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+// unconditional branch class.
+class BranchAlways<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, (outs), ins, asmstr, pattern> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+ let isBarrier = 1;
+}
+
+let cond = 8 in
+ def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
// conditional branch class:
-class BranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
- : F2_2<cc, 0b010, (outs), ins, asmstr, pattern> {
+class BranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, (outs), ins, asmstr, pattern> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
}
-let isBarrier = 1 in
- def BA : BranchSP<0b1000, (ins brtarget:$dst),
- "ba $dst",
- [(br bb:$dst)]>;
-
// Indirect branch instructions.
let isTerminator = 1, isBarrier = 1,
hasDelaySlot = 1, isBranch =1,
- isIndirectBranch = 1 in {
+ isIndirectBranch = 1, rd = 0 in {
def BINDrr : F3_1<2, 0b111000,
(outs), (ins MEMrr:$ptr),
"jmp $ptr",
@@ -529,37 +591,31 @@ let isTerminator = 1, isBarrier = 1,
[(brind ADDRri:$ptr)]>;
}
-// FIXME: the encoding for the JIT should look at the condition field.
let Uses = [ICC] in
- def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
- "b$cc $dst",
- [(SPbricc bb:$dst, imm:$cc)]>;
-
+ def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "b$cond $imm22",
+ [(SPbricc bb:$imm22, imm:$cond)]>;
// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
// floating-point conditional branch class:
-class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
- : F2_2<cc, 0b110, (outs), ins, asmstr, pattern> {
+class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, (outs), ins, asmstr, pattern> {
let isBranch = 1;
let isTerminator = 1;
let hasDelaySlot = 1;
}
-// FIXME: the encoding for the JIT should look at the condition field.
let Uses = [FCC] in
- def FBCOND : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
- "fb$cc $dst",
- [(SPbrfcc bb:$dst, imm:$cc)]>;
+ def FBCOND : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+ "fb$cond $imm22",
+ [(SPbrfcc bb:$imm22, imm:$cond)]>;
// Section B.24 - Call and Link Instruction, p. 125
// This is the only Format 1 instruction
let Uses = [O6],
- hasDelaySlot = 1, isCall = 1,
- Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
- D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
- ICC, FCC, Y] in {
+ hasDelaySlot = 1, isCall = 1 in {
def CALL : InstSP<(outs), (ins calltarget:$dst, variable_ops),
"call $dst", []> {
bits<30> disp;
@@ -571,21 +627,21 @@ let Uses = [O6],
def JMPLrr : F3_1<2, 0b111000,
(outs), (ins MEMrr:$ptr, variable_ops),
"call $ptr",
- [(call ADDRrr:$ptr)]>;
+ [(call ADDRrr:$ptr)]> { let rd = 15; }
def JMPLri : F3_2<2, 0b111000,
(outs), (ins MEMri:$ptr, variable_ops),
"call $ptr",
- [(call ADDRri:$ptr)]>;
+ [(call ADDRri:$ptr)]> { let rd = 15; }
}
// Section B.28 - Read State Register Instructions
-let Uses = [Y] in
+let Uses = [Y], rs1 = 0, rs2 = 0 in
def RDY : F3_1<2, 0b101000,
(outs IntRegs:$dst), (ins),
"rd %y, $dst", []>;
// Section B.29 - Write State Register Instructions
-let Defs = [Y] in {
+let Defs = [Y], rd = 0 in {
def WRYrr : F3_1<2, 0b110000,
(outs), (ins IntRegs:$b, IntRegs:$c),
"wr $b, $c, %y", []>;
@@ -594,58 +650,93 @@ let Defs = [Y] in {
"wr $b, $c, %y", []>;
}
// Convert Integer to Floating-point Instructions, p. 141
-def FITOS : F3_3<2, 0b110100, 0b011000100,
+def FITOS : F3_3u<2, 0b110100, 0b011000100,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fitos $src, $dst",
[(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
-def FITOD : F3_3<2, 0b110100, 0b011001000,
+def FITOD : F3_3u<2, 0b110100, 0b011001000,
(outs DFPRegs:$dst), (ins FPRegs:$src),
"fitod $src, $dst",
[(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOQ : F3_3u<2, 0b110100, 0b011001100,
+ (outs QFPRegs:$dst), (ins FPRegs:$src),
+ "fitoq $src, $dst",
+ [(set QFPRegs:$dst, (SPitof FPRegs:$src))]>,
+ Requires<[HasHardQuad]>;
// Convert Floating-point to Integer Instructions, p. 142
-def FSTOI : F3_3<2, 0b110100, 0b011010001,
+def FSTOI : F3_3u<2, 0b110100, 0b011010001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fstoi $src, $dst",
[(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
-def FDTOI : F3_3<2, 0b110100, 0b011010010,
+def FDTOI : F3_3u<2, 0b110100, 0b011010010,
(outs FPRegs:$dst), (ins DFPRegs:$src),
"fdtoi $src, $dst",
[(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+def FQTOI : F3_3u<2, 0b110100, 0b011010011,
+ (outs FPRegs:$dst), (ins QFPRegs:$src),
+ "fqtoi $src, $dst",
+ [(set FPRegs:$dst, (SPftoi QFPRegs:$src))]>,
+ Requires<[HasHardQuad]>;
// Convert between Floating-point Formats Instructions, p. 143
-def FSTOD : F3_3<2, 0b110100, 0b011001001,
+def FSTOD : F3_3u<2, 0b110100, 0b011001001,
(outs DFPRegs:$dst), (ins FPRegs:$src),
"fstod $src, $dst",
[(set f64:$dst, (fextend f32:$src))]>;
-def FDTOS : F3_3<2, 0b110100, 0b011000110,
+def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+ (outs QFPRegs:$dst), (ins FPRegs:$src),
+ "fstoq $src, $dst",
+ [(set f128:$dst, (fextend f32:$src))]>,
+ Requires<[HasHardQuad]>;
+def FDTOS : F3_3u<2, 0b110100, 0b011000110,
(outs FPRegs:$dst), (ins DFPRegs:$src),
"fdtos $src, $dst",
[(set f32:$dst, (fround f64:$src))]>;
+def FDTOQ : F3_3u<2, 0b110100, 0b011001110,
+ (outs QFPRegs:$dst), (ins DFPRegs:$src),
+ "fdtoq $src, $dst",
+ [(set f128:$dst, (fextend f64:$src))]>,
+ Requires<[HasHardQuad]>;
+def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+ (outs FPRegs:$dst), (ins QFPRegs:$src),
+ "fqtos $src, $dst",
+ [(set f32:$dst, (fround f128:$src))]>,
+ Requires<[HasHardQuad]>;
+def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+ (outs DFPRegs:$dst), (ins QFPRegs:$src),
+ "fqtod $src, $dst",
+ [(set f64:$dst, (fround f128:$src))]>,
+ Requires<[HasHardQuad]>;
// Floating-point Move Instructions, p. 144
-def FMOVS : F3_3<2, 0b110100, 0b000000001,
+def FMOVS : F3_3u<2, 0b110100, 0b000000001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fmovs $src, $dst", []>;
-def FNEGS : F3_3<2, 0b110100, 0b000000101,
+def FNEGS : F3_3u<2, 0b110100, 0b000000101,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fnegs $src, $dst",
[(set f32:$dst, (fneg f32:$src))]>;
-def FABSS : F3_3<2, 0b110100, 0b000001001,
+def FABSS : F3_3u<2, 0b110100, 0b000001001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fabss $src, $dst",
[(set f32:$dst, (fabs f32:$src))]>;
// Floating-point Square Root Instructions, p.145
-def FSQRTS : F3_3<2, 0b110100, 0b000101001,
+def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
(outs FPRegs:$dst), (ins FPRegs:$src),
"fsqrts $src, $dst",
[(set f32:$dst, (fsqrt f32:$src))]>;
-def FSQRTD : F3_3<2, 0b110100, 0b000101010,
+def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fsqrtd $src, $dst",
[(set f64:$dst, (fsqrt f64:$src))]>;
+def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src),
+ "fsqrtq $src, $dst",
+ [(set f128:$dst, (fsqrt f128:$src))]>,
+ Requires<[HasHardQuad]>;
@@ -658,6 +749,12 @@ def FADDD : F3_3<2, 0b110100, 0b001000010,
(outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
"faddd $src1, $src2, $dst",
[(set f64:$dst, (fadd f64:$src1, f64:$src2))]>;
+def FADDQ : F3_3<2, 0b110100, 0b001000011,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+ "faddq $src1, $src2, $dst",
+ [(set f128:$dst, (fadd f128:$src1, f128:$src2))]>,
+ Requires<[HasHardQuad]>;
+
def FSUBS : F3_3<2, 0b110100, 0b001000101,
(outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
"fsubs $src1, $src2, $dst",
@@ -666,6 +763,12 @@ def FSUBD : F3_3<2, 0b110100, 0b001000110,
(outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
"fsubd $src1, $src2, $dst",
[(set f64:$dst, (fsub f64:$src1, f64:$src2))]>;
+def FSUBQ : F3_3<2, 0b110100, 0b001000111,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+ "fsubq $src1, $src2, $dst",
+ [(set f128:$dst, (fsub f128:$src1, f128:$src2))]>,
+ Requires<[HasHardQuad]>;
+
// Floating-point Multiply and Divide Instructions, p. 147
def FMULS : F3_3<2, 0b110100, 0b001001001,
@@ -676,11 +779,24 @@ def FMULD : F3_3<2, 0b110100, 0b001001010,
(outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
"fmuld $src1, $src2, $dst",
[(set f64:$dst, (fmul f64:$src1, f64:$src2))]>;
+def FMULQ : F3_3<2, 0b110100, 0b001001011,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+ "fmulq $src1, $src2, $dst",
+ [(set f128:$dst, (fmul f128:$src1, f128:$src2))]>,
+ Requires<[HasHardQuad]>;
+
def FSMULD : F3_3<2, 0b110100, 0b001101001,
(outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
"fsmuld $src1, $src2, $dst",
[(set f64:$dst, (fmul (fextend f32:$src1),
(fextend f32:$src2)))]>;
+def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+ (outs QFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+ "fdmulq $src1, $src2, $dst",
+ [(set f128:$dst, (fmul (fextend f64:$src1),
+ (fextend f64:$src2)))]>,
+ Requires<[HasHardQuad]>;
+
def FDIVS : F3_3<2, 0b110100, 0b001001101,
(outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
"fdivs $src1, $src2, $dst",
@@ -689,21 +805,61 @@ def FDIVD : F3_3<2, 0b110100, 0b001001110,
(outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
"fdivd $src1, $src2, $dst",
[(set f64:$dst, (fdiv f64:$src1, f64:$src2))]>;
+def FDIVQ : F3_3<2, 0b110100, 0b001001111,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+ "fdivq $src1, $src2, $dst",
+ [(set f128:$dst, (fdiv f128:$src1, f128:$src2))]>,
+ Requires<[HasHardQuad]>;
// Floating-point Compare Instructions, p. 148
// Note: the 2nd template arg is different for these guys.
// Note 2: the result of a FCMP is not available until the 2nd cycle
-// after the instr is retired, but there is no interlock. This behavior
-// is modelled with a forced noop after the instruction.
+// after the instr is retired, but there is no interlock in Sparc V8.
+// This behavior is modeled with a forced noop after the instruction in
+// DelaySlotFiller.
+
let Defs = [FCC] in {
- def FCMPS : F3_3<2, 0b110101, 0b001010001,
+ def FCMPS : F3_3c<2, 0b110101, 0b001010001,
(outs), (ins FPRegs:$src1, FPRegs:$src2),
- "fcmps $src1, $src2\n\tnop",
+ "fcmps $src1, $src2",
[(SPcmpfcc f32:$src1, f32:$src2)]>;
- def FCMPD : F3_3<2, 0b110101, 0b001010010,
+ def FCMPD : F3_3c<2, 0b110101, 0b001010010,
(outs), (ins DFPRegs:$src1, DFPRegs:$src2),
- "fcmpd $src1, $src2\n\tnop",
+ "fcmpd $src1, $src2",
[(SPcmpfcc f64:$src1, f64:$src2)]>;
+ def FCMPQ : F3_3c<2, 0b110101, 0b001010011,
+ (outs), (ins QFPRegs:$src1, QFPRegs:$src2),
+ "fcmpq $src1, $src2",
+ [(SPcmpfcc f128:$src1, f128:$src2)]>,
+ Requires<[HasHardQuad]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions for Thread Local Storage (TLS).
+//===----------------------------------------------------------------------===//
+
+def TLS_ADDrr : F3_1<2, 0b000000,
+ (outs IntRegs:$rd),
+ (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
+ "add $rs1, $rs2, $rd, $sym",
+ [(set i32:$rd,
+ (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>;
+
+let mayLoad = 1 in
+ def TLS_LDrr : F3_1<3, 0b000000,
+ (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+ "ld [$addr], $dst, $sym",
+ [(set i32:$dst,
+ (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
+ def TLS_CALL : InstSP<(outs),
+ (ins calltarget:$disp, TLSSym:$sym, variable_ops),
+ "call $disp, $sym",
+ [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)]> {
+ bits<30> disp;
+ let op = 1;
+ let Inst{29-0} = disp;
}
//===----------------------------------------------------------------------===//
@@ -713,73 +869,108 @@ let Defs = [FCC] in {
// V9 Conditional Moves.
let Predicates = [HasV9], Constraints = "$f = $rd" in {
// Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
- // FIXME: Add instruction encodings for the JIT some day.
- let Uses = [ICC] in {
+ let Uses = [ICC], cc = 0b100 in {
def MOVICCrr
- : Pseudo<(outs IntRegs:$rd), (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cc),
- "mov$cc %icc, $rs2, $rd",
- [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cc))]>;
+ : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond %icc, $rs2, $rd",
+ [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cond))]>;
+
def MOVICCri
- : Pseudo<(outs IntRegs:$rd), (ins i32imm:$i, IntRegs:$f, CCOp:$cc),
- "mov$cc %icc, $i, $rd",
- [(set i32:$rd, (SPselecticc simm11:$i, i32:$f, imm:$cc))]>;
+ : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond %icc, $simm11, $rd",
+ [(set i32:$rd,
+ (SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
}
- let Uses = [FCC] in {
+ let Uses = [FCC], cc = 0b000 in {
def MOVFCCrr
- : Pseudo<(outs IntRegs:$rd), (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cc),
- "mov$cc %fcc0, $rs2, $rd",
- [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cc))]>;
+ : F4_1<0b101100, (outs IntRegs:$rd),
+ (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+ "mov$cond %fcc0, $rs2, $rd",
+ [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cond))]>;
def MOVFCCri
- : Pseudo<(outs IntRegs:$rd), (ins i32imm:$i, IntRegs:$f, CCOp:$cc),
- "mov$cc %fcc0, $i, $rd",
- [(set i32:$rd, (SPselectfcc simm11:$i, i32:$f, imm:$cc))]>;
+ : F4_2<0b101100, (outs IntRegs:$rd),
+ (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+ "mov$cond %fcc0, $simm11, $rd",
+ [(set i32:$rd,
+ (SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
}
- let Uses = [ICC] in {
+ let Uses = [ICC], opf_cc = 0b100 in {
def FMOVS_ICC
- : Pseudo<(outs FPRegs:$rd), (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cc),
- "fmovs$cc %icc, $rs2, $rd",
- [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cc))]>;
+ : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond %icc, $rs2, $rd",
+ [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cond))]>;
def FMOVD_ICC
- : Pseudo<(outs DFPRegs:$rd), (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cc),
- "fmovd$cc %icc, $rs2, $rd",
- [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cc))]>;
+ : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond %icc, $rs2, $rd",
+ [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>;
+ def FMOVQ_ICC
+ : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+                   "fmovq$cond %icc, $rs2, $rd",
+ [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>;
}
- let Uses = [FCC] in {
+ let Uses = [FCC], opf_cc = 0b000 in {
def FMOVS_FCC
- : Pseudo<(outs FPRegs:$rd), (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cc),
- "fmovs$cc %fcc0, $rs2, $rd",
- [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cc))]>;
+ : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+ (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+ "fmovs$cond %fcc0, $rs2, $rd",
+ [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cond))]>;
def FMOVD_FCC
- : Pseudo<(outs DFPRegs:$rd), (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cc),
- "fmovd$cc %fcc0, $rs2, $rd",
- [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cc))]>;
+ : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+ (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+ "fmovd$cond %fcc0, $rs2, $rd",
+ [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>;
+ def FMOVQ_FCC
+ : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+ (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+                   "fmovq$cond %fcc0, $rs2, $rd",
+ [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>;
}
}
// Floating-Point Move Instructions, p. 164 of the V9 manual.
let Predicates = [HasV9] in {
- def FMOVD : F3_3<2, 0b110100, 0b000000010,
+ def FMOVD : F3_3u<2, 0b110100, 0b000000010,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fmovd $src, $dst", []>;
- def FNEGD : F3_3<2, 0b110100, 0b000000110,
+ def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src),
+ "fmovq $src, $dst", []>,
+ Requires<[HasHardQuad]>;
+ def FNEGD : F3_3u<2, 0b110100, 0b000000110,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fnegd $src, $dst",
[(set f64:$dst, (fneg f64:$src))]>;
- def FABSD : F3_3<2, 0b110100, 0b000001010,
+ def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src),
+ "fnegq $src, $dst",
+ [(set f128:$dst, (fneg f128:$src))]>,
+ Requires<[HasHardQuad]>;
+ def FABSD : F3_3u<2, 0b110100, 0b000001010,
(outs DFPRegs:$dst), (ins DFPRegs:$src),
"fabsd $src, $dst",
[(set f64:$dst, (fabs f64:$src))]>;
+ def FABSQ : F3_3u<2, 0b110100, 0b000001011,
+ (outs QFPRegs:$dst), (ins QFPRegs:$src),
+ "fabsq $src, $dst",
+ [(set f128:$dst, (fabs f128:$src))]>,
+ Requires<[HasHardQuad]>;
}
// POPCrr - This does a ctpop of a 64-bit register. As such, we have to clear
// the top 32-bits before using it. To do this clearing, we use a SLLri X,0.
-def POPCrr : F3_1<2, 0b101110,
- (outs IntRegs:$dst), (ins IntRegs:$src),
- "popc $src, $dst", []>, Requires<[HasV9]>;
+let rs1 = 0 in
+ def POPCrr : F3_1<2, 0b101110,
+ (outs IntRegs:$dst), (ins IntRegs:$src),
+ "popc $src, $dst", []>, Requires<[HasV9]>;
def : Pat<(ctpop i32:$src),
(POPCrr (SLLri $src, 0))>;
@@ -801,6 +992,14 @@ def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORri (i32 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (ADDri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+ (XORri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
// Blockaddress
def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
new file mode 100644
index 0000000..6493c7d
--- /dev/null
+++ b/lib/Target/Sparc/SparcJITInfo.cpp
@@ -0,0 +1,165 @@
+//===-- SparcJITInfo.cpp - Implement the Sparc JIT Interface --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Sparc target.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "jit"
+#include "SparcJITInfo.h"
+#include "SparcRelocations.h"
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Memory.h"
+
+using namespace llvm;
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+extern "C" void SparcCompilationCallback();
+
+extern "C" {
+#if defined (__sparc__)
+ asm(
+ ".text\n"
+ "\t.align 4\n"
+ "\t.global SparcCompilationCallback\n"
+ "\t.type SparcCompilationCallback, #function\n"
+ "SparcCompilationCallback:\n"
+ // Save current register window.
+ "\tsave %sp, -192, %sp\n"
+ // stubaddr+4 is in %g1.
+ "\tcall SparcCompilationCallbackC\n"
+ "\t sub %g1, 4, %o0\n"
+ // restore original register window and
+ // copy %o0 to %g1
+ "\t restore %o0, 0, %g1\n"
+ // call the new stub
+ "\tjmp %g1\n"
+ "\t nop\n"
+ "\t.size SparcCompilationCallback, .-SparcCompilationCallback"
+ );
+
+#else
+ void SparcCompilationCallback() {
+ llvm_unreachable(
+ "Cannot call SparcCompilationCallback() on a non-sparc arch!");
+ }
+#endif
+}
+
+#define HI(Val) (((unsigned)(Val)) >> 10)
+#define LO(Val) (((unsigned)(Val)) & 0x3FF)
+
+#define SETHI_INST(imm, rd) (0x01000000 | ((rd) << 25) | ((imm) & 0x3FFFFF))
+#define JMP_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x38 << 19) \
+ | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define NOP_INST SETHI_INST(0, 0)
+
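As a rough, hand-checked illustration (not part of the patch itself) of what these macros encode, a stub pointed at the hypothetical address 0x12345678 would consist of the three words below:

    static void exampleStubEncoding() {
      const unsigned Target = 0x12345678;
      unsigned W0 = SETHI_INST(HI(Target), 1);  // 0x03048D15  sethi %hi(0x12345678), %g1
      unsigned W1 = JMP_INST(1, LO(Target), 0); // 0x81C06278  jmp %g1+0x278
      unsigned W2 = NOP_INST;                   // 0x01000000  nop (sethi 0, %g0)
      (void)W0; (void)W1; (void)W2;             // values computed by hand, illustrative only
    }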
+extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
+ // Get the address of the compiled code for this function.
+ intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+ // Rewrite the function stub so that we don't end up here every time we
+ // execute the call. We're replacing the first three instructions of the
+ // stub with code that jumps to the compiled function:
+ // sethi %hi(NewVal), %g1
+ // jmp %g1+%lo(NewVal)
+ // nop
+
+ *(intptr_t *)(StubAddr) = SETHI_INST(HI(NewVal), 1);
+ *(intptr_t *)(StubAddr + 4) = JMP_INST(1, LO(NewVal), 0);
+ *(intptr_t *)(StubAddr + 8) = NOP_INST;
+
+ sys::Memory::InvalidateInstructionCache((void*) StubAddr, 12);
+ return (void*)StubAddr;
+}
+
+void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction");
+}
+
+
+TargetJITInfo::StubLayout SparcJITInfo::getStubLayout() {
+  // The stub contains three 4-byte instructions, aligned to 4 bytes. See
+ // emitFunctionStub for details.
+
+ StubLayout Result = { 3*4, 4 };
+ return Result;
+}
+
+void *SparcJITInfo::emitFunctionStub(const Function *F, void *Fn,
+ JITCodeEmitter &JCE)
+{
+ JCE.emitAlignment(4);
+ void *Addr = (void*) (JCE.getCurrentPCValue());
+ if (!sys::Memory::setRangeWritable(Addr, 12))
+ llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+ intptr_t EmittedAddr;
+ if (Fn != (void*)(intptr_t)SparcCompilationCallback)
+ EmittedAddr = (intptr_t)Fn;
+ else
+ EmittedAddr = (intptr_t)SparcCompilationCallback;
+
+ // sethi %hi(EmittedAddr), %g1
+ // jmp %g1+%lo(EmittedAddr), %g1
+ // nop
+
+ JCE.emitWordBE(SETHI_INST(HI(EmittedAddr), 1));
+ JCE.emitWordBE(JMP_INST(1, LO(EmittedAddr), 1));
+ JCE.emitWordBE(NOP_INST);
+
+ sys::Memory::InvalidateInstructionCache(Addr, 12);
+ if (!sys::Memory::setRangeExecutable(Addr, 12))
+ llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+ return Addr;
+}
+
+TargetJITInfo::LazyResolverFn
+SparcJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+ return SparcCompilationCallback;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void SparcJITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char *GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+ switch ((SP::RelocationType) MR->getRelocationType()) {
+ case SP::reloc_sparc_hi:
+ ResultPtr = (ResultPtr >> 10) & 0x3fffff;
+ break;
+
+ case SP::reloc_sparc_lo:
+ ResultPtr = (ResultPtr & 0x3ff);
+ break;
+
+ case SP::reloc_sparc_pc30:
+ ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffffff;
+ break;
+
+ case SP::reloc_sparc_pc22:
+ ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffff;
+ break;
+
+ case SP::reloc_sparc_pc19:
+ ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x7ffff;
+ break;
+ }
+ *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+ }
+}
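A small hand-worked example of the pc-relative cases above (illustrative only, not from the patch): a 22-bit branch at address 0x2000 whose target is 0x1000 is patched as follows.

    // (0x1000 - 0x2000) >> 2 == -1024 instruction words; masked to 22 bits
    // this is 0x3FFC00, which is then OR'd into the branch word.
    intptr_t Disp = ((intptr_t)0x1000 - (intptr_t)0x2000) >> 2;
    unsigned Field = (unsigned)Disp & 0x3fffff;   // 0x3FFC00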
diff --git a/lib/Target/Sparc/SparcJITInfo.h b/lib/Target/Sparc/SparcJITInfo.h
new file mode 100644
index 0000000..9c6e488
--- /dev/null
+++ b/lib/Target/Sparc/SparcJITInfo.h
@@ -0,0 +1,67 @@
+//==- SparcJITInfo.h - Sparc Implementation of the JIT Interface -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCJITINFO_H
+#define SPARCJITINFO_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class SparcTargetMachine;
+
+class SparcJITInfo : public TargetJITInfo {
+
+ bool IsPIC;
+
+ public:
+ explicit SparcJITInfo()
+ : IsPIC(false) {}
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ // getStubLayout - Returns the size and alignment of the largest call stub
+ // on Sparc.
+ virtual StubLayout getStubLayout();
+
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function *F, void *Fn,
+ JITCodeEmitter &JCE);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char *GOTBase);
+
+ /// Initialize - Initialize internal stage for the function being JITted.
+ void Initialize(const MachineFunction &MF, bool isPIC) {
+ IsPIC = isPIC;
+ }
+
+};
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index dc97f06..c98613a 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -40,8 +40,17 @@ SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
const {
- static const uint16_t CalleeSavedRegs[] = { 0 };
- return CalleeSavedRegs;
+ return CSR_SaveList;
+}
+
+const uint32_t*
+SparcRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ return CSR_RegMask;
+}
+
+const uint32_t*
+SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
+ return RTCSR_RegMask;
}
BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -65,6 +74,15 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(SP::G0);
Reserved.set(SP::G6);
Reserved.set(SP::G7);
+
+ // Unaliased double registers are not available in non-V9 targets.
+ if (!Subtarget.isV9()) {
+ for (unsigned n = 0; n != 16; ++n) {
+ for (MCRegAliasIterator AI(SP::D16 + n, this, true); AI.isValid(); ++AI)
+ Reserved.set(*AI);
+ }
+ }
+
return Reserved;
}
@@ -74,6 +92,62 @@ SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
}
+static void replaceFI(MachineFunction &MF,
+ MachineBasicBlock::iterator II,
+ MachineInstr &MI,
+ DebugLoc dl,
+ unsigned FIOperandNum, int Offset,
+ unsigned FramePtr)
+{
+ // Replace frame index with a frame pointer reference.
+ if (Offset >= -4096 && Offset <= 4095) {
+ // If the offset is small enough to fit in the immediate field, directly
+ // encode it.
+ MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ return;
+ }
+
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving G1 all of the time.
+ if (Offset >= 0) {
+    // Emit nonnegative immediates with sethi + or.
+ // sethi %hi(Offset), %g1
+ // add %g1, %fp, %g1
+ // Insert G1+%lo(offset) into the user.
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+ .addImm(HI22(Offset));
+
+ // Emit G1 = G1 + I6
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+ .addReg(FramePtr);
+ // Insert: G1+%lo(offset) into the user.
+ MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(LO10(Offset));
+ return;
+ }
+
+  // Emit negative immediates with sethi + xor.
+ // sethi %hix(Offset), %g1
+ // xor %g1, %lox(offset), %g1
+ // add %g1, %fp, %g1
+ // Insert: G1 + 0 into the user.
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+ .addImm(HIX22(Offset));
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::XORri), SP::G1)
+ .addReg(SP::G1).addImm(LOX10(Offset));
+
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+ .addReg(FramePtr);
+ // Insert: G1+%lo(offset) into the user.
+ MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+}
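For a concrete feel of the split above (illustrative only; HI22/LO10 are assumed to keep their usual meaning of the top 22 / low 10 bits of the value):

    int Offset = 8192;                       // too large for the 13-bit immediate
    unsigned Hi = (unsigned)Offset >> 10;    // 8 -> sethi 8, %g1
    unsigned Lo = (unsigned)Offset & 0x3ff;  // 0 -> user operand becomes [%g1 + 0]
    // the ADDrr in between folds the frame pointer back into %g1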
+
+
void
SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
@@ -98,35 +172,40 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Offset += (stackSize) ? Subtarget.getAdjustedFrameSize(stackSize) : 0 ;
}
- // Replace frame index with a frame pointer reference.
- if (Offset >= -4096 && Offset <= 4095) {
- // If the offset is small enough to fit in the immediate field, directly
- // encode it.
- MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
- } else {
- // Otherwise, emit a G1 = SETHI %hi(offset). FIXME: it would be better to
- // scavenge a register here instead of reserving G1 all of the time.
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
- unsigned OffHi = (unsigned)Offset >> 10U;
- BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
- // Emit G1 = G1 + I6
- BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
- .addReg(FramePtr);
- // Insert: G1+%lo(offset) into the user.
- MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset & ((1 << 10)-1));
+ if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
+ if (MI.getOpcode() == SP::STQFri) {
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned SrcReg = MI.getOperand(2).getReg();
+ unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+ unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64);
+ MachineInstr *StMI =
+ BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
+ .addReg(FramePtr).addImm(0).addReg(SrcEvenReg);
+ replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr);
+ MI.setDesc(TII.get(SP::STDFri));
+ MI.getOperand(2).setReg(SrcOddReg);
+ Offset += 8;
+ } else if (MI.getOpcode() == SP::LDQFri) {
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+ unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64);
+        MachineInstr *LdMI =
+          BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
+          .addReg(FramePtr).addImm(0);
+        replaceFI(MF, II, *LdMI, dl, 1, Offset, FramePtr);
+
+ MI.setDesc(TII.get(SP::LDDFri));
+ MI.getOperand(0).setReg(DestOddReg);
+ Offset += 8;
+ }
}
+
+ replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr);
+
}
unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return SP::I6;
}
-unsigned SparcRegisterInfo::getEHExceptionRegister() const {
- llvm_unreachable("What is the exception register");
-}
-
-unsigned SparcRegisterInfo::getEHHandlerRegister() const {
- llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 6b77d4e..00b5a98 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -32,6 +32,9 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+ const uint32_t* getCallPreservedMask(CallingConv::ID CC) const;
+
+ const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
BitVector getReservedRegs(const MachineFunction &MF) const;
@@ -47,10 +50,6 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const;
-
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index a59c442..2a575c0 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -11,8 +11,8 @@
// Declarations that describe the Sparc register file
//===----------------------------------------------------------------------===//
-class SparcReg<string n> : Register<n> {
- field bits<5> Num;
+class SparcReg<bits<16> Enc, string n> : Register<n> {
+ let HWEncoding = Enc;
let Namespace = "SP";
}
@@ -23,25 +23,31 @@ class SparcCtrlReg<string n>: Register<n> {
let Namespace = "SP" in {
def sub_even : SubRegIndex<32>;
def sub_odd : SubRegIndex<32, 32>;
+def sub_even64 : SubRegIndex<64>;
+def sub_odd64 : SubRegIndex<64, 64>;
}
// Registers are identified with 5-bit ID numbers.
// Ri - 32-bit integer registers
-class Ri<bits<5> num, string n> : SparcReg<n> {
- let Num = num;
-}
+class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
// Rf - 32-bit floating-point registers
-class Rf<bits<5> num, string n> : SparcReg<n> {
- let Num = num;
-}
+class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
// Rd - Slots in the FP register file for 64-bit floating-point values.
-class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
- let Num = num;
+class Rd<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
let SubRegs = subregs;
let SubRegIndices = [sub_even, sub_odd];
let CoveredBySubRegs = 1;
}
+// Rq - Slots in the FP register file for 128-bit floating-point values.
+class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+ let SubRegs = subregs;
+ let SubRegIndices = [sub_even64, sub_odd64];
+ let CoveredBySubRegs = 1;
+}
+
// Control Registers
def ICC : SparcCtrlReg<"ICC">; // This represents icc and xcc in 64-bit code.
def FCC : SparcCtrlReg<"FCC">;
@@ -135,6 +141,43 @@ def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
+// Unaliased double precision floating point registers.
+// FIXME: Define DwarfRegNum for these registers.
+def D16 : SparcReg< 1, "F32">;
+def D17 : SparcReg< 3, "F34">;
+def D18 : SparcReg< 5, "F36">;
+def D19 : SparcReg< 7, "F38">;
+def D20 : SparcReg< 9, "F40">;
+def D21 : SparcReg<11, "F42">;
+def D22 : SparcReg<13, "F44">;
+def D23 : SparcReg<15, "F46">;
+def D24 : SparcReg<17, "F48">;
+def D25 : SparcReg<19, "F50">;
+def D26 : SparcReg<21, "F52">;
+def D27 : SparcReg<23, "F54">;
+def D28 : SparcReg<25, "F56">;
+def D29 : SparcReg<27, "F58">;
+def D30 : SparcReg<29, "F60">;
+def D31 : SparcReg<31, "F62">;
+
+// Aliases of the F* registers used to hold 128-bit floating-point values
+// (long doubles).
+def Q0 : Rq< 0, "F0", [D0, D1]>;
+def Q1 : Rq< 4, "F4", [D2, D3]>;
+def Q2 : Rq< 8, "F8", [D4, D5]>;
+def Q3 : Rq<12, "F12", [D6, D7]>;
+def Q4 : Rq<16, "F16", [D8, D9]>;
+def Q5 : Rq<20, "F20", [D10, D11]>;
+def Q6 : Rq<24, "F24", [D12, D13]>;
+def Q7 : Rq<28, "F28", [D14, D15]>;
+def Q8 : Rq< 1, "F32", [D16, D17]>;
+def Q9 : Rq< 5, "F36", [D18, D19]>;
+def Q10 : Rq< 9, "F40", [D20, D21]>;
+def Q11 : Rq<13, "F44", [D22, D23]>;
+def Q12 : Rq<17, "F48", [D24, D25]>;
+def Q13 : Rq<21, "F52", [D26, D27]>;
+def Q14 : Rq<25, "F56", [D28, D29]>;
+def Q15 : Rq<29, "F60", [D30, D31]>;
+
// Register classes.
//
// FIXME: the register order should be defined in terms of the preferred
@@ -158,4 +201,6 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
// Floating point register classes.
def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
-def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
+
+def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
diff --git a/lib/Target/Sparc/SparcRelocations.h b/lib/Target/Sparc/SparcRelocations.h
new file mode 100644
index 0000000..388cfe7
--- /dev/null
+++ b/lib/Target/Sparc/SparcRelocations.h
@@ -0,0 +1,41 @@
+//===-- SparcRelocations.h - Sparc Code Relocations -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sparc target-specific relocation types
+// (for relocation-model=static).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_RELOCATIONS_H
+#define SPARC_RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace SP {
+ enum RelocationType {
+ // reloc_sparc_hi - upper 22 bits
+ reloc_sparc_hi = 1,
+
+ // reloc_sparc_lo - lower 10 bits
+ reloc_sparc_lo = 2,
+
+ // reloc_sparc_pc30 - pc rel. 30 bits for call
+ reloc_sparc_pc30 = 3,
+
+ // reloc_sparc_pc22 - pc rel. 22 bits for branch
+ reloc_sparc_pc22 = 4,
+
+      // reloc_sparc_pc19 - pc rel. 19 bits for branch with icc/xcc
+ reloc_sparc_pc19 = 5
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index f9ce098..7d09d0e 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -30,7 +30,8 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
IsV9(false),
V8DeprecatedInsts(false),
IsVIS(false),
- Is64Bit(is64Bit) {
+ Is64Bit(is64Bit),
+ HasHardQuad(false) {
// Determine default and user specified characteristics
std::string CPUName = CPU;
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index 2bf599d..0f81cc9 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -29,6 +29,7 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
bool V8DeprecatedInsts;
bool IsVIS;
bool Is64Bit;
+ bool HasHardQuad;
public:
SparcSubtarget(const std::string &TT, const std::string &CPU,
@@ -37,6 +38,7 @@ public:
bool isV9() const { return IsV9; }
bool isVIS() const { return IsVIS; }
bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+ bool hasHardQuad() const { return HasHardQuad; }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index a7355f4..0f93674 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -65,6 +65,13 @@ bool SparcPassConfig::addInstSelector() {
return false;
}
+bool SparcTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ JITCodeEmitter &JCE) {
+ // Machine code emitter pass for Sparc.
+ PM.add(createSparcJITCodeEmitterPass(*this, JCE));
+ return false;
+}
+
/// addPreEmitPass - This pass may be implemented by targets that want to run
/// passes immediately before machine code is emitted. This should return
/// true if -print-machineinstrs should print out the code after the passes.
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 081075d..8c9bcd3 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -17,6 +17,7 @@
#include "SparcFrameLowering.h"
#include "SparcISelLowering.h"
#include "SparcInstrInfo.h"
+#include "SparcJITInfo.h"
#include "SparcSelectionDAGInfo.h"
#include "SparcSubtarget.h"
#include "llvm/IR/DataLayout.h"
@@ -32,6 +33,7 @@ class SparcTargetMachine : public LLVMTargetMachine {
SparcTargetLowering TLInfo;
SparcSelectionDAGInfo TSInfo;
SparcFrameLowering FrameLowering;
+ SparcJITInfo JITInfo;
public:
SparcTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -52,10 +54,14 @@ public:
virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
return &TSInfo;
}
+ virtual SparcJITInfo *getJITInfo() {
+ return &JITInfo;
+ }
virtual const DataLayout *getDataLayout() const { return &DL; }
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+ virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
};
/// SparcV8TargetMachine - Sparc 32-bit target machine
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index bb71463..4eea163 100644
--- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -15,7 +15,9 @@ using namespace llvm;
Target llvm::TheSparcTarget;
Target llvm::TheSparcV9Target;
-extern "C" void LLVMInitializeSparcTargetInfo() {
- RegisterTarget<Triple::sparc> X(TheSparcTarget, "sparc", "Sparc");
- RegisterTarget<Triple::sparcv9> Y(TheSparcV9Target, "sparcv9", "Sparc V9");
+extern "C" void LLVMInitializeSparcTargetInfo() {
+ RegisterTarget<Triple::sparc, /*HasJIT=*/ true>
+ X(TheSparcTarget, "sparc", "Sparc");
+ RegisterTarget<Triple::sparcv9, /*HasJIT=*/ true>
+ Y(TheSparcV9Target, "sparcv9", "Sparc V9");
}
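With HasJIT now set for both triples, the old JIT can in principle be asked for a Sparc engine. A minimal sketch, assuming the usual LLVM 3.x EngineBuilder API (the module and error handling here are placeholders, not taken from the patch):

    #include "llvm/ExecutionEngine/ExecutionEngine.h"
    #include "llvm/ExecutionEngine/JIT.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/TargetSelect.h"

    llvm::ExecutionEngine *makeJIT(llvm::Module *M) {
      llvm::InitializeAllTargets();      // picks up the registrations above
      llvm::InitializeAllTargetMCs();
      std::string Err;
      return llvm::EngineBuilder(M).setErrorStr(&Err)
                                   .setEngineKind(llvm::EngineKind::JIT)
                                   .create();
    }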
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 58af2c4..763f40c 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -32,6 +32,7 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
namespace {
enum RegisterKind {
GR32Reg,
+ GRH32Reg,
GR64Reg,
GR128Reg,
ADDR32Reg,
@@ -262,6 +263,8 @@ public:
// Used by the TableGen code to check for particular operand types.
bool isGR32() const { return isReg(GR32Reg); }
+ bool isGRH32() const { return isReg(GRH32Reg); }
+ bool isGRX32() const { return false; }
bool isGR64() const { return isReg(GR64Reg); }
bool isGR128() const { return isReg(GR128Reg); }
bool isADDR32() const { return isReg(ADDR32Reg); }
@@ -327,8 +330,9 @@ private:
StringRef Mnemonic);
public:
- SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
MCAsmParserExtension::Initialize(Parser);
// Initialize the set of available features.
@@ -355,6 +359,14 @@ public:
return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
}
OperandMatchResultTy
+ parseGRH32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+ }
+ OperandMatchResultTy
+ parseGRX32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ llvm_unreachable("GRX32 should only be used for pseudo instructions");
+ }
+ OperandMatchResultTy
parseGR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
}
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index ab657f6..d21c0a8 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -21,9 +21,11 @@ add_llvm_target(SystemZCodeGen
SystemZISelLowering.cpp
SystemZInstrInfo.cpp
SystemZLongBranch.cpp
+ SystemZMachineFunctionInfo.cpp
SystemZMCInstLower.cpp
SystemZRegisterInfo.cpp
SystemZSelectionDAGInfo.cpp
+ SystemZShortenInst.cpp
SystemZSubtarget.cpp
SystemZTargetMachine.cpp
)
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 79469b6..fc3c38d 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -48,14 +48,11 @@ extern "C" void LLVMInitializeSystemZDisassembler() {
}
static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
- const unsigned *Regs,
- bool isAddress = false) {
+ const unsigned *Regs) {
assert(RegNo < 16 && "Invalid register");
- if (!isAddress || RegNo) {
- RegNo = Regs[RegNo];
- if (RegNo == 0)
- return MCDisassembler::Fail;
- }
+ RegNo = Regs[RegNo];
+ if (RegNo == 0)
+ return MCDisassembler::Fail;
Inst.addOperand(MCOperand::CreateReg(RegNo));
return MCDisassembler::Success;
}
@@ -66,6 +63,12 @@ static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs);
}
+static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs);
+}
+
static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
@@ -81,7 +84,7 @@ static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
uint64_t Address,
const void *Decoder) {
- return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs, true);
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
}
static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 37ebff3..e1e64d3 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -124,18 +124,6 @@ void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
O << *MO.getExpr();
}
-void SystemZInstPrinter::printCallOperand(const MCInst *MI, int OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
- if (MO.isImm()) {
- O << "0x";
- O.write_hex(MO.getImm());
- } else {
- O << *MO.getExpr();
- O << "@PLT";
- }
-}
-
void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
raw_ostream &O) {
printOperand(MI->getOperand(OpNum), O);
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 30cdee5..734ecf0 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -58,7 +58,6 @@ private:
void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
- void printCallOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O);
// Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 027db44..26a8fae 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -35,15 +35,6 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
llvm_unreachable("Unknown fixup kind!");
}
-// If Opcode is a relaxable interprocedural reference, return the relaxed form,
-// otherwise return 0.
-static unsigned getRelaxedOpcode(unsigned Opcode) {
- switch (Opcode) {
- case SystemZ::BRAS: return SystemZ::BRASL;
- }
- return 0;
-}
-
namespace {
class SystemZMCAsmBackend : public MCAsmBackend {
uint8_t OSABI;
@@ -59,14 +50,20 @@ public:
LLVM_OVERRIDE;
virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value) const LLVM_OVERRIDE;
- virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE;
+ virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE {
+ return false;
+ }
virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
const MCRelaxableFragment *Fragment,
const MCAsmLayout &Layout) const
- LLVM_OVERRIDE;
+ LLVM_OVERRIDE {
+ return false;
+ }
virtual void relaxInstruction(const MCInst &Inst,
- MCInst &Res) const LLVM_OVERRIDE;
+ MCInst &Res) const LLVM_OVERRIDE {
+ llvm_unreachable("SystemZ does do not have assembler relaxation");
+ }
virtual bool writeNopData(uint64_t Count,
MCObjectWriter *OW) const LLVM_OVERRIDE;
virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const
@@ -114,28 +111,6 @@ void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
}
-bool SystemZMCAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
- return getRelaxedOpcode(Inst.getOpcode()) != 0;
-}
-
-bool
-SystemZMCAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
- uint64_t Value,
- const MCRelaxableFragment *Fragment,
- const MCAsmLayout &Layout) const {
- // At the moment we just need to relax 16-bit fields to wider fields.
- Value = extractBitsForFixup(Fixup.getKind(), Value);
- return (int16_t)Value != (int64_t)Value;
-}
-
-void SystemZMCAsmBackend::relaxInstruction(const MCInst &Inst,
- MCInst &Res) const {
- unsigned Opcode = getRelaxedOpcode(Inst.getOpcode());
- assert(Opcode && "Unexpected insn to relax");
- Res = Inst;
- Res.setOpcode(Opcode);
-}
-
bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
MCObjectWriter *OW) const {
for (uint64_t I = 0; I != Count; ++I)
@@ -143,8 +118,9 @@ bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
return true;
}
-MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T, StringRef TT,
- StringRef CPU) {
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU) {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
return new SystemZMCAsmBackend(OSABI);
}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 9e27aa0..965c41e 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -19,10 +19,8 @@ SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
IsLittleEndian = false;
CommentString = "#";
- PCSymbol = ".";
GlobalPrefix = "";
PrivateGlobalPrefix = ".L";
- WeakRefDirective = "\t.weak\t";
ZeroDirective = "\t.space\t";
Data64bitsDirective = "\t.quad\t";
UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index d440787..b9ac92a 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -10,13 +10,13 @@
#ifndef SystemZTARGETASMINFO_H
#define SystemZTARGETASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
class StringRef;
-class SystemZMCAsmInfo : public MCAsmInfo {
+class SystemZMCAsmInfo : public MCAsmInfoELF {
public:
explicit SystemZMCAsmInfo(StringRef TT);
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index bda7714..f07ea7b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -79,14 +79,6 @@ private:
SmallVectorImpl<MCFixup> &Fixups) const {
return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
}
- uint64_t getPLT16DBLEncoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT16DBL, 2);
- }
- uint64_t getPLT32DBLEncoding(const MCInst &MI, unsigned OpNum,
- SmallVectorImpl<MCFixup> &Fixups) const {
- return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT32DBL, 2);
- }
};
}
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 3653192..9e1296b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -28,10 +28,17 @@
using namespace llvm;
const unsigned SystemZMC::GR32Regs[16] = {
- SystemZ::R0W, SystemZ::R1W, SystemZ::R2W, SystemZ::R3W,
- SystemZ::R4W, SystemZ::R5W, SystemZ::R6W, SystemZ::R7W,
- SystemZ::R8W, SystemZ::R9W, SystemZ::R10W, SystemZ::R11W,
- SystemZ::R12W, SystemZ::R13W, SystemZ::R14W, SystemZ::R15W
+ SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+ SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+ SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+ SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
+};
+
+const unsigned SystemZMC::GRH32Regs[16] = {
+ SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+ SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+ SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+ SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
};
const unsigned SystemZMC::GR64Regs[16] = {
@@ -69,6 +76,24 @@ const unsigned SystemZMC::FP128Regs[16] = {
SystemZ::F12Q, SystemZ::F13Q, 0, 0
};
+unsigned SystemZMC::getFirstReg(unsigned Reg) {
+ static unsigned Map[SystemZ::NUM_TARGET_REGS];
+ static bool Initialized = false;
+ if (!Initialized) {
+ for (unsigned I = 0; I < 16; ++I) {
+ Map[GR32Regs[I]] = I;
+ Map[GRH32Regs[I]] = I;
+ Map[GR64Regs[I]] = I;
+ Map[GR128Regs[I]] = I;
+ Map[FP32Regs[I]] = I;
+ Map[FP64Regs[I]] = I;
+ Map[FP128Regs[I]] = I;
+    }
+    Initialized = true;
+  }
+ assert(Reg < SystemZ::NUM_TARGET_REGS);
+ return Map[Reg];
+}
+
static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
StringRef TT) {
MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
@@ -162,7 +187,7 @@ static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
MCCodeEmitter *Emitter,
bool RelaxAll,
bool NoExecStack) {
- return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+ return createELFStreamer(Ctx, 0, MAB, OS, Emitter, RelaxAll, NoExecStack);
}
extern "C" void LLVMInitializeSystemZTargetMC() {
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 3c9f0cb..97e325b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -42,11 +42,31 @@ namespace SystemZMC {
// as %r0-%r15. It seems better to provide the same interface for
// all classes though.
extern const unsigned GR32Regs[16];
+ extern const unsigned GRH32Regs[16];
extern const unsigned GR64Regs[16];
extern const unsigned GR128Regs[16];
extern const unsigned FP32Regs[16];
extern const unsigned FP64Regs[16];
extern const unsigned FP128Regs[16];
+
+ // Return the 0-based number of the first architectural register that
+ // contains the given LLVM register. E.g. R1D -> 1.
+ unsigned getFirstReg(unsigned Reg);
+
+ // Return the given register as a GR64.
+ inline unsigned getRegAsGR64(unsigned Reg) {
+ return GR64Regs[getFirstReg(Reg)];
+ }
+
+ // Return the given register as a low GR32.
+ inline unsigned getRegAsGR32(unsigned Reg) {
+ return GR32Regs[getFirstReg(Reg)];
+ }
+
+ // Return the given register as a high GR32.
+ inline unsigned getRegAsGRH32(unsigned Reg) {
+ return GRH32Regs[getFirstReg(Reg)];
+ }
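Taken together (illustrative only, not from the patch, and assuming the register names defined in SystemZRegisterInfo.td), the mapping lets any view of a GPR be converted to a sibling view:

    unsigned A = SystemZMC::getRegAsGR64(SystemZ::R2L);   // SystemZ::R2D
    unsigned B = SystemZMC::getRegAsGRH32(SystemZ::R2L);  // SystemZ::R2H
    unsigned C = SystemZMC::getRegAsGR32(SystemZ::R3H);   // SystemZ::R3L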
}
MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
@@ -54,8 +74,9 @@ MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createSystemZMCAsmBackend(const Target &T, StringRef TT,
- StringRef CPU);
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
MCObjectWriter *createSystemZObjectWriter(raw_ostream &OS, uint8_t OSABI);
} // end namespace llvm
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index 2782b63..afa6cf0 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -35,19 +35,11 @@ performance measurements.
--
-We don't support tail calls at present.
-
---
-
-We don't support prefetching yet.
-
---
-
There is no scheduling support.
--
-We don't use the BRANCH ON COUNT or BRANCH ON INDEX families of instruction.
+We don't use the BRANCH ON INDEX instructions.
--
@@ -56,18 +48,7 @@ and conditional returns.
--
-We don't use the condition code results of anything except comparisons.
-
-Implementing this may need something more finely grained than the z_cmp
-and z_ucmp that we have now. It might (or might not) also be useful to
-have a mask of "don't care" values in conditional branches. For example,
-integer comparisons never set CC to 3, so the bottom bit of the CC mask
-isn't particularly relevant. JNLH and JE are equally good for testing
-equality after an integer comparison, etc.
-
---
-
-We don't use the LOAD AND TEST or TEST DATA CLASS instructions.
+We don't use the TEST DATA CLASS instructions.
--
@@ -77,20 +58,16 @@ condition codes. For example, we could use LCDFR instead of LCDBR.
--
-We don't optimize block memory operations.
+We only use MVC, XC and CLC for constant-length block operations.
+We could extend them to variable-length operations too,
+using EXECUTE RELATIVE LONG.
-It's definitely worth using things like MVC, CLC, NC, XC and OC with
-constant lengths. MVCIN may be worthwhile too.
-
-We should probably implement things like memcpy using MVC with EXECUTE.
-Likewise memcmp and CLC. MVCLE and CLCLE could be useful too.
+MVCIN, MVCLE and CLCLE may be worthwhile too.
--
-We don't optimize string operations.
-
-MVST, CLST, SRST and CUSE could be useful here. Some of the TRANSLATE
-family might be too, although they are probably more difficult to exploit.
+We don't use CUSE or the TRANSLATE family of instructions for string
+operations. The TRANSLATE ones are probably more difficult to exploit.
--
@@ -113,14 +90,7 @@ We don't use the halfword forms of LOAD REVERSED and STORE REVERSED
--
-We could take advantage of the various ... UNDER MASK instructions,
-such as ICM and STCM.
-
---
-
-DAGCombiner can detect integer absolute, but there's not yet an associated
-ISD opcode. We could add one and implement it using LOAD POSITIVE.
-Negated absolutes could use LOAD NEGATIVE.
+We don't use ICM or STCM.
--
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index eccc2aa..dcebbad 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -52,6 +52,29 @@ namespace llvm {
const unsigned CCMASK_CS_NE = CCMASK_1;
const unsigned CCMASK_CS = CCMASK_0 | CCMASK_1;
+ // Condition-code mask assignments for a completed SRST loop.
+ const unsigned CCMASK_SRST_FOUND = CCMASK_1;
+ const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+ const unsigned CCMASK_SRST = CCMASK_1 | CCMASK_2;
+
+ // Condition-code mask assignments for TEST UNDER MASK.
+ const unsigned CCMASK_TM_ALL_0 = CCMASK_0;
+ const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+ const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+ const unsigned CCMASK_TM_ALL_1 = CCMASK_3;
+ const unsigned CCMASK_TM_SOME_0 = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+ const unsigned CCMASK_TM_SOME_1 = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+ const unsigned CCMASK_TM_MSB_0 = CCMASK_0 | CCMASK_1;
+ const unsigned CCMASK_TM_MSB_1 = CCMASK_2 | CCMASK_3;
+ const unsigned CCMASK_TM = CCMASK_ANY;
+
+ // The position of the low CC bit in an IPM result.
+ const unsigned IPM_CC = 28;
+
+ // Mask assignments for PFD.
+ const unsigned PFD_READ = 1;
+ const unsigned PFD_WRITE = 2;
+
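One way to see how IPM_CC is meant to be used (a sketch, not part of the patch, assuming these constants live in the SystemZ namespace as elsewhere in the backend): after an IPM instruction deposits the condition code, the 2-bit CC value can be recovered from the 32-bit result like this:

    static inline unsigned ccFromIPM(unsigned IPMValue) {
      return (IPMValue >> SystemZ::IPM_CC) & 3;  // 0..3, the CC tested by the masks above
    }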
// Return true if Val fits an LLILL operand.
static inline bool isImmLL(uint64_t Val) {
return (Val & ~0x000000000000ffffULL) == 0;
@@ -86,6 +109,7 @@ namespace llvm {
FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+ FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
} // end namespace llvm;
#endif
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 3a57ea0..75cbda4 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -19,16 +19,142 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/Mangler.h"
using namespace llvm;
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GR32s.
+static MCInst lowerRILow(const MachineInstr *MI, unsigned Opcode) {
+ if (MI->isCompare())
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(1).getImm());
+ else
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(1).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GRH32s.
+static MCInst lowerRIHigh(const MachineInstr *MI, unsigned Opcode) {
+ if (MI->isCompare())
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(1).getImm());
+ else
+ return MCInstBuilder(Opcode)
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(1).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RIE-f instruction like MI with opcode Opcode, but with the
+// R2 register turned into a GR64.
+static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
+ return MCInstBuilder(Opcode)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()))
+ .addImm(MI->getOperand(3).getImm())
+ .addImm(MI->getOperand(4).getImm())
+ .addImm(MI->getOperand(5).getImm());
+}
+
void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+ SystemZMCInstLower Lower(MF->getContext(), *this);
MCInst LoweredMI;
- Lower.lower(MI, LoweredMI);
+ switch (MI->getOpcode()) {
+ case SystemZ::Return:
+ LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+ break;
+
+ case SystemZ::CallBRASL:
+ LoweredMI = MCInstBuilder(SystemZ::BRASL)
+ .addReg(SystemZ::R14D)
+ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+ break;
+
+ case SystemZ::CallBASR:
+ LoweredMI = MCInstBuilder(SystemZ::BASR)
+ .addReg(SystemZ::R14D)
+ .addReg(MI->getOperand(0).getReg());
+ break;
+
+ case SystemZ::CallJG:
+ LoweredMI = MCInstBuilder(SystemZ::JG)
+ .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+ break;
+
+ case SystemZ::CallBR:
+ LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+ break;
+
+ case SystemZ::IILF64:
+ LoweredMI = MCInstBuilder(SystemZ::IILF)
+ .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+ break;
+
+ case SystemZ::IIHF64:
+ LoweredMI = MCInstBuilder(SystemZ::IIHF)
+ .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+ .addImm(MI->getOperand(2).getImm());
+ break;
+
+ case SystemZ::RISBHH:
+ case SystemZ::RISBHL:
+ LoweredMI = lowerRIEfLow(MI, SystemZ::RISBHG);
+ break;
+
+ case SystemZ::RISBLH:
+ case SystemZ::RISBLL:
+ LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
+ break;
+
+#define LOWER_LOW(NAME) \
+ case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
+
+ LOWER_LOW(IILL);
+ LOWER_LOW(IILH);
+ LOWER_LOW(TMLL);
+ LOWER_LOW(TMLH);
+ LOWER_LOW(NILL);
+ LOWER_LOW(NILH);
+ LOWER_LOW(NILF);
+ LOWER_LOW(OILL);
+ LOWER_LOW(OILH);
+ LOWER_LOW(OILF);
+ LOWER_LOW(XILF);
+
+#undef LOWER_LOW
+
+#define LOWER_HIGH(NAME) \
+ case SystemZ::NAME##64: LoweredMI = lowerRIHigh(MI, SystemZ::NAME); break
+
+ LOWER_HIGH(IIHL);
+ LOWER_HIGH(IIHH);
+ LOWER_HIGH(TMHL);
+ LOWER_HIGH(TMHH);
+ LOWER_HIGH(NIHL);
+ LOWER_HIGH(NIHH);
+ LOWER_HIGH(NIHF);
+ LOWER_HIGH(OIHL);
+ LOWER_HIGH(OIHH);
+ LOWER_HIGH(OIHF);
+ LOWER_HIGH(XIHF);
+
+#undef LOWER_HIGH
+
+ default:
+ Lower.lower(MI, LoweredMI);
+ break;
+ }
OutStreamer.EmitInstruction(LoweredMI);
}
@@ -48,7 +174,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
static_cast<SystemZConstantPoolValue*>(MCPV);
const MCExpr *Expr =
- MCSymbolRefExpr::Create(Mang->getSymbol(ZCPV->getGlobalValue()),
+ MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
getModifierVariantKind(ZCPV->getModifier()),
OutContext);
uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
@@ -66,7 +192,7 @@ bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
return true;
OS << -int64_t(MI->getOperand(OpNo).getImm());
} else {
- SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+ SystemZMCInstLower Lower(MF->getContext(), *this);
MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
SystemZInstPrinter::printOperand(MO, OS);
}
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index c2d727f..c4f641e 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -23,7 +23,7 @@ def RetCC_SystemZ : CallingConv<[
// call-clobbered argument registers available for code that doesn't
// care about the ABI. (R6 is an argument register too, but is
// call-saved and therefore not suitable for return values.)
- CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W]>>,
+ CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L]>>,
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
// ABI-compliant code returns float and double in F0. Make the
@@ -53,7 +53,7 @@ def CC_SystemZ : CallingConv<[
// The first 5 integer arguments are passed in R2-R6. Note that R6
// is call-saved.
- CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W, R6W]>>,
+ CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L, R6L]>>,
CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
// The first 4 float and double arguments are passed in even registers F0-F6.
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index e9c4f6d..6c70811 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -39,7 +39,7 @@ unsigned SystemZConstantPoolValue::getRelocationInfo() const {
int SystemZConstantPoolValue::
getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
unsigned AlignMask = Alignment - 1;
- const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+ const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
if (Constants[I].isMachineConstantPoolEntry() &&
(Constants[I].getAlignment() & AlignMask) == 0) {
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index a58da90..acfb491 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -111,7 +111,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
const SystemZTargetMachine &TM,
unsigned GPR64, bool IsImplicit) {
const SystemZRegisterInfo *RI = TM.getRegisterInfo();
- unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_32bit);
+ unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
if (!IsLive || !IsImplicit) {
MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
@@ -420,8 +420,7 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
// Skip the return instruction.
- assert(MBBI->getOpcode() == SystemZ::RET &&
- "Can only insert epilogue into returning blocks");
+ assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
uint64_t StackSize = getAllocatedStackSize(MF);
if (ZFI->getLowSavedGPR()) {
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index d9794b1..f4a2773 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -107,7 +107,8 @@ static uint64_t allOnes(unsigned int Count) {
//
// (and (rotl Input, Rotate), Mask)
//
-// otherwise. The value has BitSize bits.
+// otherwise. The output value has BitSize bits, although Input may be
+// narrower (in which case the upper bits are don't-care).
struct RxSBGOperands {
RxSBGOperands(unsigned Op, SDValue N)
: Opcode(Op), BitSize(N.getValueType().getSizeInBits()),
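The rotate-and-mask form described in the comment above is plain bit arithmetic, so it can be checked outside the backend. The standalone sketch below (not part of the patch; the helper names are invented for illustration) verifies that a left shift is the same as a rotate left followed by an AND with ~0 shifted up by the same amount, which is the algebra the RxSBG folding relies on.

    // Standalone sketch: expressing (x << count) as (rotl(x, count) & mask),
    // mirroring the (and (rotl Input, Rotate), Mask) form used by RxSBGOperands.
    // Hypothetical helper names; not taken from the patch.
    #include <cassert>
    #include <cstdint>

    static uint64_t allOnes64(unsigned count) {
      return count == 0 ? 0 : (~uint64_t(0) >> (64 - count));
    }

    static uint64_t rotl64(uint64_t value, unsigned count) {
      count &= 63;
      return count ? (value << count) | (value >> (64 - count)) : value;
    }

    int main() {
      uint64_t x = 0x123456789abcdef0ULL;
      unsigned count = 12;
      // (x << count) keeps the top 64-count bits of x in the rotated value
      // and forces the bottom count bits to zero, i.e. mask = ~0 << count.
      uint64_t mask = allOnes64(64 - count) << count;
      assert((x << count) == (rotl64(x, count) & mask));
      return 0;
    }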
@@ -128,7 +129,7 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
const SystemZSubtarget &Subtarget;
// Used by SystemZOperands.td to create integer constants.
- inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+ inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
}
@@ -142,33 +143,39 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to fold more of the base or index of AM into AM, where IsBase
// selects between the base and index.
- bool expandAddress(SystemZAddressingMode &AM, bool IsBase);
+ bool expandAddress(SystemZAddressingMode &AM, bool IsBase) const;
// Try to describe N in AM, returning true on success.
- bool selectAddress(SDValue N, SystemZAddressingMode &AM);
+ bool selectAddress(SDValue N, SystemZAddressingMode &AM) const;
// Extract individual target operands from matched address AM.
void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
- SDValue &Base, SDValue &Disp);
+ SDValue &Base, SDValue &Disp) const;
void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
- SDValue &Base, SDValue &Disp, SDValue &Index);
+ SDValue &Base, SDValue &Disp, SDValue &Index) const;
// Try to match Addr as a FormBD address with displacement type DR.
// Return true on success, storing the base and displacement in
// Base and Disp respectively.
bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
- SDValue &Base, SDValue &Disp);
+ SDValue &Base, SDValue &Disp) const;
+
+ // Try to match Addr as a FormBDX address with displacement type DR.
+ // Return true on success, but only if the match uses no index register.
+ // Store the base and displacement in Base and Disp respectively.
+ bool selectMVIAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+ SDValue &Base, SDValue &Disp) const;
// Try to match Addr as a FormBDX* address of form Form with
// displacement type DR. Return true on success, storing the base,
// displacement and index in Base, Disp and Index respectively.
bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
SystemZAddressingMode::DispRange DR, SDValue Addr,
- SDValue &Base, SDValue &Disp, SDValue &Index);
+ SDValue &Base, SDValue &Disp, SDValue &Index) const;
// PC-relative address matching routines used by SystemZOperands.td.
- bool selectPCRelAddress(SDValue Addr, SDValue &Target) {
- if (Addr.getOpcode() == SystemZISD::PCREL_WRAPPER) {
+ bool selectPCRelAddress(SDValue Addr, SDValue &Target) const {
+ if (SystemZISD::isPCREL(Addr.getOpcode())) {
Target = Addr.getOperand(0);
return true;
}
@@ -176,64 +183,72 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
}
// BD matching routines used by SystemZOperands.td.
- bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
}
- bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
}
- bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
}
- bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+ bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
}
+ // MVI matching routines used by SystemZOperands.td.
+ bool selectMVIAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectMVIAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+ }
+ bool selectMVIAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+ return selectMVIAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+ }
+
// BDX matching routines used by SystemZOperands.td.
bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
SystemZAddressingMode::Disp12Only,
Addr, Base, Disp, Index);
}
bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
SystemZAddressingMode::Disp12Pair,
Addr, Base, Disp, Index);
}
bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
SystemZAddressingMode::Disp12Only,
Addr, Base, Disp, Index);
}
bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
SystemZAddressingMode::Disp20Only,
Addr, Base, Disp, Index);
}
bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
SystemZAddressingMode::Disp20Only128,
Addr, Base, Disp, Index);
}
bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
SystemZAddressingMode::Disp20Pair,
Addr, Base, Disp, Index);
}
bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
SystemZAddressingMode::Disp12Pair,
Addr, Base, Disp, Index);
}
bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
- SDValue &Index) {
+ SDValue &Index) const {
return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
SystemZAddressingMode::Disp20Pair,
Addr, Base, Disp, Index);
@@ -242,21 +257,21 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Check whether (or Op (and X InsertMask)) is effectively an insertion
// of X into bits InsertMask of some Y != Op. Return true if so and
// set Op to that Y.
- bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask);
+ bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const;
// Try to update RxSBG so that only the bits of RxSBG.Input in Mask are used.
// Return true on success.
- bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask);
+ bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) const;
// Try to fold some of RxSBG.Input into other fields of RxSBG.
// Return true on success.
- bool expandRxSBG(RxSBGOperands &RxSBG);
+ bool expandRxSBG(RxSBGOperands &RxSBG) const;
- // Return an undefined i64 value.
- SDValue getUNDEF64(SDLoc DL);
+ // Return an undefined value of type VT.
+ SDValue getUNDEF(SDLoc DL, EVT VT) const;
// Convert N to VT, if it isn't already.
- SDValue convertTo(SDLoc DL, EVT VT, SDValue N);
+ SDValue convertTo(SDLoc DL, EVT VT, SDValue N) const;
// Try to implement AND or shift node N using RISBG with the zero flag set.
// Return the selected node on success, otherwise return null.
@@ -276,8 +291,26 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
uint64_t UpperVal, uint64_t LowerVal);
+ // Return true if Load and Store are loads and stores of the same size
+ // and are guaranteed not to overlap. Such operations can be implemented
+ // using block (SS-format) instructions.
+ //
+ // Partial overlap would lead to incorrect code, since the block operations
+ // are logically bytewise, even though they have a fast path for the
+ // non-overlapping case. We also need to avoid full overlap (i.e. two
+ // addresses that might be equal at run time) because although that case
+ // would be handled correctly, it might be implemented by millicode.
+ bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
+ // N is a (store (load Y), X) pattern. Return true if it can use an MVC
+ // from Y to X.
bool storeLoadCanUseMVC(SDNode *N) const;
+ // N is a (store (op (load A[0]), (load A[1])), X) pattern. Return true
+ // if A[1 - I] == X and if N can use a block operation like NC from A[I]
+ // to X.
+ bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+
public:
SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
: SelectionDAGISel(TM, OptLevel),
@@ -363,9 +396,9 @@ static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
// The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
// between the base and index. Try to fold Op1 into AM's displacement.
static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
- SDValue Op0, ConstantSDNode *Op1) {
+ SDValue Op0, uint64_t Op1) {
// First try adjusting the displacement.
- int64_t TestDisp = AM.Disp + Op1->getSExtValue();
+ int64_t TestDisp = AM.Disp + Op1;
if (selectDisp(AM.DR, TestDisp)) {
changeComponent(AM, IsBase, Op0);
AM.Disp = TestDisp;
@@ -378,7 +411,7 @@ static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
}
bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
- bool IsBase) {
+ bool IsBase) const {
SDValue N = IsBase ? AM.Base : AM.Index;
unsigned Opcode = N.getOpcode();
if (Opcode == ISD::TRUNCATE) {
@@ -398,13 +431,23 @@ bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
return expandAdjDynAlloc(AM, IsBase, Op0);
if (Op0Code == ISD::Constant)
- return expandDisp(AM, IsBase, Op1, cast<ConstantSDNode>(Op0));
+ return expandDisp(AM, IsBase, Op1,
+ cast<ConstantSDNode>(Op0)->getSExtValue());
if (Op1Code == ISD::Constant)
- return expandDisp(AM, IsBase, Op0, cast<ConstantSDNode>(Op1));
+ return expandDisp(AM, IsBase, Op0,
+ cast<ConstantSDNode>(Op1)->getSExtValue());
if (IsBase && expandIndex(AM, Op0, Op1))
return true;
}
+ if (Opcode == SystemZISD::PCREL_OFFSET) {
+ SDValue Full = N.getOperand(0);
+ SDValue Base = N.getOperand(1);
+ SDValue Anchor = Base.getOperand(0);
+ uint64_t Offset = (cast<GlobalAddressSDNode>(Full)->getOffset() -
+ cast<GlobalAddressSDNode>(Anchor)->getOffset());
+ return expandDisp(AM, IsBase, Base, Offset);
+ }
return false;
}
@@ -483,14 +526,15 @@ static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
// Return true if Addr is suitable for AM, updating AM if so.
bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
- SystemZAddressingMode &AM) {
+ SystemZAddressingMode &AM) const {
// Start out assuming that the address will need to be loaded separately,
// then try to extend it as much as we can.
AM.Base = Addr;
// First try treating the address as a constant.
if (Addr.getOpcode() == ISD::Constant &&
- expandDisp(AM, true, SDValue(), cast<ConstantSDNode>(Addr)))
+ expandDisp(AM, true, SDValue(),
+ cast<ConstantSDNode>(Addr)->getSExtValue()))
;
else
// Otherwise try expanding each component.
@@ -530,7 +574,7 @@ static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
EVT VT, SDValue &Base,
- SDValue &Disp) {
+ SDValue &Disp) const {
Base = AM.Base;
if (!Base.getNode())
// Register 0 means "no base". This is mostly useful for shifts.
@@ -555,7 +599,8 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
EVT VT, SDValue &Base,
- SDValue &Disp, SDValue &Index) {
+ SDValue &Disp,
+ SDValue &Index) const {
getAddressOperands(AM, VT, Base, Disp);
Index = AM.Index;
@@ -566,7 +611,7 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
SDValue Addr, SDValue &Base,
- SDValue &Disp) {
+ SDValue &Disp) const {
SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
if (!selectAddress(Addr, AM))
return false;
@@ -575,10 +620,21 @@ bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
return true;
}
+bool SystemZDAGToDAGISel::selectMVIAddr(SystemZAddressingMode::DispRange DR,
+ SDValue Addr, SDValue &Base,
+ SDValue &Disp) const {
+ SystemZAddressingMode AM(SystemZAddressingMode::FormBDXNormal, DR);
+ if (!selectAddress(Addr, AM) || AM.Index.getNode())
+ return false;
+
+ getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+ return true;
+}
+
bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
SystemZAddressingMode::DispRange DR,
SDValue Addr, SDValue &Base,
- SDValue &Disp, SDValue &Index) {
+ SDValue &Disp, SDValue &Index) const {
SystemZAddressingMode AM(Form, DR);
if (!selectAddress(Addr, AM))
return false;
@@ -588,7 +644,7 @@ bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
}
bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
- uint64_t InsertMask) {
+ uint64_t InsertMask) const {
// We're only interested in cases where the insertion is into some operand
// of Op, rather than into Op itself. The only useful case is an AND.
if (Op.getOpcode() != ISD::AND)
@@ -619,7 +675,8 @@ bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
return true;
}
-bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) {
+bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG,
+ uint64_t Mask) const {
const SystemZInstrInfo *TII = getInstrInfo();
if (RxSBG.Rotate != 0)
Mask = (Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate));
@@ -631,26 +688,15 @@ bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) {
return false;
}
-// RxSBG.Input is a shift of Count bits in the direction given by IsLeft.
-// Return true if the result depends on the signs or zeros that are
-// shifted in.
-static bool shiftedInBitsMatter(RxSBGOperands &RxSBG, uint64_t Count,
- bool IsLeft) {
- // Work out which bits of the shift result are zeros or sign copies.
- uint64_t ShiftedIn = allOnes(Count);
- if (!IsLeft)
- ShiftedIn <<= RxSBG.BitSize - Count;
-
- // Rotate that mask in the same way as RxSBG.Input is rotated.
+// Return true if any bits of (RxSBG.Input & Mask) are significant.
+static bool maskMatters(RxSBGOperands &RxSBG, uint64_t Mask) {
+ // Rotate the mask in the same way as RxSBG.Input is rotated.
if (RxSBG.Rotate != 0)
- ShiftedIn = ((ShiftedIn << RxSBG.Rotate) |
- (ShiftedIn >> (64 - RxSBG.Rotate)));
-
- // Fail if any of the zero or sign bits are used.
- return (ShiftedIn & RxSBG.Mask) != 0;
+ Mask = ((Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate)));
+ return (Mask & RxSBG.Mask) != 0;
}
-bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
+bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
SDValue N = RxSBG.Input;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -706,7 +752,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
case ISD::ROTL: {
// Any 64-bit rotate left can be merged into the RxSBG.
- if (RxSBG.BitSize != 64)
+ if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
return false;
ConstantSDNode *CountNode
= dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
@@ -718,6 +764,19 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
return true;
}
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND: {
+ // Check that the extension bits are don't-care (i.e. are masked out
+ // by the final mask).
+ unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
+ if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
+ return false;
+
+ RxSBG.Input = N.getOperand(0);
+ return true;
+ }
+
case ISD::SHL: {
ConstantSDNode *CountNode =
dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
@@ -725,17 +784,18 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
return false;
uint64_t Count = CountNode->getZExtValue();
- if (Count < 1 || Count >= RxSBG.BitSize)
+ unsigned BitSize = N.getValueType().getSizeInBits();
+ if (Count < 1 || Count >= BitSize)
return false;
if (RxSBG.Opcode == SystemZ::RNSBG) {
// Treat (shl X, count) as (rotl X, size-count) as long as the bottom
// count bits from RxSBG.Input are ignored.
- if (shiftedInBitsMatter(RxSBG, Count, true))
+ if (maskMatters(RxSBG, allOnes(Count)))
return false;
} else {
// Treat (shl X, count) as (and (rotl X, count), ~0<<count).
- if (!refineRxSBGMask(RxSBG, allOnes(RxSBG.BitSize - Count) << Count))
+ if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count) << Count))
return false;
}
@@ -752,18 +812,19 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
return false;
uint64_t Count = CountNode->getZExtValue();
- if (Count < 1 || Count >= RxSBG.BitSize)
+ unsigned BitSize = N.getValueType().getSizeInBits();
+ if (Count < 1 || Count >= BitSize)
return false;
if (RxSBG.Opcode == SystemZ::RNSBG || Opcode == ISD::SRA) {
// Treat (srl|sra X, count) as (rotl X, size-count) as long as the top
// count bits from RxSBG.Input are ignored.
- if (shiftedInBitsMatter(RxSBG, Count, false))
+ if (maskMatters(RxSBG, allOnes(Count) << (BitSize - Count)))
return false;
} else {
// Treat (srl X, count), mask) as (and (rotl X, size-count), ~0>>count),
// which is similar to SLL above.
- if (!refineRxSBGMask(RxSBG, allOnes(RxSBG.BitSize - Count)))
+ if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count)))
return false;
}
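The maskMatters test and the SHL/SRL/SRA cases above boil down to a small amount of mask arithmetic: build a mask of the bit positions that are shifted out, rotate it the same way as the input, and check whether the final mask still selects any of them. A rough standalone reimplementation, with hypothetical helper names, is sketched below for illustration only.

    // Sketch of the "which shifted-out bits matter" test used above: a mask of
    // the shifted-out bit positions is rotated the same way as RxSBG.Input and
    // then intersected with the mask of bits that are still significant.
    #include <cassert>
    #include <cstdint>

    static uint64_t allOnes64(unsigned count) {
      return count == 0 ? 0 : (~uint64_t(0) >> (64 - count));
    }

    static bool maskMattersSketch(uint64_t significantMask, unsigned rotate,
                                  uint64_t mask) {
      if (rotate != 0)
        mask = (mask << rotate) | (mask >> (64 - rotate));
      return (mask & significantMask) != 0;
    }

    int main() {
      unsigned bitSize = 64, count = 8, rotate = 0;
      // For (shl X, 8) the bottom 8 bits of the input are shifted out,
      // so they only matter if the final mask still selects them.
      uint64_t shlDontCare = allOnes64(count);                      // 0x00000000000000ff
      // For (srl X, 8) it is the top 8 bits that are shifted out.
      uint64_t srlDontCare = allOnes64(count) << (bitSize - count); // 0xff00000000000000
      uint64_t finalMask = 0x0000ffffffff0000ULL; // only bits 16..47 are used
      assert(!maskMattersSketch(finalMask, rotate, shlDontCare));
      assert(!maskMattersSketch(finalMask, rotate, srlDontCare));
      return 0;
    }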
@@ -776,24 +837,17 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) {
}
}
-SDValue SystemZDAGToDAGISel::getUNDEF64(SDLoc DL) {
- SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64);
+SDValue SystemZDAGToDAGISel::getUNDEF(SDLoc DL, EVT VT) const {
+ SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
return SDValue(N, 0);
}
-SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) {
- if (N.getValueType() == MVT::i32 && VT == MVT::i64) {
- SDValue Index = CurDAG->getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
- SDNode *Insert = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG,
- DL, VT, getUNDEF64(DL), N, Index);
- return SDValue(Insert, 0);
- }
- if (N.getValueType() == MVT::i64 && VT == MVT::i32) {
- SDValue Index = CurDAG->getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
- SDNode *Extract = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, VT, N, Index);
- return SDValue(Extract, 0);
- }
+SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) const {
+ if (N.getValueType() == MVT::i32 && VT == MVT::i64)
+ return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
+ DL, VT, getUNDEF(DL, MVT::i64), N);
+ if (N.getValueType() == MVT::i64 && VT == MVT::i32)
+ return CurDAG->getTargetExtractSubreg(SystemZ::subreg_l32, DL, VT, N);
assert(N.getValueType() == VT && "Unexpected value types");
return N;
}
@@ -803,7 +857,8 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
unsigned Count = 0;
while (expandRxSBG(RISBG))
- Count += 1;
+ if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND)
+ Count += 1;
if (Count == 0)
return 0;
if (Count == 1) {
@@ -831,14 +886,22 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
}
}
+ unsigned Opcode = SystemZ::RISBG;
+ EVT OpcodeVT = MVT::i64;
+ if (VT == MVT::i32 && Subtarget.hasHighWord()) {
+ Opcode = SystemZ::RISBMux;
+ OpcodeVT = MVT::i32;
+ RISBG.Start &= 31;
+ RISBG.End &= 31;
+ }
SDValue Ops[5] = {
- getUNDEF64(SDLoc(N)),
- convertTo(SDLoc(N), MVT::i64, RISBG.Input),
+ getUNDEF(SDLoc(N), OpcodeVT),
+ convertTo(SDLoc(N), OpcodeVT, RISBG.Input),
CurDAG->getTargetConstant(RISBG.Start, MVT::i32),
CurDAG->getTargetConstant(RISBG.End | 128, MVT::i32),
CurDAG->getTargetConstant(RISBG.Rotate, MVT::i32)
};
- N = CurDAG->getMachineNode(SystemZ::RISBG, SDLoc(N), MVT::i64, Ops);
+ N = CurDAG->getMachineNode(Opcode, SDLoc(N), OpcodeVT, Ops);
return convertTo(SDLoc(N), VT, SDValue(N, 0)).getNode();
}
@@ -852,7 +915,8 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
unsigned Count[] = { 0, 0 };
for (unsigned I = 0; I < 2; ++I)
while (expandRxSBG(RxSBG[I]))
- Count[I] += 1;
+ if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND)
+ Count[I] += 1;
// Do nothing if neither operand is suitable.
if (Count[0] == 0 && Count[1] == 0)
@@ -900,49 +964,64 @@ SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
return Or.getNode();
}
-// N is a (store (load ...), ...) pattern. Return true if it can use MVC.
-bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
- StoreSDNode *Store = cast<StoreSDNode>(N);
- LoadSDNode *Load = cast<LoadSDNode>(Store->getValue().getNode());
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+ LoadSDNode *Load) const {
+ // Check that the two memory operands have the same size.
+ if (Load->getMemoryVT() != Store->getMemoryVT())
+ return false;
- // MVC is logically a bytewise copy, so can't be used for volatile accesses.
+ // Volatility stops an access from being decomposed.
if (Load->isVolatile() || Store->isVolatile())
return false;
- // Prefer not to use MVC if either address can use ... RELATIVE LONG
- // instructions.
- assert(Load->getMemoryVT() == Store->getMemoryVT() &&
- "Should already have checked that the types match");
- uint64_t Size = Load->getMemoryVT().getStoreSize();
- if (Size > 1 && Size <= 8) {
- // Prefer LHRL, LRL and LGRL.
- if (Load->getBasePtr().getOpcode() == SystemZISD::PCREL_WRAPPER)
- return false;
- // Prefer STHRL, STRL and STGRL.
- if (Store->getBasePtr().getOpcode() == SystemZISD::PCREL_WRAPPER)
- return false;
- }
-
// There's no chance of overlap if the load is invariant.
if (Load->isInvariant())
return true;
- // If both operands are aligned, they must be equal or not overlap.
- if (Load->getAlignment() >= Size && Store->getAlignment() >= Size)
- return true;
-
// Otherwise we need to check whether there's an alias.
const Value *V1 = Load->getSrcValue();
const Value *V2 = Store->getSrcValue();
if (!V1 || !V2)
return false;
+ // Reject equality.
+ uint64_t Size = Load->getMemoryVT().getStoreSize();
int64_t End1 = Load->getSrcValueOffset() + Size;
int64_t End2 = Store->getSrcValueOffset() + Size;
+ if (V1 == V2 && End1 == End2)
+ return false;
+
return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getTBAAInfo()),
AliasAnalysis::Location(V2, End2, Store->getTBAAInfo()));
}
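The property canUseBlockOperation is conservatively approximating is interval overlap between the source and destination of the block operation: partial overlap gives wrong results and exact overlap is avoided because it may be handled by millicode. The toy model below works on plain addresses, purely for illustration; the real code has to reason through AliasAnalysis and source-value offsets instead.

    // Toy model of the overlap classification behind canUseBlockOperation.
    // Block (SS-format) operations are only used when the two byte ranges
    // are provably disjoint.
    #include <cassert>
    #include <cstdint>

    enum class Overlap { Disjoint, Full, Partial };

    static Overlap classify(uint64_t src, uint64_t dst, uint64_t size) {
      if (src == dst)
        return Overlap::Full;
      if (src + size <= dst || dst + size <= src)
        return Overlap::Disjoint;
      return Overlap::Partial;
    }

    int main() {
      assert(classify(0x1000, 0x2000, 8) == Overlap::Disjoint); // block op OK
      assert(classify(0x1000, 0x1000, 8) == Overlap::Full);     // rejected
      assert(classify(0x1000, 0x1004, 8) == Overlap::Partial);  // rejected
      return 0;
    }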
+bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
+ StoreSDNode *Store = cast<StoreSDNode>(N);
+ LoadSDNode *Load = cast<LoadSDNode>(Store->getValue());
+
+ // Prefer not to use MVC if either address can use ... RELATIVE LONG
+ // instructions.
+ uint64_t Size = Load->getMemoryVT().getStoreSize();
+ if (Size > 1 && Size <= 8) {
+ // Prefer LHRL, LRL and LGRL.
+ if (SystemZISD::isPCREL(Load->getBasePtr().getOpcode()))
+ return false;
+ // Prefer STHRL, STRL and STGRL.
+ if (SystemZISD::isPCREL(Store->getBasePtr().getOpcode()))
+ return false;
+ }
+
+ return canUseBlockOperation(Store, Load);
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
+ unsigned I) const {
+ StoreSDNode *StoreA = cast<StoreSDNode>(N);
+ LoadSDNode *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
+ LoadSDNode *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
+ return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+}
+
SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
@@ -950,6 +1029,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+ Node->setNodeId(-1);
return 0;
}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 6acdcd4..f6e1853 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -23,8 +23,23 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include <cctype>
+
using namespace llvm;
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+ IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+ : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+ int64_t XORValue;
+ int64_t AddValue;
+ unsigned Bit;
+};
+}
+
// Classify VT as either 32 or 64 bit.
static bool is32Bit(EVT VT) {
switch (VT.getSimpleVT().SimpleTy) {
@@ -51,7 +66,10 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
MVT PtrVT = getPointerTy();
// Set up the register classes.
- addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
+ if (Subtarget.hasHighWord())
+ addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
+ else
+ addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
@@ -83,8 +101,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
++I) {
MVT VT = MVT::SimpleValueType(I);
if (isTypeLegal(VT)) {
- // Expand SETCC(X, Y, COND) into SELECT_CC(X, Y, 1, 0, COND).
- setOperationAction(ISD::SETCC, VT, Expand);
+ // Lower SET_CC into an IPM-based sequence.
+ setOperationAction(ISD::SETCC, VT, Custom);
// Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
setOperationAction(ISD::SELECT, VT, Expand);
@@ -128,9 +146,11 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
- // Use *MUL_LOHI where possible and a wider multiplication otherwise.
+ // Use *MUL_LOHI where possible instead of MULH*.
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+ setOperationAction(ISD::UMUL_LOHI, VT, Custom);
// We have instructions for signed but not unsigned FP conversion.
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
@@ -165,14 +185,6 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
// Give LowerOperation the chance to replace 64-bit ORs with subregs.
setOperationAction(ISD::OR, MVT::i64, Custom);
- // The architecture has 32-bit SMUL_LOHI and UMUL_LOHI (MR and MLR),
- // but they aren't really worth using. There is no 64-bit SMUL_LOHI,
- // but there is a 64-bit UMUL_LOHI: MLGR.
- setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
-
// FIXME: Can we support these natively?
setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
@@ -200,6 +212,9 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
+ // Handle prefetches with PFD or PFDRL.
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
// Handle floating-point types.
for (unsigned I = MVT::FIRST_FP_VALUETYPE;
I <= MVT::LAST_FP_VALUETYPE;
@@ -209,6 +224,15 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
// We can use FI for FRINT.
setOperationAction(ISD::FRINT, VT, Legal);
+ // We can use the extended form of FI for other rounding operations.
+ if (Subtarget.hasFPExtension()) {
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FROUND, VT, Legal);
+ }
+
// No special instructions for these.
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
@@ -255,8 +279,13 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
MaxStoresPerMemsetOptSize = 0;
}
-bool
-SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+ if (!VT.isVector())
+ return MVT::i32;
+ return VT.changeVectorElementTypeToInteger();
+}
+
+bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
if (!VT.isSimple())
@@ -305,6 +334,22 @@ bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
return AM.Scale == 0 || AM.Scale == 1;
}
+bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
+ if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
+ return false;
+ unsigned FromBits = FromType->getPrimitiveSizeInBits();
+ unsigned ToBits = ToType->getPrimitiveSizeInBits();
+ return FromBits > ToBits;
+}
+
+bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
+ if (!FromVT.isInteger() || !ToVT.isInteger())
+ return false;
+ unsigned FromBits = FromVT.getSizeInBits();
+ unsigned ToBits = ToVT.getSizeInBits();
+ return FromBits > ToBits;
+}
+
//===----------------------------------------------------------------------===//
// Inline asm support
//===----------------------------------------------------------------------===//
@@ -316,6 +361,7 @@ SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
case 'a': // Address register
case 'd': // Data register (equivalent to 'r')
case 'f': // Floating-point register
+ case 'h': // High-part register
case 'r': // General-purpose register
return C_RegisterClass;
@@ -358,6 +404,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
case 'a': // Address register
case 'd': // Data register (equivalent to 'r')
+ case 'h': // High-part register
case 'r': // General-purpose register
if (CallOperandVal->getType()->isIntegerTy())
weight = CW_Register;
@@ -438,6 +485,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
+ case 'h': // High-part register (an LLVM extension)
+ return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
+
case 'f': // Floating-point register
if (VT == MVT::f64)
return std::make_pair(0U, &SystemZ::FP64BitRegClass);
@@ -527,6 +577,17 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
#include "SystemZGenCallingConv.inc"
+bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
+ Type *ToType) const {
+ return isTruncateFree(FromType, ToType);
+}
+
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+ if (!CI->isTailCall())
+ return false;
+ return true;
+}
+
// Value is a value that has been passed to us in the location described by VA
// (and so has type VA.getLocVT()). Convert Value to VA.getValVT(), chaining
// any loads onto Chain.
@@ -689,6 +750,23 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
return Chain;
}
+static bool canUseSiblingCall(CCState ArgCCInfo,
+ SmallVectorImpl<CCValAssign> &ArgLocs) {
+ // Punt if there are any indirect or stack arguments, or if the call
+ // needs the call-saved argument register R6.
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ CCValAssign &VA = ArgLocs[I];
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ return false;
+ if (!VA.isRegLoc())
+ return false;
+ unsigned Reg = VA.getLocReg();
+ if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
+ return false;
+ }
+ return true;
+}
+
SDValue
SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -699,26 +777,29 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
+ bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
EVT PtrVT = getPointerTy();
- // SystemZ target does not yet support tail call optimization.
- isTailCall = false;
-
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+ // We don't support GuaranteedTailCallOpt, only automatically-detected
+ // sibling calls.
+ if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs))
+ IsTailCall = false;
+
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = ArgCCInfo.getNextStackOffset();
// Mark the start of the call.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
- DL);
+ if (!IsTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
+ DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -767,22 +848,27 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Build a sequence of copy-to-reg nodes, chained and glued together.
- SDValue Glue;
- for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
- Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
- RegsToPass[I].second, Glue);
- Glue = Chain.getValue(1);
- }
-
// Accept direct calls by converting symbolic call addresses to the
- // associated Target* opcodes.
+ // associated Target* opcodes. Force %r1 to be used for indirect
+ // tail calls.
+ SDValue Glue;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
} else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+ } else if (IsTailCall) {
+ Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
+ Glue = Chain.getValue(1);
+ Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
+ }
+
+ // Build a sequence of copy-to-reg nodes, chained and glued together.
+ for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+ RegsToPass[I].second, Glue);
+ Glue = Chain.getValue(1);
}
// The first call operand is the chain and the second is the target address.
@@ -802,6 +888,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Emit the call.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ if (IsTailCall)
+ return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, &Ops[0], Ops.size());
Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
Glue = Chain.getValue(1);
@@ -910,6 +998,73 @@ static unsigned CCMaskForCondCode(ISD::CondCode CC) {
#undef CONV
}
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+ // Deal with cases where the result can be taken directly from a bit
+ // of the IPM result.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+ // Deal with cases where we can add a value to force the sign bit
+ // to contain the right value. Putting the bit in 31 means we can
+ // use SRL rather than RISBG(L), and also makes it easier to get a
+ // 0/-1 value, so it has priority over the other tests below.
+ //
+ // These sequences rely on the fact that the upper two bits of the
+ // IPM result are zero.
+ uint64_t TopBit = uint64_t(1) << 31;
+ if (CCMask == (CCValid & SystemZ::CCMASK_0))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+ return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2)))
+ return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_3))
+ return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ // Next try inverting the value and testing a bit. 0/1 could be
+ // handled this way too, but we dealt with that case above.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+ return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+ // Handle cases where adding a value forces a non-sign bit to contain
+ // the right value.
+ if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+ return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+ return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+ // The remaining cases are 1, 2, 0/1/3 and 0/2/3. All of these can be
+ // done by inverting the low CC bit and applying one of the
+ // sign-based extractions above.
+ if (CCMask == (CCValid & SystemZ::CCMASK_1))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & SystemZ::CCMASK_2))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_1
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+ if (CCMask == (CCValid & (SystemZ::CCMASK_0
+ | SystemZ::CCMASK_2
+ | SystemZ::CCMASK_3)))
+ return IPMConversion(1 << SystemZ::IPM_CC,
+ TopBit - (1 << SystemZ::IPM_CC), 31);
+
+ llvm_unreachable("Unexpected CC combination");
+}
+
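One way to see how the (((X ^ XORValue) + AddValue) >> Bit) recipe works is to trace the CC == 0 case returned above, IPMConversion(0, -(1 << SystemZ::IPM_CC), 31). The sketch below assumes IPM_CC is 28 and that the upper two bits of the IPM result are zero, as the surrounding comments state; it is illustrative only.

    // Worked example of the IPM conversion for the "CC == 0" test above.
    // Only CC == 0 makes the 32-bit addition wrap below zero, so the
    // logical shift by 31 extracts exactly the desired 0/1 result.
    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned IPM_CC = 28;         // assumed bit position of CC
      for (uint32_t cc = 0; cc < 4; ++cc) {
        uint32_t ipm = cc << IPM_CC;      // program-mask bits ignored here
        uint32_t x = ipm ^ 0;             // XORValue == 0
        uint32_t sum = x + uint32_t(-(1 << IPM_CC)); // AddValue == -(1 << 28)
        uint32_t result = sum >> 31;      // Bit == 31 (logical shift)
        assert(result == (cc == 0 ? 1u : 0u));
      }
      return 0;
    }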
// If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
// can be converted to a comparison against zero, adjust the operands
// as necessary.
@@ -1009,74 +1164,309 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
CmpOp1 = DAG.getConstant(Value, MVT::i32);
}
-// Return true if a comparison described by CCMask, CmpOp0 and CmpOp1
-// is an equality comparison that is better implemented using unsigned
-// rather than signed comparison instructions.
-static bool preferUnsignedComparison(SelectionDAG &DAG, SDValue CmpOp0,
- SDValue CmpOp1, unsigned CCMask) {
- // The test must be for equality or inequality.
- if (CCMask != SystemZ::CCMASK_CMP_EQ && CCMask != SystemZ::CCMASK_CMP_NE)
+// Return true if Op is either an unextended load, or a load suitable
+// for integer register-memory comparisons of type ICmpType.
+static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
+ LoadSDNode *Load = dyn_cast<LoadSDNode>(Op.getNode());
+ if (Load) {
+ // There are no instructions to compare a register with a memory byte.
+ if (Load->getMemoryVT() == MVT::i8)
+ return false;
+ // Otherwise decide on extension type.
+ switch (Load->getExtensionType()) {
+ case ISD::NON_EXTLOAD:
+ return true;
+ case ISD::SEXTLOAD:
+ return ICmpType != SystemZICMP::UnsignedOnly;
+ case ISD::ZEXTLOAD:
+ return ICmpType != SystemZICMP::SignedOnly;
+ default:
+ break;
+ }
+ }
+ return false;
+}
+
+// Return true if it is better to swap comparison operands Op0 and Op1.
+// ICmpType is the type of an integer comparison.
+static bool shouldSwapCmpOperands(SDValue Op0, SDValue Op1,
+ unsigned ICmpType) {
+ // Leave f128 comparisons alone, since they have no memory forms.
+ if (Op0.getValueType() == MVT::f128)
return false;
- if (CmpOp1.getOpcode() == ISD::Constant) {
- uint64_t Value = cast<ConstantSDNode>(CmpOp1)->getSExtValue();
+ // Always keep a floating-point constant second, since comparisons with
+ // zero can use LOAD TEST and comparisons with other constants make a
+ // natural memory operand.
+ if (isa<ConstantFPSDNode>(Op1))
+ return false;
- // If we're comparing with memory, prefer unsigned comparisons for
- // values that are in the unsigned 16-bit range but not the signed
- // 16-bit range. We want to use CLFHSI and CLGHSI.
- if (CmpOp0.hasOneUse() &&
- ISD::isNormalLoad(CmpOp0.getNode()) &&
- (Value >= 32768 && Value < 65536))
- return true;
+ // Never swap comparisons with zero since there are many ways to optimize
+ // those later.
+ ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+ if (COp1 && COp1->getZExtValue() == 0)
+ return false;
- // Use unsigned comparisons for values that are in the CLGFI range
- // but not in the CGFI range.
- if (CmpOp0.getValueType() == MVT::i64 && (Value >> 31) == 1)
+ // Look for cases where Op0 is a single-use load and Op1 isn't.
+ // In that case we generally prefer the memory to be second.
+ if ((isNaturalMemoryOperand(Op0, ICmpType) && Op0.hasOneUse()) &&
+ !(isNaturalMemoryOperand(Op1, ICmpType) && Op1.hasOneUse())) {
+ // The only exceptions are when the second operand is a constant and
+ // we can use things like CHHSI.
+ if (!COp1)
return true;
+ // The unsigned memory-immediate instructions can handle 16-bit
+ // unsigned integers.
+ if (ICmpType != SystemZICMP::SignedOnly &&
+ isUInt<16>(COp1->getZExtValue()))
+ return false;
+ // The signed memory-immediate instructions can handle 16-bit
+ // signed integers.
+ if (ICmpType != SystemZICMP::UnsignedOnly &&
+ isInt<16>(COp1->getSExtValue()))
+ return false;
+ return true;
+ }
+ return false;
+}
+
+// Return true if shift operation N has an in-range constant shift value.
+// Store it in ShiftVal if so.
+static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
+ ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!Shift)
+ return false;
+ uint64_t Amount = Shift->getZExtValue();
+ if (Amount >= N.getValueType().getSizeInBits())
return false;
+
+ ShiftVal = Amount;
+ return true;
+}
+
+// Check whether an AND with Mask is suitable for a TEST UNDER MASK
+// instruction and whether the CC value is descriptive enough to handle
+// a comparison of type Opcode between the AND result and CmpVal.
+// CCMask says which comparison result is being tested and BitSize is
+// the number of bits in the operands. If TEST UNDER MASK can be used,
+// return the corresponding CC mask, otherwise return 0.
+static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
+ uint64_t Mask, uint64_t CmpVal,
+ unsigned ICmpType) {
+ assert(Mask != 0 && "ANDs with zero should have been removed by now");
+
+ // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
+ if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
+ !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
+ return 0;
+
+ // Work out the masks for the lowest and highest bits.
+ unsigned HighShift = 63 - countLeadingZeros(Mask);
+ uint64_t High = uint64_t(1) << HighShift;
+ uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
+
+ // Signed ordered comparisons are effectively unsigned if the sign
+ // bit is dropped.
+ bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
+
+ // Check for equality comparisons with 0, or the equivalent.
+ if (CmpVal == 0) {
+ if (CCMask == SystemZ::CCMASK_CMP_EQ)
+ return SystemZ::CCMASK_TM_ALL_0;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ return SystemZ::CCMASK_TM_SOME_1;
+ }
+ if (EffectivelyUnsigned && CmpVal <= Low) {
+ if (CCMask == SystemZ::CCMASK_CMP_LT)
+ return SystemZ::CCMASK_TM_ALL_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GE)
+ return SystemZ::CCMASK_TM_SOME_1;
+ }
+ if (EffectivelyUnsigned && CmpVal < Low) {
+ if (CCMask == SystemZ::CCMASK_CMP_LE)
+ return SystemZ::CCMASK_TM_ALL_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GT)
+ return SystemZ::CCMASK_TM_SOME_1;
}
- // Prefer CL for zero-extended loads.
- if (CmpOp1.getOpcode() == ISD::ZERO_EXTEND ||
- ISD::isZEXTLoad(CmpOp1.getNode()))
- return true;
+ // Check for equality comparisons with the mask, or the equivalent.
+ if (CmpVal == Mask) {
+ if (CCMask == SystemZ::CCMASK_CMP_EQ)
+ return SystemZ::CCMASK_TM_ALL_1;
+ if (CCMask == SystemZ::CCMASK_CMP_NE)
+ return SystemZ::CCMASK_TM_SOME_0;
+ }
+ if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
+ if (CCMask == SystemZ::CCMASK_CMP_GT)
+ return SystemZ::CCMASK_TM_ALL_1;
+ if (CCMask == SystemZ::CCMASK_CMP_LE)
+ return SystemZ::CCMASK_TM_SOME_0;
+ }
+ if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
+ if (CCMask == SystemZ::CCMASK_CMP_GE)
+ return SystemZ::CCMASK_TM_ALL_1;
+ if (CCMask == SystemZ::CCMASK_CMP_LT)
+ return SystemZ::CCMASK_TM_SOME_0;
+ }
- // ...and for "in-register" zero extensions.
- if (CmpOp1.getOpcode() == ISD::AND && CmpOp1.getValueType() == MVT::i64) {
- SDValue Mask = CmpOp1.getOperand(1);
- if (Mask.getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Mask)->getZExtValue() == 0xffffffff)
- return true;
+ // Check for ordered comparisons with the top bit.
+ if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
+ if (CCMask == SystemZ::CCMASK_CMP_LE)
+ return SystemZ::CCMASK_TM_MSB_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GT)
+ return SystemZ::CCMASK_TM_MSB_1;
+ }
+ if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
+ if (CCMask == SystemZ::CCMASK_CMP_LT)
+ return SystemZ::CCMASK_TM_MSB_0;
+ if (CCMask == SystemZ::CCMASK_CMP_GE)
+ return SystemZ::CCMASK_TM_MSB_1;
}
- return false;
+ // If there are just two bits, we can do equality checks for Low and High
+ // as well.
+ if (Mask == Low + High) {
+ if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
+ return SystemZ::CCMASK_TM_MIXED_MSB_0;
+ if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
+ return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
+ if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
+ return SystemZ::CCMASK_TM_MIXED_MSB_1;
+ if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
+ return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
+ }
+
+ // Looks like we've exhausted our options.
+ return 0;
+}
+
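The Low and High masks computed at the top of getTestUnderMaskCond are the lowest and highest set bits of the AND mask, and most of the comparisons above are rephrased in terms of them. A small standalone sketch of that computation, using hand-rolled counting helpers in place of LLVM's (and assuming a nonzero mask, as the real code asserts), is shown below for one representative mask.

    // Sketch of the Low/High bit masks used by the TEST UNDER MASK logic.
    #include <cassert>
    #include <cstdint>

    static unsigned countTrailingZeros64(uint64_t v) { // assumes v != 0
      unsigned n = 0;
      while (!(v & 1)) { v >>= 1; ++n; }
      return n;
    }

    static unsigned countLeadingZeros64(uint64_t v) {  // assumes v != 0
      unsigned n = 0;
      while (!(v & (uint64_t(1) << 63))) { v <<= 1; ++n; }
      return n;
    }

    int main() {
      uint64_t mask = 0x000000000000ff00ULL; // a contiguous mask that fits TMLL
      uint64_t low  = uint64_t(1) << countTrailingZeros64(mask);
      uint64_t high = uint64_t(1) << (63 - countLeadingZeros64(mask));
      assert(low == 0x100 && high == 0x8000);
      // With this mask, "(X & mask) == 0" maps to the "all masked bits 0"
      // CC mask, and an unsigned "(X & mask) < low" test is equivalent to it,
      // which is the kind of equivalence the function exploits.
      return 0;
    }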
+// See whether the comparison (Opcode CmpOp0, CmpOp1, ICmpType) can be
+// implemented as a TEST UNDER MASK instruction when the condition being
+// tested is as described by CCValid and CCMask. Update the arguments
+// with the TM version if so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, unsigned &Opcode,
+ SDValue &CmpOp0, SDValue &CmpOp1,
+ unsigned &CCValid, unsigned &CCMask,
+ unsigned &ICmpType) {
+ // Check that we have a comparison with a constant.
+ ConstantSDNode *ConstCmpOp1 = dyn_cast<ConstantSDNode>(CmpOp1);
+ if (!ConstCmpOp1)
+ return;
+ uint64_t CmpVal = ConstCmpOp1->getZExtValue();
+
+ // Check whether the nonconstant input is an AND with a constant mask.
+ if (CmpOp0.getOpcode() != ISD::AND)
+ return;
+ SDValue AndOp0 = CmpOp0.getOperand(0);
+ SDValue AndOp1 = CmpOp0.getOperand(1);
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(AndOp1.getNode());
+ if (!Mask)
+ return;
+ uint64_t MaskVal = Mask->getZExtValue();
+
+ // Check whether the combination of mask, comparison value and comparison
+ // type are suitable.
+ unsigned BitSize = CmpOp0.getValueType().getSizeInBits();
+ unsigned NewCCMask, ShiftVal;
+ if (ICmpType != SystemZICMP::SignedOnly &&
+ AndOp0.getOpcode() == ISD::SHL &&
+ isSimpleShift(AndOp0, ShiftVal) &&
+ (NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal >> ShiftVal,
+ CmpVal >> ShiftVal,
+ SystemZICMP::Any))) {
+ AndOp0 = AndOp0.getOperand(0);
+ AndOp1 = DAG.getConstant(MaskVal >> ShiftVal, AndOp0.getValueType());
+ } else if (ICmpType != SystemZICMP::SignedOnly &&
+ AndOp0.getOpcode() == ISD::SRL &&
+ isSimpleShift(AndOp0, ShiftVal) &&
+ (NewCCMask = getTestUnderMaskCond(BitSize, CCMask,
+ MaskVal << ShiftVal,
+ CmpVal << ShiftVal,
+ SystemZICMP::UnsignedOnly))) {
+ AndOp0 = AndOp0.getOperand(0);
+ AndOp1 = DAG.getConstant(MaskVal << ShiftVal, AndOp0.getValueType());
+ } else {
+ NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal, CmpVal,
+ ICmpType);
+ if (!NewCCMask)
+ return;
+ }
+
+ // Go ahead and make the change.
+ Opcode = SystemZISD::TM;
+ CmpOp0 = AndOp0;
+ CmpOp1 = AndOp1;
+ ICmpType = (bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
+ bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
+ CCValid = SystemZ::CCMASK_TM;
+ CCMask = NewCCMask;
}
// Return a target node that compares CmpOp0 with CmpOp1 and stores a
// 2-bit result in CC. Set CCValid to the CCMASK_* of all possible
// 2-bit results and CCMask to the subset of those results that are
// associated with Cond.
-static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
+static SDValue emitCmp(const SystemZTargetMachine &TM, SelectionDAG &DAG,
+ SDLoc DL, SDValue CmpOp0, SDValue CmpOp1,
ISD::CondCode Cond, unsigned &CCValid,
unsigned &CCMask) {
bool IsUnsigned = false;
CCMask = CCMaskForCondCode(Cond);
- if (CmpOp0.getValueType().isFloatingPoint())
+ unsigned Opcode, ICmpType = 0;
+ if (CmpOp0.getValueType().isFloatingPoint()) {
CCValid = SystemZ::CCMASK_FCMP;
- else {
+ Opcode = SystemZISD::FCMP;
+ } else {
IsUnsigned = CCMask & SystemZ::CCMASK_CMP_UO;
CCValid = SystemZ::CCMASK_ICMP;
CCMask &= CCValid;
adjustZeroCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
adjustSubwordCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
- if (preferUnsignedComparison(DAG, CmpOp0, CmpOp1, CCMask))
- IsUnsigned = true;
+ Opcode = SystemZISD::ICMP;
+ // Choose the type of comparison. Equality and inequality tests can
+ // use either signed or unsigned comparisons. The choice also doesn't
+ // matter if both sign bits are known to be clear. In those cases we
+ // want to give the main isel code the freedom to choose whichever
+ // form fits best.
+ if (CCMask == SystemZ::CCMASK_CMP_EQ ||
+ CCMask == SystemZ::CCMASK_CMP_NE ||
+ (DAG.SignBitIsZero(CmpOp0) && DAG.SignBitIsZero(CmpOp1)))
+ ICmpType = SystemZICMP::Any;
+ else if (IsUnsigned)
+ ICmpType = SystemZICMP::UnsignedOnly;
+ else
+ ICmpType = SystemZICMP::SignedOnly;
}
- SDLoc DL(CmpOp0);
- return DAG.getNode((IsUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
- DL, MVT::Glue, CmpOp0, CmpOp1);
+ if (shouldSwapCmpOperands(CmpOp0, CmpOp1, ICmpType)) {
+ std::swap(CmpOp0, CmpOp1);
+ CCMask = ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+ (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+ (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+ (CCMask & SystemZ::CCMASK_CMP_UO));
+ }
+
+ adjustForTestUnderMask(DAG, Opcode, CmpOp0, CmpOp1, CCValid, CCMask,
+ ICmpType);
+ if (Opcode == SystemZISD::ICMP || Opcode == SystemZISD::TM)
+ return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1,
+ DAG.getConstant(ICmpType, MVT::i32));
+ return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1);
+}
+
+// Implement a 32-bit *MUL_LOHI operation by extending both operands to
+// 64 bits. Extend is the extension type to use. Store the high part
+// in Hi and the low part in Lo.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
+ unsigned Extend, SDValue Op0, SDValue Op1,
+ SDValue &Hi, SDValue &Lo) {
+ Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+ Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
+ Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, MVT::i64));
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
}
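lowerMUL_LOHI32 above is the DAG form of an ordinary widening multiply: extend both 32-bit operands to 64 bits, multiply once, and return the two halves of the product. A scalar model of the unsigned flavour, for illustration only, is:

    // Scalar model of the 32-bit *MUL_LOHI expansion (ZERO_EXTEND flavour).
    #include <cassert>
    #include <cstdint>

    static void mulLoHi32(uint32_t a, uint32_t b, uint32_t &hi, uint32_t &lo) {
      uint64_t product = uint64_t(a) * uint64_t(b); // widen, multiply once
      hi = uint32_t(product >> 32);                 // high 32 bits
      lo = uint32_t(product);                       // low 32 bits
    }

    int main() {
      uint32_t hi, lo;
      mulLoHi32(0x80000000u, 4u, hi, lo);
      assert(hi == 2 && lo == 0); // 0x80000000 * 4 == 0x200000000
      return 0;
    }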
// Lower a binary operation that produces two VT results, one in each
@@ -1092,14 +1482,38 @@ static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
SDValue(In128, 0), Op1);
bool Is32Bit = is32Bit(VT);
- SDValue SubReg0 = DAG.getTargetConstant(SystemZ::even128(Is32Bit), VT);
- SDValue SubReg1 = DAG.getTargetConstant(SystemZ::odd128(Is32Bit), VT);
- SDNode *Reg0 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
- VT, Result, SubReg0);
- SDNode *Reg1 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
- VT, Result, SubReg1);
- Even = SDValue(Reg0, 0);
- Odd = SDValue(Reg1, 0);
+ Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
+ Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CmpOp0 = Op.getOperand(0);
+ SDValue CmpOp1 = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ SDLoc DL(Op);
+
+ unsigned CCValid, CCMask;
+ SDValue Glue = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+
+ IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
+ SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+ if (Conversion.XORValue)
+ Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
+ DAG.getConstant(Conversion.XORValue, MVT::i32));
+
+ if (Conversion.AddValue)
+ Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
+ DAG.getConstant(Conversion.AddValue, MVT::i32));
+
+ // The SHR/AND sequence should get optimized to an RISBG.
+ Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
+ DAG.getConstant(Conversion.Bit, MVT::i32));
+ if (Conversion.Bit != 31)
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(1, MVT::i32));
+ return Result;
}
SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -1111,7 +1525,7 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
unsigned CCValid, CCMask;
- SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+ SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
Chain, DAG.getConstant(CCValid, MVT::i32),
DAG.getConstant(CCMask, MVT::i32), Dest, Flags);
@@ -1127,7 +1541,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
SDLoc DL(Op);
unsigned CCValid, CCMask;
- SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+ SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
SmallVector<SDValue, 5> Ops;
Ops.push_back(TrueOp);
@@ -1151,18 +1565,18 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
SDValue Result;
if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
- // Make sure that the offset is aligned to a halfword. If it isn't,
- // create an "anchor" at the previous 12-bit boundary.
- // FIXME check whether there is a better way of handling this.
- if (Offset & 1) {
- Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
- Offset & ~uint64_t(0xfff));
- Offset &= 0xfff;
- } else {
- Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Offset);
+ // Assign anchors at 1<<12 byte boundaries.
+ uint64_t Anchor = Offset & ~uint64_t(0xfff);
+ Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
+ Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+
+  // The offset can be folded into the address if it is aligned to a
+  // halfword.
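+  // For example (illustrative values): an offset of 0x1002 leaves an even
+  // remainder of 2 above the 0x1000 anchor and can be folded, whereas an
+  // offset of 0x1001 leaves an odd remainder of 1 and must stay separate.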
+ Offset -= Anchor;
+ if (Offset != 0 && (Offset & 1) == 0) {
+ SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
+ Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
Offset = 0;
}
- Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
} else {
Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -1264,24 +1678,33 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
EVT InVT = In.getValueType();
EVT ResVT = Op.getValueType();
- SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
- SDValue Shift32 = DAG.getConstant(32, MVT::i64);
if (InVT == MVT::i32 && ResVT == MVT::f32) {
- SDValue In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
- SDValue Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, In64, Shift32);
- SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Shift);
- SDNode *Out = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
- MVT::f32, Out64, SubReg32);
- return SDValue(Out, 0);
+ SDValue In64;
+ if (Subtarget.hasHighWord()) {
+ SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+ MVT::i64);
+ In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+ MVT::i64, SDValue(U64, 0), In);
+ } else {
+ In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+ In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
+ DAG.getConstant(32, MVT::i64));
+ }
+ SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
+ return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
+ DL, MVT::f32, Out64);
}
if (InVT == MVT::f32 && ResVT == MVT::i32) {
SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
- SDNode *In64 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
- MVT::f64, SDValue(U64, 0), In, SubReg32);
- SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, SDValue(In64, 0));
- SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64, Shift32);
- SDValue Out = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
- return Out;
+ SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+ MVT::f64, SDValue(U64, 0), In);
+ SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
+ if (Subtarget.hasHighWord())
+ return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
+ MVT::i32, Out64);
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
+ DAG.getConstant(32, MVT::i64));
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
}
llvm_unreachable("Unexpected bitcast combination");
}
@@ -1364,18 +1787,64 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(Ops, 2, DL);
}
-SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
- assert(!is32Bit(VT) && "Only support 64-bit UMUL_LOHI");
+ SDValue Ops[2];
+ if (is32Bit(VT))
+ // Just do a normal 64-bit multiplication and extract the results.
+ // We define this so that it can be used for constant division.
+ lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+ Op.getOperand(1), Ops[1], Ops[0]);
+ else {
+ // Do a full 128-bit multiplication based on UMUL_LOHI64:
+ //
+ // (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+ //
+ // but using the fact that the upper halves are either all zeros
+ // or all ones:
+ //
+ // (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+ //
+    // and grouping the two rightmost terms together, since they are cheaper
+    // to compute than the multiplication:
+ //
+ // (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
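+    //
+    // As an illustrative check in 4-bit arithmetic: -3 * 5 = -15.
+    // Unsigned, 13 * 5 = 65 = (4 << 4) + 1, so the unsigned high half is 4.
+    // lh is all ones and rh is zero, so (lh & rl) + (ll & rh) = 5, and
+    // 4 - 5 = -1 recovers the signed high half; the low half 1 is unchanged.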
+ SDValue C63 = DAG.getConstant(63, MVT::i64);
+ SDValue LL = Op.getOperand(0);
+ SDValue RL = Op.getOperand(1);
+ SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
+ SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
+ // UMUL_LOHI64 returns the low result in the odd register and the high
+ // result in the even register. SMUL_LOHI is defined to return the
+ // low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ LL, RL, Ops[1], Ops[0]);
+ SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+ SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+ SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+ Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
+ }
+ return DAG.getMergeValues(Ops, 2, DL);
+}
- // UMUL_LOHI64 returns the low result in the odd register and the high
- // result in the even register. UMUL_LOHI is defined to return the
- // low half first, so the results are in reverse order.
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
SDValue Ops[2];
- lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
- Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+ if (is32Bit(VT))
+ // Just do a normal 64-bit multiplication and extract the results.
+ // We define this so that it can be used for constant division.
+ lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+ Op.getOperand(1), Ops[1], Ops[0]);
+ else
+ // UMUL_LOHI64 returns the low result in the odd register and the high
+ // result in the even register. UMUL_LOHI is defined to return the
+ // low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
return DAG.getMergeValues(Ops, 2, DL);
}
@@ -1464,10 +1933,10 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
// high 32 bits and just masks out low bits. We can skip it if so.
if (HighOp.getOpcode() == ISD::AND &&
HighOp.getOperand(1).getOpcode() == ISD::Constant) {
- ConstantSDNode *MaskNode = cast<ConstantSDNode>(HighOp.getOperand(1));
- uint64_t Mask = MaskNode->getZExtValue() | Masks[High];
- if ((Mask >> 32) == 0xffffffff)
- HighOp = HighOp.getOperand(0);
+ SDValue HighOp0 = HighOp.getOperand(0);
+ uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+ if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
+ HighOp = HighOp0;
}
// Take advantage of the fact that all GR32 operations only change the
@@ -1476,10 +1945,8 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
// can be folded.
SDLoc DL(Op);
SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
- SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
- SDNode *Result = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
- MVT::i64, HighOp, Low32, SubReg32);
- return SDValue(Result, 0);
+ return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
+ MVT::i64, HighOp, Low32);
}
// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. Lower the first
@@ -1618,6 +2085,26 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
SystemZ::R15D, Op.getOperand(1));
}
+SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ if (!IsData)
+ // Just preserve the chain.
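+    // (PFD prefetches data only, so instruction-prefetch hints are simply
+    // dropped.)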
+ return Op.getOperand(0);
+
+ bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
+ MemIntrinsicSDNode *Node = cast<MemIntrinsicSDNode>(Op.getNode());
+ SDValue Ops[] = {
+ Op.getOperand(0),
+ DAG.getConstant(Code, MVT::i32),
+ Op.getOperand(1)
+ };
+ return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, SDLoc(Op),
+ Node->getVTList(), Ops, array_lengthof(Ops),
+ Node->getMemoryVT(), Node->getMemOperand());
+}
+
SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -1625,6 +2112,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerBR_CC(Op, DAG);
case ISD::SELECT_CC:
return lowerSELECT_CC(Op, DAG);
+ case ISD::SETCC:
+ return lowerSETCC(Op, DAG);
case ISD::GlobalAddress:
return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
case ISD::GlobalTLSAddress:
@@ -1643,6 +2132,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerVACOPY(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return lowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::SMUL_LOHI:
+ return lowerSMUL_LOHI(Op, DAG);
case ISD::UMUL_LOHI:
return lowerUMUL_LOHI(Op, DAG);
case ISD::SDIVREM:
@@ -1679,6 +2170,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerSTACKSAVE(Op, DAG);
case ISD::STACKRESTORE:
return lowerSTACKRESTORE(Op, DAG);
+ case ISD::PREFETCH:
+ return lowerPREFETCH(Op, DAG);
default:
llvm_unreachable("Unexpected node to lower");
}
@@ -1689,9 +2182,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
OPCODE(RET_FLAG);
OPCODE(CALL);
+ OPCODE(SIBCALL);
OPCODE(PCREL_WRAPPER);
- OPCODE(CMP);
- OPCODE(UCMP);
+ OPCODE(PCREL_OFFSET);
+ OPCODE(ICMP);
+ OPCODE(FCMP);
+ OPCODE(TM);
OPCODE(BR_CCMASK);
OPCODE(SELECT_CCMASK);
OPCODE(ADJDYNALLOC);
@@ -1701,6 +2197,19 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(UDIVREM32);
OPCODE(UDIVREM64);
OPCODE(MVC);
+ OPCODE(MVC_LOOP);
+ OPCODE(NC);
+ OPCODE(NC_LOOP);
+ OPCODE(OC);
+ OPCODE(OC_LOOP);
+ OPCODE(XC);
+ OPCODE(XC_LOOP);
+ OPCODE(CLC);
+ OPCODE(CLC_LOOP);
+ OPCODE(STRCMP);
+ OPCODE(STPCPY);
+ OPCODE(SEARCH_STRING);
+ OPCODE(IPM);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
OPCODE(ATOMIC_LOADW_SUB);
@@ -1713,6 +2222,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(ATOMIC_LOADW_UMIN);
OPCODE(ATOMIC_LOADW_UMAX);
OPCODE(ATOMIC_CMP_SWAPW);
+ OPCODE(PREFETCH);
}
return NULL;
#undef OPCODE
@@ -1742,6 +2252,31 @@ static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
return NewMBB;
}
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+ NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+ NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ return NewMBB;
+}
+
+// Force base value Base into a register before MI. Return the register.
+static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+ const SystemZInstrInfo *TII) {
+ if (Base.isReg())
+ return Base.getReg();
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
+ .addOperand(Base).addImm(0).addReg(0);
+ return Reg;
+}
+
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr *MI,
@@ -1756,7 +2291,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
DebugLoc DL = MI->getDebugLoc();
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
// StartMBB:
@@ -1777,7 +2312,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
// %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
// ...
MBB = JoinMBB;
- BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg)
+ BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
.addReg(TrueReg).addMBB(StartMBB)
.addReg(FalseReg).addMBB(FalseMBB);
@@ -1824,7 +2359,7 @@ SystemZTargetLowering::emitCondStore(MachineInstr *MI,
CCMask ^= CCValid;
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
// StartMBB:
@@ -1900,7 +2435,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
// Insert a basic block for the main loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
// StartMBB:
@@ -1934,11 +2469,11 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
.addReg(RotatedOldVal).addOperand(Src2);
if (BitSize < 32)
// XILF with the upper BitSize bits set.
- BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+ BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
.addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
else if (BitSize == 32)
// XILF with every bit set.
- BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+ BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
.addReg(Tmp).addImm(~uint32_t(0));
else {
// Use LCGR and add -1 to the result, which is more compact than
@@ -2022,7 +2557,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
// Insert 3 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
@@ -2129,7 +2664,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
// Insert 2 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);
@@ -2205,8 +2740,8 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
// Emit an extension from a GR32 or GR64 to a GR128. ClearEven is true
// if the high register of the GR128 value must be cleared or false if
-// it's "don't care". SubReg is subreg_odd32 when extending a GR32
-// and subreg_odd when extending a GR64.
+// it's "don't care". SubReg is subreg_l32 when extending a GR32
+// and subreg_l64 when extending a GR64.
MachineBasicBlock *
SystemZTargetLowering::emitExt128(MachineInstr *MI,
MachineBasicBlock *MBB,
@@ -2228,7 +2763,7 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
.addImm(0);
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
- .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_high);
+ .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
In128 = NewIn128;
}
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
@@ -2239,28 +2774,237 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
}
MachineBasicBlock *
-SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const {
const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI->getDebugLoc();
- MachineOperand DestBase = MI->getOperand(0);
+ MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
uint64_t DestDisp = MI->getOperand(1).getImm();
- MachineOperand SrcBase = MI->getOperand(2);
+ MachineOperand SrcBase = earlyUseOperand(MI->getOperand(2));
uint64_t SrcDisp = MI->getOperand(3).getImm();
uint64_t Length = MI->getOperand(4).getImm();
- BuildMI(*MBB, MI, DL, TII->get(SystemZ::MVC))
- .addOperand(DestBase).addImm(DestDisp).addImm(Length)
- .addOperand(SrcBase).addImm(SrcDisp);
+ // When generating more than one CLC, all but the last will need to
+ // branch to the end when a difference is found.
+ MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+ splitBlockAfter(MI, MBB) : 0);
+
+ // Check for the loop form, in which operand 5 is the trip count.
+ if (MI->getNumExplicitOperands() > 5) {
+ bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+ uint64_t StartCountReg = MI->getOperand(5).getReg();
+ uint64_t StartSrcReg = forceReg(MI, SrcBase, TII);
+ uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg :
+ forceReg(MI, DestBase, TII));
+
+ const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+ uint64_t ThisSrcReg = MRI.createVirtualRegister(RC);
+ uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+ MRI.createVirtualRegister(RC));
+ uint64_t NextSrcReg = MRI.createVirtualRegister(RC);
+ uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+ MRI.createVirtualRegister(RC));
+
+ RC = &SystemZ::GR64BitRegClass;
+ uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+ uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
+
+ // StartMBB:
+  //   # fall through to LoopMBB
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+ // [ %NextDestReg, NextMBB ]
+ // %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+ // [ %NextSrcReg, NextMBB ]
+ // %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+ // [ %NextCountReg, NextMBB ]
+ // ( PFD 2, 768+DestDisp(%ThisDestReg) )
+ // Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+ // ( JLH EndMBB )
+ //
+ // The prefetch is used only for MVC. The JLH is used only for CLC.
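+  // The prefetch primes a destination block a few iterations ahead of MVC's
+  // stores; the branch lets CLC exit as soon as a difference is found.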
+ MBB = LoopMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+ .addReg(StartDestReg).addMBB(StartMBB)
+ .addReg(NextDestReg).addMBB(NextMBB);
+ if (!HaveSingleBase)
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+ .addReg(StartSrcReg).addMBB(StartMBB)
+ .addReg(NextSrcReg).addMBB(NextMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+ .addReg(StartCountReg).addMBB(StartMBB)
+ .addReg(NextCountReg).addMBB(NextMBB);
+ if (Opcode == SystemZ::MVC)
+ BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+ .addImm(SystemZ::PFD_WRITE)
+ .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+ BuildMI(MBB, DL, TII->get(Opcode))
+ .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+ .addReg(ThisSrcReg).addImm(SrcDisp);
+ if (EndMBB) {
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ MBB->addSuccessor(NextMBB);
+ }
+
+ // NextMBB:
+ // %NextDestReg = LA 256(%ThisDestReg)
+ // %NextSrcReg = LA 256(%ThisSrcReg)
+ // %NextCountReg = AGHI %ThisCountReg, -1
+ // CGHI %NextCountReg, 0
+ // JLH LoopMBB
+  //   # fall through to DoneMBB
+ //
+ // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+ MBB = NextMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+ .addReg(ThisDestReg).addImm(256).addReg(0);
+ if (!HaveSingleBase)
+ BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+ .addReg(ThisSrcReg).addImm(256).addReg(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+ .addReg(ThisCountReg).addImm(-1);
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(NextCountReg).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ DestBase = MachineOperand::CreateReg(NextDestReg, false);
+ SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+ Length &= 255;
+ MBB = DoneMBB;
+ }
+ // Handle any remaining bytes with straight-line code.
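+  // Each MVC/CLC-style instruction covers at most 256 bytes, so emit one
+  // instruction per remaining 256-byte chunk.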
+ while (Length > 0) {
+ uint64_t ThisLength = std::min(Length, uint64_t(256));
+ // The previous iteration might have created out-of-range displacements.
+ // Apply them using LAY if so.
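+    // (These SS-format instructions only accept 12-bit unsigned
+    // displacements, hence the isUInt<12> checks.)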
+ if (!isUInt<12>(DestDisp)) {
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(DestBase).addImm(DestDisp).addReg(0);
+ DestBase = MachineOperand::CreateReg(Reg, false);
+ DestDisp = 0;
+ }
+ if (!isUInt<12>(SrcDisp)) {
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+ SrcBase = MachineOperand::CreateReg(Reg, false);
+ SrcDisp = 0;
+ }
+ BuildMI(*MBB, MI, DL, TII->get(Opcode))
+ .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+ .addOperand(SrcBase).addImm(SrcDisp);
+ DestDisp += ThisLength;
+ SrcDisp += ThisLength;
+ Length -= ThisLength;
+ // If there's another CLC to go, branch to the end if a difference
+ // was found.
+ if (EndMBB && Length > 0) {
+ MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(EndMBB);
+ MBB->addSuccessor(EndMBB);
+ MBB->addSuccessor(NextMBB);
+ MBB = NextMBB;
+ }
+ }
+ if (EndMBB) {
+ MBB->addSuccessor(EndMBB);
+ MBB = EndMBB;
+ MBB->addLiveIn(SystemZ::CC);
+ }
MI->eraseFromParent();
return MBB;
}
+// Decompose string pseudo-instruction MI into a loop that continually performs
+// Opcode until CC != 3.
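+// (CC 3 indicates that the instruction stopped after processing a
+// CPU-determined number of bytes and must be reissued to continue.)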
+MachineBasicBlock *
+SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const {
+ const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DebugLoc DL = MI->getDebugLoc();
+
+ uint64_t End1Reg = MI->getOperand(0).getReg();
+ uint64_t Start1Reg = MI->getOperand(1).getReg();
+ uint64_t Start2Reg = MI->getOperand(2).getReg();
+ uint64_t CharReg = MI->getOperand(3).getReg();
+
+ const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
+ uint64_t This1Reg = MRI.createVirtualRegister(RC);
+ uint64_t This2Reg = MRI.createVirtualRegister(RC);
+ uint64_t End2Reg = MRI.createVirtualRegister(RC);
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+ // StartMBB:
+  //   # fall through to LoopMBB
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
+ // %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
+ // R0L = %CharReg
+ // %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
+ // JO LoopMBB
+  //   # fall through to DoneMBB
+ //
+ // The load of R0L can be hoisted by post-RA LICM.
+ MBB = LoopMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
+ .addReg(Start1Reg).addMBB(StartMBB)
+ .addReg(End1Reg).addMBB(LoopMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
+ .addReg(Start2Reg).addMBB(StartMBB)
+ .addReg(End2Reg).addMBB(LoopMBB);
+ BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
+ BuildMI(MBB, DL, TII->get(Opcode))
+ .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
+ .addReg(This1Reg).addReg(This2Reg);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ DoneMBB->addLiveIn(SystemZ::CC);
+
+ MI->eraseFromParent();
+ return DoneMBB;
+}
+
MachineBasicBlock *SystemZTargetLowering::
EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
switch (MI->getOpcode()) {
+ case SystemZ::Select32Mux:
case SystemZ::Select32:
case SystemZ::SelectF32:
case SystemZ::Select64:
@@ -2268,18 +3012,14 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
case SystemZ::SelectF128:
return emitSelect(MI, MBB);
- case SystemZ::CondStore8_32:
- return emitCondStore(MI, MBB, SystemZ::STC32, 0, false);
- case SystemZ::CondStore8_32Inv:
- return emitCondStore(MI, MBB, SystemZ::STC32, 0, true);
- case SystemZ::CondStore16_32:
- return emitCondStore(MI, MBB, SystemZ::STH32, 0, false);
- case SystemZ::CondStore16_32Inv:
- return emitCondStore(MI, MBB, SystemZ::STH32, 0, true);
- case SystemZ::CondStore32_32:
- return emitCondStore(MI, MBB, SystemZ::ST32, SystemZ::STOC32, false);
- case SystemZ::CondStore32_32Inv:
- return emitCondStore(MI, MBB, SystemZ::ST32, SystemZ::STOC32, true);
+ case SystemZ::CondStore8Mux:
+ return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
+ case SystemZ::CondStore8MuxInv:
+ return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
+ case SystemZ::CondStore16Mux:
+ return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
+ case SystemZ::CondStore16MuxInv:
+ return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
case SystemZ::CondStore8:
return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
case SystemZ::CondStore8Inv:
@@ -2306,11 +3046,11 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
case SystemZ::AEXT128_64:
- return emitExt128(MI, MBB, false, SystemZ::subreg_low);
+ return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
case SystemZ::ZEXT128_32:
- return emitExt128(MI, MBB, true, SystemZ::subreg_low32);
+ return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
case SystemZ::ZEXT128_64:
- return emitExt128(MI, MBB, true, SystemZ::subreg_low);
+ return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
case SystemZ::ATOMIC_SWAPW:
return emitAtomicLoadBinary(MI, MBB, 0, 0);
@@ -2346,98 +3086,98 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
case SystemZ::ATOMIC_LOADW_NR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
case SystemZ::ATOMIC_LOADW_NILH:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
case SystemZ::ATOMIC_LOAD_NR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
- case SystemZ::ATOMIC_LOAD_NILL32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32);
- case SystemZ::ATOMIC_LOAD_NILH32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32);
- case SystemZ::ATOMIC_LOAD_NILF32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32);
- case SystemZ::ATOMIC_LOAD_NGR:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
case SystemZ::ATOMIC_LOAD_NILL:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
case SystemZ::ATOMIC_LOAD_NILH:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64);
- case SystemZ::ATOMIC_LOAD_NIHL:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64);
- case SystemZ::ATOMIC_LOAD_NIHH:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
case SystemZ::ATOMIC_LOAD_NILF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64);
- case SystemZ::ATOMIC_LOAD_NIHF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
+ case SystemZ::ATOMIC_LOAD_NGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+ case SystemZ::ATOMIC_LOAD_NILL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
+ case SystemZ::ATOMIC_LOAD_NILH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
+ case SystemZ::ATOMIC_LOAD_NIHL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
+ case SystemZ::ATOMIC_LOAD_NIHH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
+ case SystemZ::ATOMIC_LOAD_NILF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
+ case SystemZ::ATOMIC_LOAD_NIHF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
case SystemZ::ATOMIC_LOADW_OR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
case SystemZ::ATOMIC_LOADW_OILH:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 0);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
case SystemZ::ATOMIC_LOAD_OR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
- case SystemZ::ATOMIC_LOAD_OILL32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL32, 32);
- case SystemZ::ATOMIC_LOAD_OILH32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 32);
- case SystemZ::ATOMIC_LOAD_OILF32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF32, 32);
- case SystemZ::ATOMIC_LOAD_OGR:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
case SystemZ::ATOMIC_LOAD_OILL:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 64);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
case SystemZ::ATOMIC_LOAD_OILH:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 64);
- case SystemZ::ATOMIC_LOAD_OIHL:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL, 64);
- case SystemZ::ATOMIC_LOAD_OIHH:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH, 64);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
case SystemZ::ATOMIC_LOAD_OILF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 64);
- case SystemZ::ATOMIC_LOAD_OIHF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF, 64);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
+ case SystemZ::ATOMIC_LOAD_OGR:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+ case SystemZ::ATOMIC_LOAD_OILL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
+ case SystemZ::ATOMIC_LOAD_OILH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
+ case SystemZ::ATOMIC_LOAD_OIHL64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
+ case SystemZ::ATOMIC_LOAD_OIHH64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
+ case SystemZ::ATOMIC_LOAD_OILF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
+ case SystemZ::ATOMIC_LOAD_OIHF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
case SystemZ::ATOMIC_LOADW_XR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
case SystemZ::ATOMIC_LOADW_XILF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 0);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
case SystemZ::ATOMIC_LOAD_XR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
- case SystemZ::ATOMIC_LOAD_XILF32:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 32);
+ case SystemZ::ATOMIC_LOAD_XILF:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
case SystemZ::ATOMIC_LOAD_XGR:
return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
- case SystemZ::ATOMIC_LOAD_XILF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 64);
- case SystemZ::ATOMIC_LOAD_XIHF:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF, 64);
+ case SystemZ::ATOMIC_LOAD_XILF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
+ case SystemZ::ATOMIC_LOAD_XIHF64:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
case SystemZ::ATOMIC_LOADW_NRi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
case SystemZ::ATOMIC_LOADW_NILHi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0, true);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
case SystemZ::ATOMIC_LOAD_NRi:
return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
- case SystemZ::ATOMIC_LOAD_NILL32i:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32, true);
- case SystemZ::ATOMIC_LOAD_NILH32i:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32, true);
- case SystemZ::ATOMIC_LOAD_NILF32i:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32, true);
- case SystemZ::ATOMIC_LOAD_NGRi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
case SystemZ::ATOMIC_LOAD_NILLi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64, true);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
case SystemZ::ATOMIC_LOAD_NILHi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64, true);
- case SystemZ::ATOMIC_LOAD_NIHLi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64, true);
- case SystemZ::ATOMIC_LOAD_NIHHi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64, true);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
case SystemZ::ATOMIC_LOAD_NILFi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64, true);
- case SystemZ::ATOMIC_LOAD_NIHFi:
- return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64, true);
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
+ case SystemZ::ATOMIC_LOAD_NGRi:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILL64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILH64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHL64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHH64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NILF64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
+ case SystemZ::ATOMIC_LOAD_NIHF64i:
+ return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
case SystemZ::ATOMIC_LOADW_MIN:
return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
@@ -2481,8 +3221,27 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
case SystemZ::ATOMIC_CMP_SWAPW:
return emitAtomicCmpSwapW(MI, MBB);
- case SystemZ::MVCWrapper:
- return emitMVCWrapper(MI, MBB);
+ case SystemZ::MVCSequence:
+ case SystemZ::MVCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+ case SystemZ::NCSequence:
+ case SystemZ::NCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::NC);
+ case SystemZ::OCSequence:
+ case SystemZ::OCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::OC);
+ case SystemZ::XCSequence:
+ case SystemZ::XCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::XC);
+ case SystemZ::CLCSequence:
+ case SystemZ::CLCLoop:
+ return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+ case SystemZ::CLSTLoop:
+ return emitStringWrapper(MI, MBB, SystemZ::CLST);
+ case SystemZ::MVSTLoop:
+ return emitStringWrapper(MI, MBB, SystemZ::MVST);
+ case SystemZ::SRSTLoop:
+ return emitStringWrapper(MI, MBB, SystemZ::SRST);
default:
llvm_unreachable("Unexpected instr type to insert");
}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index c0dbe49..c6dcca6 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -32,17 +32,32 @@ namespace SystemZISD {
// is the target address. The arguments start at operand 2.
// There is an optional glue operand at the end.
CALL,
+ SIBCALL,
// Wraps a TargetGlobalAddress that should be loaded using PC-relative
// accesses (LARL). Operand 0 is the address.
PCREL_WRAPPER,
- // Signed integer and floating-point comparisons. The operands are the
- // two values to compare.
- CMP,
+ // Used in cases where an offset is applied to a TargetGlobalAddress.
+ // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+ // PCREL_WRAPPER for an anchor point. This is used so that we can
+ // cheaply refer to either the full address or the anchor point
+ // as a register base.
+ PCREL_OFFSET,
- // Likewise unsigned integer comparison.
- UCMP,
+ // Integer comparisons. There are three operands: the two values
+ // to compare, and an integer of type SystemZICMP.
+ ICMP,
+
+ // Floating-point comparisons. The two operands are the values to compare.
+ FCMP,
+
+ // Test under mask. The first operand is ANDed with the second operand
+ // and the condition codes are set on the result. The third operand is
+ // a boolean that is true if the condition codes need to distinguish
+ // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+ // register forms do but the memory forms don't).
+ TM,
// Branches if a condition is true. Operand 0 is the chain operand;
// operand 1 is the 4-bit condition-code mask, with bit N in
@@ -73,13 +88,50 @@ namespace SystemZISD {
UDIVREM32,
UDIVREM64,
- // Use MVC to copy bytes from one memory location to another.
- // The first operand is the target address, the second operand is the
- // source address, and the third operand is the constant length.
+ // Use a series of MVCs to copy bytes from one memory location to another.
+ // The operands are:
+ // - the target address
+ // - the source address
+ // - the constant length
+ //
// This isn't a memory opcode because we'd need to attach two
// MachineMemOperands rather than one.
MVC,
+ // Like MVC, but implemented as a loop that handles X*256 bytes
+ // followed by straight-line code to handle the rest (if any).
+ // The value of X is passed as an additional operand.
+ MVC_LOOP,
+
+ // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+ NC,
+ NC_LOOP,
+ OC,
+ OC_LOOP,
+ XC,
+ XC_LOOP,
+
+ // Use CLC to compare two blocks of memory, with the same comments
+ // as for MVC and MVC_LOOP.
+ CLC,
+ CLC_LOOP,
+
+ // Use an MVST-based sequence to implement stpcpy().
+ STPCPY,
+
+ // Use a CLST-based sequence to implement strcmp(). The two input operands
+ // are the addresses of the strings to compare.
+ STRCMP,
+
+ // Use an SRST-based sequence to search a block of memory. The first
+ // operand is the end address, the second is the start, and the third
+ // is the character to search for. CC is set to 1 on success and 2
+ // on failure.
+ SEARCH_STRING,
+
+ // Store the CC value in bits 29 and 28 of an integer.
+ IPM,
+
// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
// ATOMIC_LOAD_<op>.
//
@@ -111,7 +163,27 @@ namespace SystemZISD {
// operand into the high bits
// Operand 4: the negative of operand 2, for rotating the other way
// Operand 5: the width of the field in bits (8 or 16)
- ATOMIC_CMP_SWAPW
+ ATOMIC_CMP_SWAPW,
+
+ // Prefetch from the second operand using the 4-bit control code in
+ // the first operand. The code is 1 for a load prefetch and 2 for
+ // a store prefetch.
+ PREFETCH
+ };
+
+ // Return true if OPCODE is some kind of PC-relative address.
+ inline bool isPCREL(unsigned Opcode) {
+ return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
+ }
+}
+
+namespace SystemZICMP {
+ // Describes whether an integer comparison needs to be signed or unsigned,
+ // or whether either type is OK.
+ enum {
+ Any,
+ UnsignedOnly,
+ SignedOnly
};
}
@@ -126,15 +198,15 @@ public:
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const LLVM_OVERRIDE {
return MVT::i32;
}
- virtual EVT getSetCCResultType(LLVMContext &, EVT) const LLVM_OVERRIDE {
- return MVT::i32;
- }
+ virtual EVT getSetCCResultType(LLVMContext &, EVT) const LLVM_OVERRIDE;
virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const LLVM_OVERRIDE;
virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const LLVM_OVERRIDE;
virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const
LLVM_OVERRIDE;
virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const
LLVM_OVERRIDE;
+ virtual bool isTruncateFree(Type *, Type *) const LLVM_OVERRIDE;
+ virtual bool isTruncateFree(EVT, EVT) const LLVM_OVERRIDE;
virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
virtual std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const std::string &Constraint,
@@ -154,6 +226,8 @@ public:
MachineBasicBlock *BB) const LLVM_OVERRIDE;
virtual SDValue LowerOperation(SDValue Op,
SelectionDAG &DAG) const LLVM_OVERRIDE;
+ virtual bool allowTruncateForTailCall(Type *, Type *) const LLVM_OVERRIDE;
+ virtual bool mayBeEmittedAsTailCall(CallInst *CI) const LLVM_OVERRIDE;
virtual SDValue
LowerFormalArguments(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -176,6 +250,7 @@ private:
const SystemZTargetMachine &TM;
// Implement LowerOperation for individual opcodes.
+ SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
@@ -189,6 +264,7 @@ private:
SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
@@ -199,6 +275,7 @@ private:
SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
// If the last instruction before MBBI in MBB was some form of COMPARE,
// try to replace it with a COMPARE AND BRANCH just before MBBI.
@@ -230,8 +307,12 @@ private:
unsigned BitSize) const;
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitMVCWrapper(MachineInstr *MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Opcode) const;
+ MachineBasicBlock *emitStringWrapper(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Opcode) const;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index b903b51..6080046 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -27,9 +27,9 @@ defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
// Load zero.
let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
- def LZER : InherentRRE<"lze", 0xB374, FP32, (fpimm0)>;
- def LZDR : InherentRRE<"lzd", 0xB375, FP64, (fpimm0)>;
- def LZXR : InherentRRE<"lzx", 0xB376, FP128, (fpimm0)>;
+ def LZER : InherentRRE<"lzer", 0xB374, FP32, (fpimm0)>;
+ def LZDR : InherentRRE<"lzdr", 0xB375, FP64, (fpimm0)>;
+ def LZXR : InherentRRE<"lzxr", 0xB376, FP128, (fpimm0)>;
}
// Moves between two floating-point registers.
@@ -62,7 +62,7 @@ let isCodeGenOnly = 1 in {
// The sign of an FP128 is in the high register.
def : Pat<(fcopysign FP32:$src1, FP128:$src2),
- (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+ (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
// fcopysign with an FP64 result.
let isCodeGenOnly = 1 in
@@ -71,24 +71,24 @@ def CPSDRdd : BinaryRRF<"cpsd", 0xB372, fcopysign, FP64, FP64>;
// The sign of an FP128 is in the high register.
def : Pat<(fcopysign FP64:$src1, FP128:$src2),
- (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+ (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
// fcopysign with an FP128 result. Use "upper" as the high half and leave
// the low half as-is.
class CopySign128<RegisterOperand cls, dag upper>
: Pat<(fcopysign FP128:$src1, cls:$src2),
- (INSERT_SUBREG FP128:$src1, upper, subreg_high)>;
+ (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
-def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_high),
+def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
FP32:$src2)>;
-def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
+def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
FP64:$src2)>;
-def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
- (EXTRACT_SUBREG FP128:$src2, subreg_high))>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
-defm LoadStoreF32 : MVCLoadStore<load, store, f32, MVCWrapper, 4>;
-defm LoadStoreF64 : MVCLoadStore<load, store, f64, MVCWrapper, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, store, f128, MVCWrapper, 16>;
+defm LoadStoreF32 : MVCLoadStore<load, f32, MVCSequence, 4>;
+defm LoadStoreF64 : MVCLoadStore<load, f64, MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
//===----------------------------------------------------------------------===//
// Load instructions
@@ -134,9 +134,9 @@ def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>;
def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>;
def : Pat<(f32 (fround FP128:$src)),
- (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_32bit)>;
+ (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
def : Pat<(f64 (fround FP128:$src)),
- (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_high)>;
+ (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
// Extend register floating-point values to wider representations.
def LDEBR : UnaryRRE<"ldeb", 0xB304, fextend, FP64, FP32>;
@@ -212,21 +212,56 @@ def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
// Round to an integer, with the second operand (modifier M3) specifying
-// the rounding mode.
-//
-// These forms always check for inexact conditions. z196 added versions
-// that allow this to suppressed (as for fnearbyint), but we don't yet
-// support -march=z196.
+// the rounding mode. These forms always check for inexact conditions.
def FIEBR : UnaryRRF<"fieb", 0xB357, FP32, FP32>;
def FIDBR : UnaryRRF<"fidb", 0xB35F, FP64, FP64>;
def FIXBR : UnaryRRF<"fixb", 0xB347, FP128, FP128>;
+// Extended forms of the previous three instructions. M4 can be set to 4
+// to suppress detection of inexact conditions.
+def FIEBRA : UnaryRRF4<"fiebra", 0xB357, FP32, FP32>,
+ Requires<[FeatureFPExtension]>;
+def FIDBRA : UnaryRRF4<"fidbra", 0xB35F, FP64, FP64>,
+ Requires<[FeatureFPExtension]>;
+def FIXBRA : UnaryRRF4<"fixbra", 0xB347, FP128, FP128>,
+ Requires<[FeatureFPExtension]>;
+
// frint rounds according to the current mode (modifier 0) and detects
// inexact conditions.
def : Pat<(frint FP32:$src), (FIEBR 0, FP32:$src)>;
def : Pat<(frint FP64:$src), (FIDBR 0, FP64:$src)>;
def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+let Predicates = [FeatureFPExtension] in {
+ // fnearbyint is like frint but does not detect inexact conditions.
+ def : Pat<(fnearbyint FP32:$src), (FIEBRA 0, FP32:$src, 4)>;
+ def : Pat<(fnearbyint FP64:$src), (FIDBRA 0, FP64:$src, 4)>;
+ def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+
+ // floor is no longer allowed to raise an inexact condition,
+ // so restrict it to the cases where the condition can be suppressed.
+ // Mode 7 is round towards -inf.
+ def : Pat<(ffloor FP32:$src), (FIEBRA 7, FP32:$src, 4)>;
+ def : Pat<(ffloor FP64:$src), (FIDBRA 7, FP64:$src, 4)>;
+ def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+
+ // Same idea for ceil, where mode 6 is round towards +inf.
+ def : Pat<(fceil FP32:$src), (FIEBRA 6, FP32:$src, 4)>;
+ def : Pat<(fceil FP64:$src), (FIDBRA 6, FP64:$src, 4)>;
+ def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+
+ // Same idea for trunc, where mode 5 is round towards zero.
+ def : Pat<(ftrunc FP32:$src), (FIEBRA 5, FP32:$src, 4)>;
+ def : Pat<(ftrunc FP64:$src), (FIDBRA 5, FP64:$src, 4)>;
+ def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+
+ // Same idea for round, where mode 1 is round towards nearest with
+ // ties away from zero.
+ def : Pat<(frnd FP32:$src), (FIEBRA 1, FP32:$src, 4)>;
+ def : Pat<(frnd FP64:$src), (FIDBRA 1, FP64:$src, 4)>;
+ def : Pat<(frnd FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+}
+
//===----------------------------------------------------------------------===//
// Binary arithmetic
//===----------------------------------------------------------------------===//
@@ -265,26 +300,26 @@ def MDB : BinaryRXE<"mdb", 0xED1C, fmul, FP64, load, 8>;
def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>;
def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
(MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
- FP32:$src1, subreg_32bit), FP32:$src2)>;
+ FP32:$src1, subreg_h32), FP32:$src2)>;
// f64 multiplication of an FP32 register and an f32 memory.
def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
def : Pat<(fmul (f64 (fextend FP32:$src1)),
(f64 (extloadf32 bdxaddr12only:$addr))),
- (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_32bit),
+ (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
bdxaddr12only:$addr)>;
// f128 multiplication of two FP64 registers.
def MXDBR : BinaryRRE<"mxdb", 0xB307, null_frag, FP128, FP64>;
def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))),
(MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
- FP64:$src1, subreg_high), FP64:$src2)>;
+ FP64:$src1, subreg_h64), FP64:$src2)>;
// f128 multiplication of an FP64 register and an f64 memory.
def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
def : Pat<(fmul (f128 (fextend FP64:$src1)),
(f128 (extloadf64 bdxaddr12only:$addr))),
- (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_high),
+ (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
bdxaddr12only:$addr)>;
// Fused multiply-add.
@@ -314,12 +349,12 @@ def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
//===----------------------------------------------------------------------===//
let Defs = [CC], CCValues = 0xF in {
- def CEBR : CompareRRE<"ceb", 0xB309, z_cmp, FP32, FP32>;
- def CDBR : CompareRRE<"cdb", 0xB319, z_cmp, FP64, FP64>;
- def CXBR : CompareRRE<"cxb", 0xB349, z_cmp, FP128, FP128>;
+ def CEBR : CompareRRE<"ceb", 0xB309, z_fcmp, FP32, FP32>;
+ def CDBR : CompareRRE<"cdb", 0xB319, z_fcmp, FP64, FP64>;
+ def CXBR : CompareRRE<"cxb", 0xB349, z_fcmp, FP128, FP128>;
- def CEB : CompareRXE<"ceb", 0xED09, z_cmp, FP32, load, 4>;
- def CDB : CompareRXE<"cdb", 0xED19, z_cmp, FP64, load, 8>;
+ def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
+ def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 954df11..a8efe16 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -308,10 +308,11 @@ class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
bits<4> R1;
bits<4> R2;
bits<4> R3;
+ bits<4> R4;
let Inst{31-16} = op;
let Inst{15-12} = R3;
- let Inst{11-8} = 0;
+ let Inst{11-8} = R4;
let Inst{7-4} = R1;
let Inst{3-0} = R2;
}
@@ -539,6 +540,10 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
// One output operand and five input operands. The first two operands
// are registers and the other three are immediates.
//
+// Prefetch:
+// One 4-bit immediate operand and one address operand. The immediate
+// operand is 1 for a load prefetch and 2 for a store prefetch.
+//
// The format determines which input operands are tied to output operands,
// and also determines the shape of any address operand.
//
@@ -552,7 +557,7 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
dag src>
: InstRRE<opcode, (outs cls:$R1), (ins),
- mnemonic#"r\t$R1",
+ mnemonic#"\t$R1",
[(set cls:$R1, src)]> {
let R2 = 0;
}
@@ -626,27 +631,33 @@ class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
let mayStore = 1;
}
+// StoreSI* instructions are used to store an integer to memory, but the
+// addresses are more restricted than for normal stores. If we are in the
+// situation of having to force either the address into a register or the
+// constant into a register, it's usually better to do the latter.
+// We therefore match the address in the same way as a normal store and
+// only use the StoreSI* instruction if the matched address is suitable.
class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
- Immediate imm, AddressingMode mode = bdaddr12only>
- : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ Immediate imm>
+ : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator imm:$I2, mode:$BD1)]> {
+ [(operator imm:$I2, mviaddr12pair:$BD1)]> {
let mayStore = 1;
}
class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- Immediate imm, AddressingMode mode = bdaddr20only>
- : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+ Immediate imm>
+ : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator imm:$I2, mode:$BD1)]> {
+ [(operator imm:$I2, mviaddr20pair:$BD1)]> {
let mayStore = 1;
}
class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
Immediate imm>
- : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2",
- [(operator imm:$I2, bdaddr12only:$BD1)]> {
+ [(operator imm:$I2, mviaddr12pair:$BD1)]> {
let mayStore = 1;
}
@@ -654,9 +665,9 @@ multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
SDPatternOperator operator, Immediate imm> {
let DispKey = mnemonic in {
let DispSize = "12" in
- def "" : StoreSI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
+ def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
let DispSize = "20" in
- def Y : StoreSIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+ def Y : StoreSIY<mnemonic#"y", siyOpcode, operator, imm>;
}
}
@@ -719,8 +730,14 @@ class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
mnemonic#"r\t$R1, $R3, $R2", []> {
let OpKey = mnemonic ## cls1;
let OpType = "reg";
+ let R4 = 0;
}
+class UnaryRRF4<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2, uimm8zx4:$R4),
+ mnemonic#"\t$R1, $R3, $R2, $R4", []>;
+
// These instructions are generated by if conversion. The old value of R1
// is added as an implicit use.
class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
@@ -729,6 +746,7 @@ class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
mnemonic#"r$R3\t$R1, $R2", []>,
Requires<[FeatureLoadStoreOnCond]> {
let CCMaskLast = 1;
+ let R4 = 0;
}
// Like CondUnaryRRF, but used for the raw assembly form. The condition-code
@@ -740,6 +758,7 @@ class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
Requires<[FeatureLoadStoreOnCond]> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
+ let R4 = 0;
}
// Like CondUnaryRRF, but with a fixed CC mask.
@@ -751,6 +770,7 @@ class FixedCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
let R3 = ccmask;
+ let R4 = 0;
}
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
@@ -898,13 +918,16 @@ class BinaryRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
[(set cls1:$R1, (operator cls1:$R3, cls2:$R2))]> {
let OpKey = mnemonic ## cls1;
let OpType = "reg";
+ let R4 = 0;
}
class BinaryRRFK<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R2, cls2:$R3),
mnemonic#"rk\t$R1, $R2, $R3",
- [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]>;
+ [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]> {
+ let R4 = 0;
+}
multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
SDPatternOperator operator, RegisterOperand cls1,
@@ -1285,6 +1308,22 @@ class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let DisableEncoding = "$R1src";
}
+class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
+ : InstRXY<opcode, (outs), (ins uimm8zx4:$R1, bdxaddr20only:$XBD2),
+ mnemonic##"\t$R1, $XBD2",
+ [(operator uimm8zx4:$R1, bdxaddr20only:$XBD2)]>;
+
+class PrefetchRILPC<string mnemonic, bits<12> opcode,
+ SDPatternOperator operator>
+ : InstRIL<opcode, (outs), (ins uimm8zx4:$R1, pcrel32:$I2),
+ mnemonic##"\t$R1, $I2",
+ [(operator uimm8zx4:$R1, pcrel32:$I2)]> {
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
// A floating-point load-and test operation. Create both a normal unary
// operation and one that acts as a comparison against zero.
multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
@@ -1310,6 +1349,100 @@ class Pseudo<dag outs, dag ins, list<dag> pattern>
let isCodeGenOnly = 1;
}
+// Like UnaryRI, but expanded after RA depending on the choice of register.
+class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs cls:$R1), (ins imm:$I2),
+ [(set cls:$R1, (operator imm:$I2))]>;
+
+// Like UnaryRXY, but expanded after RA depending on the choice of register.
+class UnaryRXYPseudo<string key, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only>
+ : Pseudo<(outs cls:$R1), (ins mode:$XBD2),
+ [(set cls:$R1, (operator mode:$XBD2))]> {
+ let OpKey = key ## cls;
+ let OpType = "mem";
+ let mayLoad = 1;
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+ let AccessBytes = bytes;
+}
+
+// Like UnaryRR, but expanded after RA depending on the choice of registers.
+class UnaryRRPseudo<string key, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : Pseudo<(outs cls1:$R1), (ins cls2:$R2),
+ [(set cls1:$R1, (operator cls2:$R2))]> {
+ let OpKey = key ## cls1;
+ let OpType = "reg";
+}
+
+// Like BinaryRI, but expanded after RA depending on the choice of register.
+class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+}
+
+// Like BinaryRIE, but expanded after RA depending on the choice of register.
+class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
+ [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
+multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
+ RegisterOperand cls, Immediate imm> {
+ let NumOpsKey = key in {
+ let NumOpsValue = "3" in
+ def K : BinaryRIEPseudo<null_frag, cls, imm>,
+ Requires<[FeatureHighWord, FeatureDistinctOps]>;
+ let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+ def "" : BinaryRIPseudo<operator, cls, imm>,
+ Requires<[FeatureHighWord]>;
+ }
+}
+
+// Like CompareRI, but expanded after RA depending on the choice of register.
+class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]>;
+
+// Like CompareRXY, but expanded after RA depending on the choice of register.
+class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+ SDPatternOperator load, bits<5> bytes,
+ AddressingMode mode = bdxaddr20only>
+ : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+ [(operator cls:$R1, (load mode:$XBD2))]> {
+ let mayLoad = 1;
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+ let AccessBytes = bytes;
+}
+
+// Like StoreRXY, but expanded after RA depending on the choice of register.
+class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdxaddr20only>
+ : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+ [(operator cls:$R1, mode:$XBD2)]> {
+ let mayStore = 1;
+ let Has20BitOffset = 1;
+ let HasIndex = 1;
+ let AccessBytes = bytes;
+}
+
+// Like RotateSelectRIEf, but expanded after RA depending on the choice
+// of registers.
+class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
+ : Pseudo<(outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5),
+ []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
// the value of the PSW's 2-bit condition code field.
class SelectWrapper<RegisterOperand cls>
@@ -1386,3 +1519,85 @@ class AtomicLoadWBinaryReg<SDPatternOperator operator>
: AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
: AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of instructions that each process 256 bytes,
+// followed by another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+ SDPatternOperator sequence, SDPatternOperator loop> {
+ def "" : InstSS<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
+ bdaddr12only:$BD2),
+ mnemonic##"\t$BDL1, $BD2", []>;
+ let usesCustomInserter = 1 in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length)]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256)]>;
+ }
+}
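// Illustrative sketch (not part of this patch; plain C++, splitBlockOp is a
// made-up helper): how an arbitrary-length block operation maps onto the
// Loop form described above, assuming each loop iteration handles one full
// 256-byte chunk and a single trailing instruction handles the remainder.
#include <cassert>
#include <cstdint>

static void splitBlockOp(uint64_t Length, uint64_t &Count256,
                         uint64_t &Residue) {
  Count256 = Length / 256;   // iterations of the 256-byte operation
  Residue = Length % 256;    // bytes left for the final instruction
}

int main() {
  uint64_t Count256, Residue;
  splitBlockOp(1000, Count256, Residue);
  assert(Count256 == 3 && Residue == 232);
  return 0;
}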
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0. The instruction processes a CPU-determined
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated. Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator> {
+ def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
+ (ins GR64:$R1src, GR64:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ }
+ let usesCustomInserter = 1 in
+ def Loop : Pseudo<(outs GR64:$end),
+ (ins GR64:$start1, GR64:$start2, GR32:$char),
+ [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+ GR32:$char))]>;
+}
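// Illustrative sketch (not part of this patch; plain C++, copyUntilTerminator
// is a made-up model): the shape of the Loop pseudo above.  Each execution of
// the underlying instruction copies some CPU-determined number of bytes and
// signals "not finished" via CC==3, so the pseudo is a branch-back loop
// around it; one byte per step stands in here for the hardware's chunking.
#include <cassert>
#include <cstring>

static char *copyUntilTerminator(char *Dest, const char *Src, char Term) {
  bool Done;
  do {
    *Dest = *Src;             // one "step" of the string instruction
    Done = (*Src == Term);    // stop once the terminator has been copied
    ++Dest;
    ++Src;
  } while (!Done);            // corresponds to branching back while CC==3
  return Dest - 1;            // address of the copied terminator
}

int main() {
  char Buf[8];
  char *End = copyUntilTerminator(Buf, "abc", '\0');
  assert(std::strcmp(Buf, "abc") == 0 && *End == '\0');
  return 0;
}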
+
+// A pseudo instruction that is a direct alias of a real instruction.
+// These aliases are used in cases where a particular register operand is
+// fixed or where the same instruction is used with different register sizes.
+// The size parameter is the size in bytes of the associated real instruction.
+class Alias<int size, dag outs, dag ins, list<dag> pattern>
+ : InstSystemZ<size, outs, ins, "", pattern> {
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+}
+
+// An alias of a BinaryRI, but with different register sizes.
+class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryRIL, but with different register sizes.
+class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+ [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+ let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a CompareRI, but with different register sizes.
+class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
+ Immediate imm>
+ : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+ let isCompare = 1;
+}
+
+// An alias of a RotateSelectRIEf, but with different register sizes.
+class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
+ : Alias<6, (outs cls1:$R1),
+ (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), []> {
+ let Constraints = "$R1 = $R1src";
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 9ee60aa..acfeed8 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -17,7 +17,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#define GET_INSTRMAP_INFO
#include "SystemZGenInstrInfo.inc"
@@ -28,6 +28,18 @@ static uint64_t allOnes(unsigned int Count) {
return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
}
+// Reg should be a 32-bit GPR. Return true if it is a high register rather
+// than a low register.
+static bool isHighReg(unsigned int Reg) {
+ if (SystemZ::GRH32BitRegClass.contains(Reg))
+ return true;
+ assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
+ return false;
+}
+
+// Pin the vtable to this file.
+void SystemZInstrInfo::anchor() {}
+
SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
: SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
RI(tm), TM(tm) {
@@ -48,8 +60,8 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
// Set up the two 64-bit registers.
MachineOperand &HighRegOp = EarlierMI->getOperand(0);
MachineOperand &LowRegOp = MI->getOperand(0);
- HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_high));
- LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_low));
+ HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
+ LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
// The address in the first (high) instruction is already correct.
// Adjust the offset in the second (low) instruction.
@@ -82,6 +94,97 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
OffsetMO.setImm(Offset);
}
+// MI is an RI-style pseudo instruction. Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32. ConvertHigh is true if LowOpcode takes a signed operand
+// and HighOpcode takes an unsigned 32-bit operand. In those cases,
+// MI has the same kind of operand as LowOpcode, so it needs to be converted
+// if HighOpcode is used.
+void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned HighOpcode,
+ bool ConvertHigh) const {
+ unsigned Reg = MI->getOperand(0).getReg();
+ bool IsHigh = isHighReg(Reg);
+ MI->setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+ if (IsHigh && ConvertHigh)
+ MI->getOperand(1).setImm(uint32_t(MI->getOperand(1).getImm()));
+}
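// Illustrative sketch (not part of this patch, plain C++): the effect of the
// ConvertHigh path above.  The low opcode (e.g. LHI, as used for LHIMux later
// in this patch) carries a sign-extended immediate, while the high-register
// replacement (e.g. IIHF) expects a full unsigned 32-bit value, so the
// immediate is reinterpreted exactly as in expandRIPseudo.
#include <cassert>
#include <cstdint>

int main() {
  int64_t SignedImm = -1;                    // immediate as written for LHI
  int64_t Converted = uint32_t(SignedImm);   // same cast as in expandRIPseudo
  assert(Converted == 0xFFFFFFFF);           // value IIHF would then encode
  return 0;
}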
+
+// MI is a three-operand RIE-style pseudo instruction. Replace it with
+// LowOpcodeK if the registers are both low GR32s, otherwise use a move
+// followed by HighOpcode or LowOpcode, depending on whether the target
+// is a high or low GR32.
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned LowOpcodeK,
+ unsigned HighOpcode) const {
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ bool DestIsHigh = isHighReg(DestReg);
+ bool SrcIsHigh = isHighReg(SrcReg);
+ if (!DestIsHigh && !SrcIsHigh)
+ MI->setDesc(get(LowOpcodeK));
+ else {
+ emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+ DestReg, SrcReg, SystemZ::LR, 32,
+ MI->getOperand(1).isKill());
+ MI->setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+ MI->getOperand(1).setReg(DestReg);
+ }
+}
+
+// MI is an RXY-style pseudo instruction. Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned HighOpcode) const {
+ unsigned Reg = MI->getOperand(0).getReg();
+ unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
+ MI->getOperand(2).getImm());
+ MI->setDesc(get(Opcode));
+}
+
+// MI is an RR-style pseudo instruction that zero-extends the low Size bits
+// of one GRX32 into another. Replace it with LowOpcode if both operands
+// are low registers, otherwise use RISB[LH]G.
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned Size) const {
+ emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+ MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
+ LowOpcode, Size, MI->getOperand(1).isKill());
+ MI->eraseFromParent();
+}
+
+// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
+// DestReg before MBBI in MBB. Use LowLowOpcode when both DestReg and SrcReg
+// are low registers, otherwise use RISB[LH]G. Size is the number of bits
+// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
+// KillSrc is true if this move is the last use of SrcReg.
+void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned DestReg,
+ unsigned SrcReg, unsigned LowLowOpcode,
+ unsigned Size, bool KillSrc) const {
+ unsigned Opcode;
+ bool DestIsHigh = isHighReg(DestReg);
+ bool SrcIsHigh = isHighReg(SrcReg);
+ if (DestIsHigh && SrcIsHigh)
+ Opcode = SystemZ::RISBHH;
+ else if (DestIsHigh && !SrcIsHigh)
+ Opcode = SystemZ::RISBHL;
+ else if (!DestIsHigh && SrcIsHigh)
+ Opcode = SystemZ::RISBLH;
+ else {
+ BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ return;
+ }
+ unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
+ BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+ .addReg(DestReg, RegState::Undef)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
+}
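// Illustrative sketch (not part of this patch; plain C++, risbOperands is a
// made-up helper): the RISB operand values produced by the tail of
// emitGRX32Move.  The insert runs from bit 32-Size to bit 31, the +128 on the
// end operand requests that all other destination bits be zeroed, and the
// rotate is 32 whenever the move crosses the high/low register halves.
#include <cassert>

static void risbOperands(unsigned Size, bool CrossesHalves, unsigned &Start,
                         unsigned &End, unsigned &Rotate) {
  Start = 32 - Size;                 // first source bit inserted
  End = 128 + 31;                    // last bit is 31; +128 zeroes the rest
  Rotate = CrossesHalves ? 32 : 0;   // swap word halves when needed
}

int main() {
  unsigned Start, End, Rotate;
  // e.g. an 8-bit zero-extension from a high source into a low destination
  risbOperands(8, /*CrossesHalves=*/true, Start, End, Rotate);
  assert(Start == 24 && End == 159 && Rotate == 32);
  return 0;
}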
+
// If MI is a simple load or store for a frame object, return the register
// it loads or stores and set FrameIndex to the index of the frame object.
// Return 0 otherwise.
@@ -293,6 +396,103 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
return Count;
}
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
+ unsigned &SrcReg, unsigned &SrcReg2,
+ int &Mask, int &Value) const {
+ assert(MI->isCompare() && "Caller should have checked for a comparison");
+
+ if (MI->getNumExplicitOperands() == 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isImm()) {
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
+ Value = MI->getOperand(1).getImm();
+ Mask = ~0;
+ return true;
+ }
+
+ return false;
+}
+
+// If Reg is a virtual register, return its definition, otherwise return null.
+static MachineInstr *getDef(unsigned Reg,
+ const MachineRegisterInfo *MRI) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return 0;
+ return MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI is a shift of type Opcode by Imm bits.
+static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
+ return (MI->getOpcode() == Opcode &&
+ !MI->getOperand(2).getReg() &&
+ MI->getOperand(3).getImm() == Imm);
+}
+
+// If the destination of MI has no uses, delete it as dead.
+static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
+ if (MRI->use_nodbg_empty(MI->getOperand(0).getReg()))
+ MI->eraseFromParent();
+}
+
+// Compare compares SrcReg against zero. Check whether SrcReg contains
+// the result of an IPM sequence whose input CC survives until Compare,
+// and whether Compare is therefore redundant. Delete it and return
+// true if so.
+static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI) {
+ MachineInstr *LGFR = 0;
+ MachineInstr *RLL = getDef(SrcReg, MRI);
+ if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
+ LGFR = RLL;
+ RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
+ }
+ if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
+ return false;
+
+ MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI);
+ if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC))
+ return false;
+
+ MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI);
+ if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+ return false;
+
+  // Check that there are no assignments to CC between the IPM and Compare.
+ if (IPM->getParent() != Compare->getParent())
+ return false;
+ MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+ for (++MBBI; MBBI != MBBE; ++MBBI) {
+ MachineInstr *MI = MBBI;
+ if (MI->modifiesRegister(SystemZ::CC, TRI))
+ return false;
+ }
+
+ Compare->eraseFromParent();
+ if (LGFR)
+ eraseIfDead(LGFR, MRI);
+ eraseIfDead(RLL, MRI);
+ eraseIfDead(SRL, MRI);
+ eraseIfDead(IPM, MRI);
+
+ return true;
+}
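// Illustrative sketch (not part of this patch; plain C++, assuming IPM places
// CC at bit position SystemZ::IPM_CC of its result): the value that the
// matched IPM -> SRL -> RLL sequence leaves in SrcReg.  Its sign/zero state
// is a fixed function of CC, which is why a later compare of SrcReg against
// zero adds no information and can be deleted.
#include <cassert>
#include <cstdint>

static uint32_t ipmSequenceValue(uint32_t CC, unsigned IPMCC) {
  uint32_t IPM = CC << IPMCC;          // IPM: CC placed at bit IPMCC
  uint32_t SRL = IPM >> IPMCC;         // SRL ..., IPM_CC: CC now in bits 0-1
  return (SRL << 31) | (SRL >> 1);     // RLL ..., 31: rotate left by 31
}

int main() {
  const unsigned IPMCC = 28;                          // assumed IPM_CC value
  assert(ipmSequenceValue(0, IPMCC) == 0);            // CC==0 -> zero
  assert(int32_t(ipmSequenceValue(1, IPMCC)) < 0);    // CC==1 -> negative
  assert(int32_t(ipmSequenceValue(2, IPMCC)) > 0);    // CC==2 -> positive
  assert(int32_t(ipmSequenceValue(3, IPMCC)) < 0);    // CC==3 -> negative
  return 0;
}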
+
+bool
+SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
+ unsigned SrcReg, unsigned SrcReg2,
+ int Mask, int Value,
+ const MachineRegisterInfo *MRI) const {
+ assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+ bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+ if (Value == 0 &&
+ !IsLogical &&
+ removeIPMBasedCompare(Compare, SrcReg, MRI, TM.getRegisterInfo()))
+ return true;
+ return false;
+}
+
// If Opcode is a move that has a conditional variant, return that variant,
// otherwise return 0.
static unsigned getConditionalMove(unsigned Opcode) {
@@ -356,18 +556,21 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool KillSrc) const {
// Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too.
if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
- copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_high),
- RI.getSubReg(SrcReg, SystemZ::subreg_high), KillSrc);
- copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_low),
- RI.getSubReg(SrcReg, SystemZ::subreg_low), KillSrc);
+ copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
+ RI.getSubReg(SrcReg, SystemZ::subreg_h64), KillSrc);
+ copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_l64),
+ RI.getSubReg(SrcReg, SystemZ::subreg_l64), KillSrc);
+ return;
+ }
+
+ if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
+ emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
return;
}
// Everything else needs only one instruction.
unsigned Opcode;
- if (SystemZ::GR32BitRegClass.contains(DestReg, SrcReg))
- Opcode = SystemZ::LR;
- else if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+ if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LGR;
else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::LER;
@@ -438,15 +641,15 @@ namespace {
static LogicOp interpretAndImmediate(unsigned Opcode) {
switch (Opcode) {
- case SystemZ::NILL32: return LogicOp(32, 0, 16);
- case SystemZ::NILH32: return LogicOp(32, 16, 16);
- case SystemZ::NILL: return LogicOp(64, 0, 16);
- case SystemZ::NILH: return LogicOp(64, 16, 16);
- case SystemZ::NIHL: return LogicOp(64, 32, 16);
- case SystemZ::NIHH: return LogicOp(64, 48, 16);
- case SystemZ::NILF32: return LogicOp(32, 0, 32);
- case SystemZ::NILF: return LogicOp(64, 0, 32);
- case SystemZ::NIHF: return LogicOp(64, 32, 32);
+ case SystemZ::NILMux: return LogicOp(32, 0, 16);
+ case SystemZ::NIHMux: return LogicOp(32, 16, 16);
+ case SystemZ::NILL64: return LogicOp(64, 0, 16);
+ case SystemZ::NILH64: return LogicOp(64, 16, 16);
+ case SystemZ::NIHL64: return LogicOp(64, 32, 16);
+ case SystemZ::NIHH64: return LogicOp(64, 48, 16);
+ case SystemZ::NIFMux: return LogicOp(32, 0, 32);
+ case SystemZ::NILF64: return LogicOp(64, 0, 32);
+ case SystemZ::NIHF64: return LogicOp(64, 32, 32);
default: return LogicOp();
}
}
@@ -473,6 +676,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
LiveVariables *LV) const {
MachineInstr *MI = MBBI;
MachineBasicBlock *MBB = MI->getParent();
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
unsigned Opcode = MI->getOpcode();
unsigned NumOps = MI->getNumOperands();
@@ -482,10 +686,23 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// because it tends to be shorter and because some instructions
// have memory forms that can be used during spilling.
if (TM.getSubtargetImpl()->hasDistinctOps()) {
+ MachineOperand &Dest = MI->getOperand(0);
+ MachineOperand &Src = MI->getOperand(1);
+ unsigned DestReg = Dest.getReg();
+ unsigned SrcReg = Src.getReg();
+ // AHIMux is only really a three-operand instruction when both operands
+ // are low registers. Try to constrain both operands to be low if
+ // possible.
+ if (Opcode == SystemZ::AHIMux &&
+ TargetRegisterInfo::isVirtualRegister(DestReg) &&
+ TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
+ MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
+ MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
+ MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
+ }
int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
if (ThreeOperandOpcode >= 0) {
- MachineOperand &Dest = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
MachineInstrBuilder MIB =
BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode))
.addOperand(Dest);
@@ -500,34 +717,27 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
// Try to convert an AND into an RISBG-type instruction.
if (LogicOp And = interpretAndImmediate(Opcode)) {
- unsigned NewOpcode;
- if (And.RegSize == 64)
- NewOpcode = SystemZ::RISBG;
- else if (TM.getSubtargetImpl()->hasHighWord())
- NewOpcode = SystemZ::RISBLG32;
- else
- // We can't use RISBG for 32-bit operations because it clobbers the
- // high word of the destination too.
- NewOpcode = 0;
- if (NewOpcode) {
- uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
- // AND IMMEDIATE leaves the other bits of the register unchanged.
- Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
- unsigned Start, End;
- if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
- if (NewOpcode == SystemZ::RISBLG32) {
- Start &= 31;
- End &= 31;
- }
- MachineOperand &Dest = MI->getOperand(0);
- MachineOperand &Src = MI->getOperand(1);
- MachineInstrBuilder MIB =
- BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
- .addOperand(Dest).addReg(0)
- .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
- .addImm(Start).addImm(End + 128).addImm(0);
- return finishConvertToThreeAddress(MI, MIB, LV);
+ uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
+ // AND IMMEDIATE leaves the other bits of the register unchanged.
+ Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
+ unsigned Start, End;
+ if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
+ unsigned NewOpcode;
+ if (And.RegSize == 64)
+ NewOpcode = SystemZ::RISBG;
+ else {
+ NewOpcode = SystemZ::RISBMux;
+ Start &= 31;
+ End &= 31;
}
+ MachineOperand &Dest = MI->getOperand(0);
+ MachineOperand &Src = MI->getOperand(1);
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
+ .addOperand(Dest).addReg(0)
+ .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
+ .addImm(Start).addImm(End + 128).addImm(0);
+ return finishConvertToThreeAddress(MI, MIB, LV);
}
}
return 0;
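// Illustrative sketch (not part of this patch, plain C++): the Imm
// computation above, worked for a hypothetical AND of the low 16 bits with
// 0x00ff (And = {RegSize=32, ImmLSB=0, ImmSize=16}).  AND IMMEDIATE leaves
// bits outside the immediate field untouched, so they are filled with ones
// before the mask check; the result is the kind of (possibly wrapped) run of
// ones that isRxSBGMask tests for.
#include <cassert>
#include <cstdint>

static uint64_t allOnes(unsigned Count) {   // same helper as in this file
  return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
}

int main() {
  unsigned RegSize = 32, ImmLSB = 0, ImmSize = 16;
  uint64_t Imm = uint64_t(0x00ff) << ImmLSB;
  Imm |= allOnes(RegSize) & ~(allOnes(ImmSize) << ImmLSB);
  assert(Imm == 0xffff00ff);   // ones wrap around the zeroed bits 8-15
  return 0;
}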
@@ -540,8 +750,21 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
int FrameIndex) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
unsigned Size = MFI->getObjectSize(FrameIndex);
+ unsigned Opcode = MI->getOpcode();
+
+ if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+ if ((Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+ isInt<8>(MI->getOperand(2).getImm()) &&
+ !MI->getOperand(3).getReg()) {
+ // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+ return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::AGSI))
+ .addFrameIndex(FrameIndex).addImm(0)
+ .addImm(MI->getOperand(2).getImm());
+ }
+ return 0;
+ }
- // Eary exit for cases we don't care about
+ // All other cases require a single operand.
if (Ops.size() != 1)
return 0;
@@ -550,7 +773,16 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
.getRegClass(MI->getOperand(OpNum).getReg())->getSize() &&
"Invalid size combination");
- unsigned Opcode = MI->getOpcode();
+ if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) &&
+ OpNum == 0 &&
+ isInt<8>(MI->getOperand(2).getImm())) {
+ // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
+ Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
+ return BuildMI(MF, MI->getDebugLoc(), get(Opcode))
+ .addFrameIndex(FrameIndex).addImm(0)
+ .addImm(MI->getOperand(2).getImm());
+ }
+
if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
bool Op0IsGPR = (Opcode == SystemZ::LGDR);
bool Op1IsGPR = (Opcode == SystemZ::LDGR);
@@ -577,10 +809,14 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
//
// Although MVC is in practice a fast choice in these cases, it is still
// logically a bytewise copy. This means that we cannot use it if the
- // load or store is volatile. It also means that the transformation is
- // not valid in cases where the two memories partially overlap; however,
- // that is not a problem here, because we know that one of the memories
- // is a full frame index.
+ // load or store is volatile. We also wouldn't be able to use MVC if
+ // the two memories partially overlap, but that case cannot occur here,
+ // because we know that one of the memories is a full frame index.
+ //
+ // For performance reasons, we also want to avoid using MVC if the addresses
+ // might be equal. We don't worry about that case here, because spill slot
+ // coloring happens later, and because we have special code to remove
+ // MVCs that turn out to be redundant.
if (OpNum == 0 && MI->hasOneMemOperand()) {
MachineMemOperand *MMO = *MI->memoperands_begin();
if (MMO->getSize() == Size && !MMO->isVolatile()) {
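// Illustrative sketch (not part of this patch, plain C++): why a logically
// bytewise copy such as MVC is unsafe on partially overlapping memory.  A
// forward byte-by-byte copy smears the destination, which is why the comment
// above relies on one operand being a full, distinct frame index.
#include <cassert>
#include <cstring>

int main() {
  char Buf[8] = {'a', 'b', 'c', 'd', '\0'};
  for (int I = 0; I < 3; ++I)   // copy 3 bytes from Buf to Buf+1 (overlap)
    Buf[I + 1] = Buf[I];
  assert(std::strcmp(Buf, "aaaa") == 0);  // not the "abc" a true move gives
  return 0;
}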
@@ -651,6 +887,138 @@ SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
splitMove(MI, SystemZ::STD);
return true;
+ case SystemZ::LBMux:
+ expandRXYPseudo(MI, SystemZ::LB, SystemZ::LBH);
+ return true;
+
+ case SystemZ::LHMux:
+ expandRXYPseudo(MI, SystemZ::LH, SystemZ::LHH);
+ return true;
+
+ case SystemZ::LLCRMux:
+ expandZExtPseudo(MI, SystemZ::LLCR, 8);
+ return true;
+
+ case SystemZ::LLHRMux:
+ expandZExtPseudo(MI, SystemZ::LLHR, 16);
+ return true;
+
+ case SystemZ::LLCMux:
+ expandRXYPseudo(MI, SystemZ::LLC, SystemZ::LLCH);
+ return true;
+
+ case SystemZ::LLHMux:
+ expandRXYPseudo(MI, SystemZ::LLH, SystemZ::LLHH);
+ return true;
+
+ case SystemZ::LMux:
+ expandRXYPseudo(MI, SystemZ::L, SystemZ::LFH);
+ return true;
+
+ case SystemZ::STCMux:
+ expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
+ return true;
+
+ case SystemZ::STHMux:
+ expandRXYPseudo(MI, SystemZ::STH, SystemZ::STHH);
+ return true;
+
+ case SystemZ::STMux:
+ expandRXYPseudo(MI, SystemZ::ST, SystemZ::STFH);
+ return true;
+
+ case SystemZ::LHIMux:
+ expandRIPseudo(MI, SystemZ::LHI, SystemZ::IIHF, true);
+ return true;
+
+ case SystemZ::IIFMux:
+ expandRIPseudo(MI, SystemZ::IILF, SystemZ::IIHF, false);
+ return true;
+
+ case SystemZ::IILMux:
+ expandRIPseudo(MI, SystemZ::IILL, SystemZ::IIHL, false);
+ return true;
+
+ case SystemZ::IIHMux:
+ expandRIPseudo(MI, SystemZ::IILH, SystemZ::IIHH, false);
+ return true;
+
+ case SystemZ::NIFMux:
+ expandRIPseudo(MI, SystemZ::NILF, SystemZ::NIHF, false);
+ return true;
+
+ case SystemZ::NILMux:
+ expandRIPseudo(MI, SystemZ::NILL, SystemZ::NIHL, false);
+ return true;
+
+ case SystemZ::NIHMux:
+ expandRIPseudo(MI, SystemZ::NILH, SystemZ::NIHH, false);
+ return true;
+
+ case SystemZ::OIFMux:
+ expandRIPseudo(MI, SystemZ::OILF, SystemZ::OIHF, false);
+ return true;
+
+ case SystemZ::OILMux:
+ expandRIPseudo(MI, SystemZ::OILL, SystemZ::OIHL, false);
+ return true;
+
+ case SystemZ::OIHMux:
+ expandRIPseudo(MI, SystemZ::OILH, SystemZ::OIHH, false);
+ return true;
+
+ case SystemZ::XIFMux:
+ expandRIPseudo(MI, SystemZ::XILF, SystemZ::XIHF, false);
+ return true;
+
+ case SystemZ::TMLMux:
+ expandRIPseudo(MI, SystemZ::TMLL, SystemZ::TMHL, false);
+ return true;
+
+ case SystemZ::TMHMux:
+ expandRIPseudo(MI, SystemZ::TMLH, SystemZ::TMHH, false);
+ return true;
+
+ case SystemZ::AHIMux:
+ expandRIPseudo(MI, SystemZ::AHI, SystemZ::AIH, false);
+ return true;
+
+ case SystemZ::AHIMuxK:
+ expandRIEPseudo(MI, SystemZ::AHI, SystemZ::AHIK, SystemZ::AIH);
+ return true;
+
+ case SystemZ::AFIMux:
+ expandRIPseudo(MI, SystemZ::AFI, SystemZ::AIH, false);
+ return true;
+
+ case SystemZ::CFIMux:
+ expandRIPseudo(MI, SystemZ::CFI, SystemZ::CIH, false);
+ return true;
+
+ case SystemZ::CLFIMux:
+ expandRIPseudo(MI, SystemZ::CLFI, SystemZ::CLIH, false);
+ return true;
+
+ case SystemZ::CMux:
+ expandRXYPseudo(MI, SystemZ::C, SystemZ::CHF);
+ return true;
+
+ case SystemZ::CLMux:
+ expandRXYPseudo(MI, SystemZ::CL, SystemZ::CLHF);
+ return true;
+
+ case SystemZ::RISBMux: {
+ bool DestIsHigh = isHighReg(MI->getOperand(0).getReg());
+ bool SrcIsHigh = isHighReg(MI->getOperand(2).getReg());
+ if (SrcIsHigh == DestIsHigh)
+ MI->setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+ else {
+ MI->setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+ MI->getOperand(5).setImm(MI->getOperand(5).getImm() ^ 32);
+ }
+ return true;
+ }
+
case SystemZ::ADJDYNALLOC:
splitAdjDynAlloc(MI);
return true;
@@ -697,11 +1065,21 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
MI->getOperand(2).getImm(), &MI->getOperand(3));
+ case SystemZ::CLIJ:
+ case SystemZ::CLRJ:
+ return SystemZII::Branch(SystemZII::BranchCL, SystemZ::CCMASK_ICMP,
+ MI->getOperand(2).getImm(), &MI->getOperand(3));
+
case SystemZ::CGIJ:
case SystemZ::CGRJ:
return SystemZII::Branch(SystemZII::BranchCG, SystemZ::CCMASK_ICMP,
MI->getOperand(2).getImm(), &MI->getOperand(3));
+ case SystemZ::CLGIJ:
+ case SystemZ::CLGRJ:
+ return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
+ MI->getOperand(2).getImm(), &MI->getOperand(3));
+
default:
llvm_unreachable("Unrecognized branch opcode");
}
@@ -712,7 +1090,13 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
unsigned &StoreOpcode) const {
if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass) {
LoadOpcode = SystemZ::L;
- StoreOpcode = SystemZ::ST32;
+ StoreOpcode = SystemZ::ST;
+ } else if (RC == &SystemZ::GRH32BitRegClass) {
+ LoadOpcode = SystemZ::LFH;
+ StoreOpcode = SystemZ::STFH;
+ } else if (RC == &SystemZ::GRX32BitRegClass) {
+ LoadOpcode = SystemZ::LMux;
+ StoreOpcode = SystemZ::STMux;
} else if (RC == &SystemZ::GR64BitRegClass ||
RC == &SystemZ::ADDR64BitRegClass) {
LoadOpcode = SystemZ::LG;
@@ -830,6 +1214,14 @@ unsigned SystemZInstrInfo::getCompareAndBranch(unsigned Opcode,
return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CIJ : 0;
case SystemZ::CGHI:
return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CGIJ : 0;
+ case SystemZ::CLR:
+ return SystemZ::CLRJ;
+ case SystemZ::CLGR:
+ return SystemZ::CLGRJ;
+ case SystemZ::CLFI:
+ return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLIJ : 0;
+ case SystemZ::CLGFI:
+ return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLGIJ : 0;
default:
return 0;
}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 276fd3b..be4c8fe 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -70,10 +70,18 @@ namespace SystemZII {
// on the result.
BranchC,
+  // An instruction that performs a 32-bit unsigned comparison and branches
+ // on the result.
+ BranchCL,
+
// An instruction that peforms a 64-bit signed comparison and branches
// on the result.
BranchCG,
+  // An instruction that performs a 64-bit unsigned comparison and branches
+ // on the result.
+ BranchCLG,
+
// An instruction that decrements a 32-bit register and branches if
// the result is nonzero.
BranchCT,
@@ -108,7 +116,19 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
-
+ void expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned HighOpcode, bool ConvertHigh) const;
+ void expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned LowOpcodeK, unsigned HighOpcode) const;
+ void expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned HighOpcode) const;
+ void expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+ unsigned Size) const;
+ void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+ unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+ virtual void anchor();
+
public:
explicit SystemZInstrInfo(SystemZTargetMachine &TM);
@@ -129,6 +149,12 @@ public:
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
DebugLoc DL) const LLVM_OVERRIDE;
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const
+ LLVM_OVERRIDE;
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
+ const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index b318d67..6524e44 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -32,12 +32,9 @@ let neverHasSideEffects = 1 in {
// Control flow instructions
//===----------------------------------------------------------------------===//
-// A return instruction. R1 is the condition-code mask (all 1s)
-// and R2 is the target address, which is always stored in %r14.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1,
- R1 = 15, R2 = 14, isCodeGenOnly = 1 in {
- def RET : InstRR<0x07, (outs), (ins), "br\t%r14", [(z_retflag)]>;
-}
+// A return instruction (br %r14).
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+ def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
// Unconditional branches. R1 is the condition-code mask (all 1s).
let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
@@ -70,6 +67,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
"brc\t$R1, $I2", []>;
def AsmBRCL : InstRIL<0xC04, (outs), (ins uimm8zx4:$R1, brtarget32:$I2),
"brcl\t$R1, $I2", []>;
+ def AsmBCR : InstRR<0x07, (outs), (ins uimm8zx4:$R1, GR64:$R2),
+ "bcr\t$R1, $R2", []>;
}
// Fused compare-and-branch instructions. As for normal branches,
@@ -94,6 +93,18 @@ multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
def GIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
brtarget16:$RI4),
"cgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ def LRJ : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+ brtarget16:$RI4),
+ "clrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ def LGRJ : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+ brtarget16:$RI4),
+ "clgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+ def LIJ : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2, ccmask:$M3,
+ brtarget16:$RI4),
+ "clij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+ def LGIJ : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2, ccmask:$M3,
+ brtarget16:$RI4),
+ "clgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
}
}
let isCodeGenOnly = 1 in
@@ -108,6 +119,7 @@ multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
"j"##name##"\t$I2", []>;
def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2),
"jg"##name##"\t$I2", []>;
+ def BR : InstRR<0x07, (outs), (ins ADDR64:$R2), "b"##name##"r\t$R2", []>;
}
def LOCR : FixedCondUnaryRRF<"locr"##name, 0xB9F2, GR32, GR32, ccmask>;
def LOCGR : FixedCondUnaryRRF<"locgr"##name, 0xB9E2, GR64, GR64, ccmask>;
@@ -152,6 +164,18 @@ multiclass IntCondExtendedMnemonicA<bits<4> ccmask, string name> {
def CGI : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
brtarget16:$RI4),
"cgij"##name##"\t$R1, $I2, $RI4", []>;
+ def CLR : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2,
+ brtarget16:$RI4),
+ "clrj"##name##"\t$R1, $R2, $RI4", []>;
+ def CLGR : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2,
+ brtarget16:$RI4),
+ "clgrj"##name##"\t$R1, $R2, $RI4", []>;
+ def CLI : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2,
+ brtarget16:$RI4),
+ "clij"##name##"\t$R1, $I2, $RI4", []>;
+ def CLGI : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2,
+ brtarget16:$RI4),
+ "clgij"##name##"\t$R1, $I2, $RI4", []>;
}
}
multiclass IntCondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
@@ -177,22 +201,31 @@ let Defs = [CC] in {
// Select instructions
//===----------------------------------------------------------------------===//
-def Select32 : SelectWrapper<GR32>;
-def Select64 : SelectWrapper<GR64>;
-
-defm CondStore8_32 : CondStores<GR32, nonvolatile_truncstorei8,
+def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
+def Select32 : SelectWrapper<GR32>;
+def Select64 : SelectWrapper<GR64>;
+
+// We don't define 32-bit Mux stores because the low-only STOC should
+// always be used if possible.
+defm CondStore8Mux : CondStores<GRX32, nonvolatile_truncstorei8,
+ nonvolatile_anyextloadi8, bdxaddr20only>,
+ Requires<[FeatureHighWord]>;
+defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
+ nonvolatile_anyextloadi16, bdxaddr20only>,
+ Requires<[FeatureHighWord]>;
+defm CondStore8 : CondStores<GR32, nonvolatile_truncstorei8,
nonvolatile_anyextloadi8, bdxaddr20only>;
-defm CondStore16_32 : CondStores<GR32, nonvolatile_truncstorei16,
+defm CondStore16 : CondStores<GR32, nonvolatile_truncstorei16,
nonvolatile_anyextloadi16, bdxaddr20only>;
-defm CondStore32_32 : CondStores<GR32, nonvolatile_store,
+defm CondStore32 : CondStores<GR32, nonvolatile_store,
nonvolatile_load, bdxaddr20only>;
-defm CondStore8 : CondStores<GR64, nonvolatile_truncstorei8,
- nonvolatile_anyextloadi8, bdxaddr20only>;
-defm CondStore16 : CondStores<GR64, nonvolatile_truncstorei16,
- nonvolatile_anyextloadi16, bdxaddr20only>;
-defm CondStore32 : CondStores<GR64, nonvolatile_truncstorei32,
- nonvolatile_anyextloadi32, bdxaddr20only>;
+defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
+ nonvolatile_anyextloadi8, bdxaddr20only>;
+defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
+ nonvolatile_anyextloadi16, bdxaddr20only>;
+defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
+ nonvolatile_anyextloadi32, bdxaddr20only>;
defm CondStore64 : CondStores<GR64, nonvolatile_store,
nonvolatile_load, bdxaddr20only>;
@@ -202,24 +235,30 @@ defm CondStore64 : CondStores<GR64, nonvolatile_store,
// The definitions here are for the call-clobbered registers.
let isCall = 1, Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
- F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC],
- R1 = 14, isCodeGenOnly = 1 in {
- def BRAS : InstRI<0xA75, (outs), (ins pcrel16call:$I2, variable_ops),
- "bras\t%r14, $I2", []>;
- def BRASL : InstRIL<0xC05, (outs), (ins pcrel32call:$I2, variable_ops),
- "brasl\t%r14, $I2", [(z_call pcrel32call:$I2)]>;
- def BASR : InstRR<0x0D, (outs), (ins ADDR64:$R2, variable_ops),
- "basr\t%r14, $R2", [(z_call ADDR64:$R2)]>;
+ F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC] in {
+ def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+ [(z_call pcrel32:$I2)]>;
+ def CallBASR : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+ [(z_call ADDR64:$R2)]>;
+}
+
+// Sibling calls. Indirect sibling calls must be via R1, since R2 upwards
+// are argument registers and since branching to R0 is a no-op.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+ def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
+ [(z_sibcall pcrel32:$I2)]>;
+ let Uses = [R1D] in
+ def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
}
// Define the general form of the call instructions for the asm parser.
// These instructions don't hard-code %r14 as the return address register.
-def AsmBRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
- "bras\t$R1, $I2", []>;
-def AsmBRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
- "brasl\t$R1, $I2", []>;
-def AsmBASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
- "basr\t$R1, $R2", []>;
+def BRAS : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
+ "bras\t$R1, $I2", []>;
+def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
+ "brasl\t$R1, $I2", []>;
+def BASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
+ "basr\t$R1, $R2", []>;
//===----------------------------------------------------------------------===//
// Move instructions
@@ -227,6 +266,9 @@ def AsmBASR : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
// Register moves.
let neverHasSideEffects = 1 in {
+ // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+ def LRMux : UnaryRRPseudo<"l", null_frag, GRX32, GRX32>,
+ Requires<[FeatureHighWord]>;
def LR : UnaryRR <"l", 0x18, null_frag, GR32, GR32>;
def LGR : UnaryRRE<"lg", 0xB904, null_frag, GR64, GR64>;
}
@@ -248,7 +290,10 @@ let Uses = [CC] in {
// Immediate moves.
let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
isReMaterializable = 1 in {
- // 16-bit sign-extended immediates.
+ // 16-bit sign-extended immediates. LHIMux expands to LHI or IIHF,
+  // depending on the choice of register.
+ def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
+ Requires<[FeatureHighWord]>;
def LHI : UnaryRI<"lhi", 0xA78, bitconvert, GR32, imm32sx16>;
def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
@@ -266,7 +311,12 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
// Register loads.
let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
+ // Expands to L, LY or LFH, depending on the choice of register.
+ def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+ Requires<[FeatureHighWord]>;
defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
+ def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+ Requires<[FeatureHighWord]>;
def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
// These instructions are split after register allocation, so we don't
@@ -298,8 +348,12 @@ let Uses = [CC] in {
// Register stores.
let SimpleBDXStore = 1 in {
- let isCodeGenOnly = 1 in
- defm ST32 : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+ // Expands to ST, STY or STFH, depending on the choice of register.
+ def STMux : StoreRXYPseudo<store, GRX32, 4>,
+ Requires<[FeatureHighWord]>;
+ defm ST : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+ def STFH : StoreRXY<"stfh", 0xE3CB, store, GRH32, 4>,
+ Requires<[FeatureHighWord]>;
def STG : StoreRXY<"stg", 0xE324, store, GR64, 8>;
// These instructions are split after register allocation, so we don't
@@ -309,15 +363,13 @@ let SimpleBDXStore = 1 in {
[(store GR128:$src, bdxaddr20only128:$dst)]>;
}
}
-let isCodeGenOnly = 1 in
- def STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STRL : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
// Store on condition.
let isCodeGenOnly = 1, Uses = [CC] in {
- def STOC32 : CondStoreRSY<"stoc", 0xEBF3, GR32, 4>;
- def STOC : CondStoreRSY<"stoc", 0xEBF3, GR64, 4>;
- def STOCG : CondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
+ def STOC : CondStoreRSY<"stoc", 0xEBF3, GR32, 4>;
+ def STOCG : CondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
}
let Uses = [CC] in {
def AsmSTOC : AsmCondStoreRSY<"stoc", 0xEBF3, GR32, 4>;
@@ -334,33 +386,22 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
- def MVC : InstSS<0xD2, (outs), (ins bdladdr12onlylen8:$BDL1,
- bdaddr12only:$BD2),
- "mvc\t$BDL1, $BD2", []>;
-
-let mayLoad = 1, mayStore = 1, usesCustomInserter = 1 in
- def MVCWrapper : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm32len8:$length),
- [(z_mvc bdaddr12only:$dest, bdaddr12only:$src,
- imm32len8:$length)]>;
-
-defm LoadStore8_32 : MVCLoadStore<anyextloadi8, truncstorei8, i32,
- MVCWrapper, 1>;
-defm LoadStore16_32 : MVCLoadStore<anyextloadi16, truncstorei16, i32,
- MVCWrapper, 2>;
-defm LoadStore32_32 : MVCLoadStore<load, store, i32, MVCWrapper, 4>;
-
-defm LoadStore8 : MVCLoadStore<anyextloadi8, truncstorei8, i64,
- MVCWrapper, 1>;
-defm LoadStore16 : MVCLoadStore<anyextloadi16, truncstorei16, i64,
- MVCWrapper, 2>;
-defm LoadStore32 : MVCLoadStore<anyextloadi32, truncstorei32, i64,
- MVCWrapper, 4>;
-defm LoadStore64 : MVCLoadStore<load, store, i64, MVCWrapper, 8>;
+ defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+
+// String moves.
+let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in
+ defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
//===----------------------------------------------------------------------===//
// Sign extensions
//===----------------------------------------------------------------------===//
+//
+// Note that putting these before zero extensions means that we will prefer
+// them for anyextload*. There's not really much to choose between the two
+// either way, but sign-extending loads have a short LH and a long LHY,
+// while zero-extending loads have only the long LLH.
+//
+//===----------------------------------------------------------------------===//
// 32-bit extensions from registers.
let neverHasSideEffects = 1 in {
@@ -380,40 +421,33 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
// Match 32-to-64-bit sign extensions in which the source is already
// in a 64-bit register.
def : Pat<(sext_inreg GR64:$src, i32),
- (LGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
-
-// 32-bit extensions from memory.
-def LB : UnaryRXY<"lb", 0xE376, sextloadi8, GR32, 1>;
-defm LH : UnaryRXPair<"lh", 0x48, 0xE378, sextloadi16, GR32, 2>;
-def LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_sextloadi16, GR32>;
+ (LGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory. LBMux expands to LB or LBH,
+// depending on the choice of register.
+def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+def LB : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+ Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory. LHMux expands to LH or LHH,
+// depending on the choice of register.
+def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+ Requires<[FeatureHighWord]>;
+defm LH : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
+def LHH : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+ Requires<[FeatureHighWord]>;
+def LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
// 64-bit extensions from memory.
-def LGB : UnaryRXY<"lgb", 0xE377, sextloadi8, GR64, 1>;
-def LGH : UnaryRXY<"lgh", 0xE315, sextloadi16, GR64, 2>;
-def LGF : UnaryRXY<"lgf", 0xE314, sextloadi32, GR64, 4>;
-def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_sextloadi16, GR64>;
-def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_sextloadi32, GR64>;
+def LGB : UnaryRXY<"lgb", 0xE377, asextloadi8, GR64, 1>;
+def LGH : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
+def LGF : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
- def LTGF : UnaryRXY<"ltgf", 0xE332, sextloadi32, GR64, 4>;
-
-// If the sign of a load-extend operation doesn't matter, use the signed ones.
-// There's not really much to choose between the sign and zero extensions,
-// but LH is more compact than LLH for small offsets.
-def : Pat<(i32 (extloadi8 bdxaddr20only:$src)), (LB bdxaddr20only:$src)>;
-def : Pat<(i32 (extloadi16 bdxaddr12pair:$src)), (LH bdxaddr12pair:$src)>;
-def : Pat<(i32 (extloadi16 bdxaddr20pair:$src)), (LHY bdxaddr20pair:$src)>;
-
-def : Pat<(i64 (extloadi8 bdxaddr20only:$src)), (LGB bdxaddr20only:$src)>;
-def : Pat<(i64 (extloadi16 bdxaddr20only:$src)), (LGH bdxaddr20only:$src)>;
-def : Pat<(i64 (extloadi32 bdxaddr20only:$src)), (LGF bdxaddr20only:$src)>;
-
-// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
-// However, BDXs have two extra operands and are therefore 6 units more
-// complex.
-let AddedComplexity = 7 in {
- def : Pat<(i32 (extloadi16 pcrel32:$src)), (LHRL pcrel32:$src)>;
- def : Pat<(i64 (extloadi16 pcrel32:$src)), (LGHRL pcrel32:$src)>;
-}
+ def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
//===----------------------------------------------------------------------===//
// Zero extensions
@@ -421,8 +455,14 @@ let AddedComplexity = 7 in {
// 32-bit extensions from registers.
let neverHasSideEffects = 1 in {
- def LLCR : UnaryRRE<"llc", 0xB994, zext8, GR32, GR32>;
- def LLHR : UnaryRRE<"llh", 0xB995, zext16, GR32, GR32>;
+ // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+ def LLCRMux : UnaryRRPseudo<"llc", zext8, GRX32, GRX32>,
+ Requires<[FeatureHighWord]>;
+ def LLCR : UnaryRRE<"llc", 0xB994, zext8, GR32, GR32>;
+ // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+ def LLHRMux : UnaryRRPseudo<"llh", zext16, GRX32, GRX32>,
+ Requires<[FeatureHighWord]>;
+ def LLHR : UnaryRRE<"llh", 0xB995, zext16, GR32, GR32>;
}
// 64-bit extensions from registers.
@@ -435,19 +475,31 @@ let neverHasSideEffects = 1 in {
// Match 32-to-64-bit zero extensions in which the source is already
// in a 64-bit register.
def : Pat<(and GR64:$src, 0xffffffff),
- (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+ (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
-// 32-bit extensions from memory.
-def LLC : UnaryRXY<"llc", 0xE394, zextloadi8, GR32, 1>;
-def LLH : UnaryRXY<"llh", 0xE395, zextloadi16, GR32, 2>;
-def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_zextloadi16, GR32>;
+// 32-bit extensions from 8-bit memory. LLCMux expands to LLC or LLCH,
+// depending on the choice of register.
+def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>,
+ Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH,
+// depending on the choice of register.
+def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+ Requires<[FeatureHighWord]>;
+def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
+def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>,
+ Requires<[FeatureHighWord]>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
// 64-bit extensions from memory.
-def LLGC : UnaryRXY<"llgc", 0xE390, zextloadi8, GR64, 1>;
-def LLGH : UnaryRXY<"llgh", 0xE391, zextloadi16, GR64, 2>;
-def LLGF : UnaryRXY<"llgf", 0xE316, zextloadi32, GR64, 4>;
-def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_zextloadi16, GR64>;
-def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
+def LLGC : UnaryRXY<"llgc", 0xE390, azextloadi8, GR64, 1>;
+def LLGH : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
+def LLGF : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
//===----------------------------------------------------------------------===//
// Truncations
@@ -455,21 +507,31 @@ def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
// Truncations of 64-bit registers to 32-bit registers.
def : Pat<(i32 (trunc GR64:$src)),
- (EXTRACT_SUBREG GR64:$src, subreg_32bit)>;
+ (EXTRACT_SUBREG GR64:$src, subreg_l32)>;
-// Truncations of 32-bit registers to memory.
-let isCodeGenOnly = 1 in {
- defm STC32 : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
- defm STH32 : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
- def STHRL32 : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
-}
+// Truncations of 32-bit registers to 8-bit memory. STCMux expands to
+// STC, STCY or STCH, depending on the choice of register.
+def STCMux : StoreRXYPseudo<truncstorei8, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
+def STCH : StoreRXY<"stch", 0xE3C3, truncstorei8, GRH32, 1>,
+ Requires<[FeatureHighWord]>;
+
+// Truncations of 32-bit registers to 16-bit memory. STHMux expands to
+// STH, STHY or STHH, depending on the choice of register.
+def STHMux : StoreRXYPseudo<truncstorei16, GRX32, 1>,
+ Requires<[FeatureHighWord]>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
+def STHH : StoreRXY<"sthh", 0xE3C7, truncstorei16, GRH32, 2>,
+ Requires<[FeatureHighWord]>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
// Truncations of 64-bit registers to memory.
-defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR64, 1>;
-defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR64, 2>;
-def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR64>;
-defm ST : StoreRXPair<"st", 0x50, 0xE350, truncstorei32, GR64, 4>;
-def STRL : StoreRILPC<"strl", 0xC4F, aligned_truncstorei32, GR64>;
+defm : StoreGR64Pair<STC, STCY, truncstorei8>;
+defm : StoreGR64Pair<STH, STHY, truncstorei16>;
+def : StoreGR64PC<STHRL, aligned_truncstorei16>;
+defm : StoreGR64Pair<ST, STY, truncstorei32>;
+def : StoreGR64PC<STRL, aligned_truncstorei32>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -528,11 +590,31 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
}
//===----------------------------------------------------------------------===//
-// Negation
+// Absolute and Negation
//===----------------------------------------------------------------------===//
let Defs = [CC] in {
let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ def LPR : UnaryRR <"lp", 0x10, z_iabs32, GR32, GR32>;
+ def LPGR : UnaryRRE<"lpg", 0xB900, z_iabs64, GR64, GR64>;
+ }
+ let CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LPGFR : UnaryRRE<"lpgf", 0xB910, null_frag, GR64, GR32>;
+}
+defm : SXU<z_iabs64, LPGFR>;
+
+let Defs = [CC] in {
+ let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+ def LNR : UnaryRR <"ln", 0x11, z_inegabs32, GR32, GR32>;
+ def LNGR : UnaryRRE<"lng", 0xB901, z_inegabs64, GR64, GR64>;
+ }
+ let CCValues = 0xE, CompareZeroCCMask = 0xE in
+ def LNGFR : UnaryRRE<"lngf", 0xB911, null_frag, GR64, GR32>;
+}
+defm : SXU<z_inegabs64, LNGFR>;
+
+let Defs = [CC] in {
+ let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
def LCR : UnaryRR <"lc", 0x13, ineg, GR32, GR32>;
def LCGR : UnaryRRE<"lcg", 0xB903, ineg, GR64, GR64>;
}
@@ -546,43 +628,51 @@ defm : SXU<ineg, LCGFR>;
//===----------------------------------------------------------------------===//
let isCodeGenOnly = 1 in
- defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, zextloadi8, 1>;
-defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, zextloadi8, 1>;
+ defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
-defm : InsertMem<"inserti8", IC32, GR32, zextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", IC32Y, GR32, zextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC32, GR32, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
-defm : InsertMem<"inserti8", IC, GR64, zextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", ICY, GR64, zextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
// Insertions of a 16-bit immediate, leaving other bits unaffected.
// We don't have or_as_insert equivalents of these operations because
// OI is available instead.
-let isCodeGenOnly = 1 in {
- def IILL32 : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
- def IILH32 : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
-}
-def IILL : BinaryRI<"iill", 0xA53, insertll, GR64, imm64ll16>;
-def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR64, imm64lh16>;
-def IIHL : BinaryRI<"iihl", 0xA51, inserthl, GR64, imm64hl16>;
-def IIHH : BinaryRI<"iihh", 0xA50, inserthh, GR64, imm64hh16>;
+//
+// IIxMux expands to II[LH]x, depending on the choice of register.
+def IILMux : BinaryRIPseudo<insertll, GRX32, imm32ll16>,
+ Requires<[FeatureHighWord]>;
+def IIHMux : BinaryRIPseudo<insertlh, GRX32, imm32lh16>,
+ Requires<[FeatureHighWord]>;
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, insertll, GRH32, imm32ll16>;
+def IIHH : BinaryRI<"iihh", 0xA50, insertlh, GRH32, imm32lh16>;
+def IILL64 : BinaryAliasRI<insertll, GR64, imm64ll16>;
+def IILH64 : BinaryAliasRI<insertlh, GR64, imm64lh16>;
+def IIHL64 : BinaryAliasRI<inserthl, GR64, imm64hl16>;
+def IIHH64 : BinaryAliasRI<inserthh, GR64, imm64hh16>;
// ...likewise for 32-bit immediates. For GR32s this is a general
// full-width move. (We use IILF rather than something like LLILF
// for 32-bit moves because IILF leaves the upper 32 bits of the
// GR64 unchanged.)
-let isCodeGenOnly = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
- isReMaterializable = 1 in {
- def IILF32 : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
+ def IIFMux : UnaryRIPseudo<bitconvert, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+ def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>;
}
-def IILF : BinaryRIL<"iilf", 0xC09, insertlf, GR64, imm64lf32>;
-def IIHF : BinaryRIL<"iihf", 0xC08, inserthf, GR64, imm64hf32>;
+def IILF64 : BinaryAliasRIL<insertlf, GR64, imm64lf32>;
+def IIHF64 : BinaryAliasRIL<inserthf, GR64, imm64hf32>;
// An alternative model of inserthf, with the first operand being
// a zero-extended value.
def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
- (IIHF (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit),
- imm64hf32:$imm)>;
+ (IIHF64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+ imm64hf32:$imm)>;
//===----------------------------------------------------------------------===//
// Addition
@@ -598,17 +688,22 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
def AGFR : BinaryRRE<"agf", 0xB918, null_frag, GR64, GR32>;
// Addition of signed 16-bit immediates.
+ defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>;
defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
// Addition of signed 32-bit immediates.
+ def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+ Requires<[FeatureHighWord]>;
def AFI : BinaryRIL<"afi", 0xC29, add, GR32, simm32>;
+ def AIH : BinaryRIL<"aih", 0xCC8, add, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
// Addition of memory.
- defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, sextloadi16, 2>;
+ defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>;
- def AGF : BinaryRXY<"agf", 0xE318, add, GR64, sextloadi32, 4>;
+ def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>;
// Addition to memory.
@@ -638,7 +733,7 @@ let Defs = [CC] in {
// Addition of memory.
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
- def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, zextloadi32, 4>;
+ def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
}
defm : ZXB<addc, GR64, ALGFR>;
@@ -667,9 +762,9 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
defm SGR : BinaryRREAndK<"sg", 0xB909, 0xB9E9, sub, GR64, GR64>;
// Subtraction of memory.
- defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, sextloadi16, 2>;
+ defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
- def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, sextloadi32, 4>;
+ def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>;
}
defm : SXB<sub, GR64, SGFR>;
@@ -688,7 +783,7 @@ let Defs = [CC] in {
// Subtraction of memory.
defm SL : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
- def SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, zextloadi32, 4>;
+ def SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
def SLG : BinaryRXY<"slg", 0xE30B, subc, GR64, load, 8>;
}
defm : ZXB<subc, GR64, SLGFR>;
@@ -718,22 +813,33 @@ let Defs = [CC] in {
let isConvertibleToThreeAddress = 1 in {
// ANDs of a 16-bit immediate, leaving other bits unaffected.
// The CC result only reflects the 16-bit field, not the full register.
- let isCodeGenOnly = 1 in {
- def NILL32 : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
- def NILH32 : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
- }
- def NILL : BinaryRI<"nill", 0xA57, and, GR64, imm64ll16c>;
- def NILH : BinaryRI<"nilh", 0xA56, and, GR64, imm64lh16c>;
- def NIHL : BinaryRI<"nihl", 0xA55, and, GR64, imm64hl16c>;
- def NIHH : BinaryRI<"nihh", 0xA54, and, GR64, imm64hh16c>;
+ //
+ // NIxMux expands to NI[LH]x, depending on the choice of register.
+ def NILMux : BinaryRIPseudo<and, GRX32, imm32ll16c>,
+ Requires<[FeatureHighWord]>;
+ def NIHMux : BinaryRIPseudo<and, GRX32, imm32lh16c>,
+ Requires<[FeatureHighWord]>;
+ def NILL : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+ def NILH : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+ def NIHL : BinaryRI<"nihl", 0xA55, and, GRH32, imm32ll16c>;
+ def NIHH : BinaryRI<"nihh", 0xA54, and, GRH32, imm32lh16c>;
+ def NILL64 : BinaryAliasRI<and, GR64, imm64ll16c>;
+ def NILH64 : BinaryAliasRI<and, GR64, imm64lh16c>;
+ def NIHL64 : BinaryAliasRI<and, GR64, imm64hl16c>;
+ def NIHH64 : BinaryAliasRI<and, GR64, imm64hh16c>;
// ANDs of a 32-bit immediate, leaving other bits unaffected.
// The CC result only reflects the 32-bit field, which means we can
// use it as a zero indicator for i32 operations but not otherwise.
- let isCodeGenOnly = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in
- def NILF32 : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
- def NILF : BinaryRIL<"nilf", 0xC0B, and, GR64, imm64lf32c>;
- def NIHF : BinaryRIL<"nihf", 0xC0A, and, GR64, imm64hf32c>;
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ // Expands to NILF or NIHF, depending on the choice of register.
+ def NIFMux : BinaryRIPseudo<and, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def NILF : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+ def NIHF : BinaryRIL<"nihf", 0xC0A, and, GRH32, uimm32>;
+ }
+ def NILF64 : BinaryAliasRIL<and, GR64, imm64lf32c>;
+ def NIHF64 : BinaryAliasRIL<and, GR64, imm64hf32c>;
}
// ANDs of memory.
@@ -744,6 +850,10 @@ let Defs = [CC] in {
// AND to memory
defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>;
+
+ // Block AND.
+ let mayLoad = 1, mayStore = 1 in
+ defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
}
defm : RMWIByte<and, bdaddr12pair, NI>;
defm : RMWIByte<and, bdaddr20pair, NIY>;
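The NILMux/NIHMux/NIFMux definitions above follow a convention used throughout this patch: instead of isCodeGenOnly GR32/GR64 twins, a single pseudo takes a GRX32 operand (either 32-bit half of a GPR) and, as the comments say, expands to the low-register or high-register instruction depending on which half the register allocator picked. A minimal standalone sketch of that decision, with invented opcode tags and an assumed isHighReg() helper rather than the backend's real interface:

#include <cassert>
#include <cstdio>

// Hypothetical stand-ins for the two real instructions the pseudo can become.
enum class Opcode { NILF, NIHF };

// Assumption for the sketch: once a physical register has been chosen, we can
// tell whether it is a low (GR32) or high (GRH32) 32-bit half.  The numbering
// used here is invented.
bool isHighReg(int physReg) { return physReg >= 16; }

// Expanding the NIFMux pseudo: the immediate stays the same, only the opcode
// depends on which half of the GR64 was allocated.
Opcode expandNIFMux(int physReg) {
  return isHighReg(physReg) ? Opcode::NIHF : Opcode::NILF;
}

int main() {
  assert(expandNIFMux(3) == Opcode::NILF);   // low half  -> nilf
  assert(expandNIFMux(19) == Opcode::NIHF);  // high half -> nihf
  std::puts("ok");
}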
@@ -761,22 +871,33 @@ let Defs = [CC] in {
// ORs of a 16-bit immediate, leaving other bits unaffected.
// The CC result only reflects the 16-bit field, not the full register.
- let isCodeGenOnly = 1 in {
- def OILL32 : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
- def OILH32 : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
- }
- def OILL : BinaryRI<"oill", 0xA5B, or, GR64, imm64ll16>;
- def OILH : BinaryRI<"oilh", 0xA5A, or, GR64, imm64lh16>;
- def OIHL : BinaryRI<"oihl", 0xA59, or, GR64, imm64hl16>;
- def OIHH : BinaryRI<"oihh", 0xA58, or, GR64, imm64hh16>;
+ //
+ // OIxMux expands to OI[LH]x, depending on the choice of register.
+ def OILMux : BinaryRIPseudo<or, GRX32, imm32ll16>,
+ Requires<[FeatureHighWord]>;
+ def OIHMux : BinaryRIPseudo<or, GRX32, imm32lh16>,
+ Requires<[FeatureHighWord]>;
+ def OILL : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+ def OILH : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+ def OIHL : BinaryRI<"oihl", 0xA59, or, GRH32, imm32ll16>;
+ def OIHH : BinaryRI<"oihh", 0xA58, or, GRH32, imm32lh16>;
+ def OILL64 : BinaryAliasRI<or, GR64, imm64ll16>;
+ def OILH64 : BinaryAliasRI<or, GR64, imm64lh16>;
+ def OIHL64 : BinaryAliasRI<or, GR64, imm64hl16>;
+ def OIHH64 : BinaryAliasRI<or, GR64, imm64hh16>;
// ORs of a 32-bit immediate, leaving other bits unaffected.
// The CC result only reflects the 32-bit field, which means we can
// use it as a zero indicator for i32 operations but not otherwise.
- let isCodeGenOnly = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in
- def OILF32 : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
- def OILF : BinaryRIL<"oilf", 0xC0D, or, GR64, imm64lf32>;
- def OIHF : BinaryRIL<"oihf", 0xC0C, or, GR64, imm64hf32>;
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ // Expands to OILF or OIHF, depending on the choice of register.
+ def OIFMux : BinaryRIPseudo<or, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def OILF : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+ def OIHF : BinaryRIL<"oihf", 0xC0C, or, GRH32, uimm32>;
+ }
+ def OILF64 : BinaryAliasRIL<or, GR64, imm64lf32>;
+ def OIHF64 : BinaryAliasRIL<or, GR64, imm64hf32>;
// ORs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
@@ -786,6 +907,10 @@ let Defs = [CC] in {
// OR to memory
defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>;
+
+ // Block OR.
+ let mayLoad = 1, mayStore = 1 in
+ defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
}
defm : RMWIByte<or, bdaddr12pair, OI>;
defm : RMWIByte<or, bdaddr20pair, OIY>;
@@ -804,10 +929,15 @@ let Defs = [CC] in {
// XORs of a 32-bit immediate, leaving other bits unaffected.
// The CC result only reflects the 32-bit field, which means we can
// use it as a zero indicator for i32 operations but not otherwise.
- let isCodeGenOnly = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in
- def XILF32 : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
- def XILF : BinaryRIL<"xilf", 0xC07, xor, GR64, imm64lf32>;
- def XIHF : BinaryRIL<"xihf", 0xC06, xor, GR64, imm64hf32>;
+ let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+ // Expands to XILF or XIHF, depending on the choice of register.
+ def XIFMux : BinaryRIPseudo<xor, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
+ def XILF : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+ def XIHF : BinaryRIL<"xihf", 0xC06, xor, GRH32, uimm32>;
+ }
+ def XILF64 : BinaryAliasRIL<xor, GR64, imm64lf32>;
+ def XIHF64 : BinaryAliasRIL<xor, GR64, imm64hf32>;
// XORs of memory.
let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
@@ -817,6 +947,10 @@ let Defs = [CC] in {
// XOR to memory
defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>;
+
+ // Block XOR.
+ let mayLoad = 1, mayStore = 1 in
+ defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
}
defm : RMWIByte<xor, bdaddr12pair, XI>;
defm : RMWIByte<xor, bdaddr20pair, XIY>;
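The new Block AND/OR/XOR entries (NC, OC and XC via MemorySS, with the z_*_loop nodes covering longer runs) target logical read-modify-write operations where both operands already live in memory and the length is a compile-time constant. A hedged illustration of the kind of source code this aims at, assuming the legality checks behind storeLoadCanUseBlockBinary pass; it is not taken from the patch:

#include <cstdint>

// Both sides are memory operands of known width, so a single
// storage-to-storage instruction can replace load + register op + store.
void clear_bits(uint32_t *dst, const uint32_t *mask) { *dst &= *mask; }  // NC
void merge_flags(uint32_t *dst, const uint32_t *src) { *dst |= *src; }   // OC
void toggle_bits(uint32_t *dst, const uint32_t *src) { *dst ^= *src; }   // XC

int main() {
  uint32_t a = 0xffff00ffu;
  const uint32_t b = 0x0f0f0f0fu;
  clear_bits(&a, &b);
  merge_flags(&a, &b);
  toggle_bits(&a, &b);
  return 0;
}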
@@ -842,9 +976,9 @@ def MSFI : BinaryRIL<"msfi", 0xC21, mul, GR32, simm32>;
def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
// Multiplication of memory.
-defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, sextloadi16, 2>;
+defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
-def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, sextloadi32, 4>;
+def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
// Multiplication of a register, producing two results.
@@ -909,13 +1043,15 @@ let Defs = [CC] in {
// Forms of RISBG that only affect one word of the destination register.
// They do not set CC.
-let isCodeGenOnly = 1 in
- def RISBLG32 : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR32>,
- Requires<[FeatureHighWord]>;
-def RISBHG : RotateSelectRIEf<"risbhg", 0xEC5D, GR64, GR64>,
- Requires<[FeatureHighWord]>;
-def RISBLG : RotateSelectRIEf<"risblg", 0xEC51, GR64, GR64>,
- Requires<[FeatureHighWord]>;
+def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>, Requires<[FeatureHighWord]>;
+def RISBLL : RotateSelectAliasRIEf<GR32, GR32>, Requires<[FeatureHighWord]>;
+def RISBLH : RotateSelectAliasRIEf<GR32, GRH32>, Requires<[FeatureHighWord]>;
+def RISBHL : RotateSelectAliasRIEf<GRH32, GR32>, Requires<[FeatureHighWord]>;
+def RISBHH : RotateSelectAliasRIEf<GRH32, GRH32>, Requires<[FeatureHighWord]>;
+def RISBLG : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>,
+ Requires<[FeatureHighWord]>;
+def RISBHG : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>,
+ Requires<[FeatureHighWord]>;
// Rotate second operand left and perform a logical operation with selected
// bits of the first operand. The CC result only describes the selected bits,
@@ -930,39 +1066,50 @@ let Defs = [CC] in {
// Comparison
//===----------------------------------------------------------------------===//
-// Signed comparisons.
+// Signed comparisons. We put these before the unsigned comparisons because
+// some of the signed forms have COMPARE AND BRANCH equivalents whereas none
+// of the unsigned forms do.
let Defs = [CC], CCValues = 0xE in {
// Comparison with a register.
- def CR : CompareRR <"c", 0x19, z_cmp, GR32, GR32>;
+ def CR : CompareRR <"c", 0x19, z_scmp, GR32, GR32>;
def CGFR : CompareRRE<"cgf", 0xB930, null_frag, GR64, GR32>;
- def CGR : CompareRRE<"cg", 0xB920, z_cmp, GR64, GR64>;
+ def CGR : CompareRRE<"cg", 0xB920, z_scmp, GR64, GR64>;
// Comparison with a signed 16-bit immediate.
- def CHI : CompareRI<"chi", 0xA7E, z_cmp, GR32, imm32sx16>;
- def CGHI : CompareRI<"cghi", 0xA7F, z_cmp, GR64, imm64sx16>;
-
- // Comparison with a signed 32-bit immediate.
- def CFI : CompareRIL<"cfi", 0xC2D, z_cmp, GR32, simm32>;
- def CGFI : CompareRIL<"cgfi", 0xC2C, z_cmp, GR64, imm64sx32>;
+ def CHI : CompareRI<"chi", 0xA7E, z_scmp, GR32, imm32sx16>;
+ def CGHI : CompareRI<"cghi", 0xA7F, z_scmp, GR64, imm64sx16>;
+
+ // Comparison with a signed 32-bit immediate. CFIMux expands to CFI or CIH,
+ // depending on the choice of register.
+ def CFIMux : CompareRIPseudo<z_scmp, GRX32, simm32>,
+ Requires<[FeatureHighWord]>;
+ def CFI : CompareRIL<"cfi", 0xC2D, z_scmp, GR32, simm32>;
+ def CIH : CompareRIL<"cih", 0xCCD, z_scmp, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+ def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
// Comparison with memory.
- defm CH : CompareRXPair<"ch", 0x49, 0xE379, z_cmp, GR32, sextloadi16, 2>;
- defm C : CompareRXPair<"c", 0x59, 0xE359, z_cmp, GR32, load, 4>;
- def CGH : CompareRXY<"cgh", 0xE334, z_cmp, GR64, sextloadi16, 2>;
- def CGF : CompareRXY<"cgf", 0xE330, z_cmp, GR64, sextloadi32, 4>;
- def CG : CompareRXY<"cg", 0xE320, z_cmp, GR64, load, 8>;
- def CHRL : CompareRILPC<"chrl", 0xC65, z_cmp, GR32, aligned_sextloadi16>;
- def CRL : CompareRILPC<"crl", 0xC6D, z_cmp, GR32, aligned_load>;
- def CGHRL : CompareRILPC<"cghrl", 0xC64, z_cmp, GR64, aligned_sextloadi16>;
- def CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_cmp, GR64, aligned_sextloadi32>;
- def CGRL : CompareRILPC<"cgrl", 0xC68, z_cmp, GR64, aligned_load>;
+ defm CH : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
+ def CMux : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ defm C : CompareRXPair<"c", 0x59, 0xE359, z_scmp, GR32, load, 4>;
+ def CHF : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ def CGH : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
+ def CGF : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
+ def CG : CompareRXY<"cg", 0xE320, z_scmp, GR64, load, 8>;
+ def CHRL : CompareRILPC<"chrl", 0xC65, z_scmp, GR32, aligned_asextloadi16>;
+ def CRL : CompareRILPC<"crl", 0xC6D, z_scmp, GR32, aligned_load>;
+ def CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
+ def CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
+ def CGRL : CompareRILPC<"cgrl", 0xC68, z_scmp, GR64, aligned_load>;
// Comparison between memory and a signed 16-bit immediate.
- def CHHSI : CompareSIL<"chhsi", 0xE554, z_cmp, sextloadi16, imm32sx16>;
- def CHSI : CompareSIL<"chsi", 0xE55C, z_cmp, load, imm32sx16>;
- def CGHSI : CompareSIL<"cghsi", 0xE558, z_cmp, load, imm64sx16>;
+ def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
+ def CHSI : CompareSIL<"chsi", 0xE55C, z_scmp, load, imm32sx16>;
+ def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
}
-defm : SXB<z_cmp, GR64, CGFR>;
+defm : SXB<z_scmp, GR64, CGFR>;
// Unsigned comparisons.
let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
@@ -971,35 +1118,79 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
def CLGFR : CompareRRE<"clgf", 0xB931, null_frag, GR64, GR32>;
def CLGR : CompareRRE<"clg", 0xB921, z_ucmp, GR64, GR64>;
- // Comparison with a signed 32-bit immediate.
+ // Comparison with an unsigned 32-bit immediate. CLFIMux expands to CLFI
+ // or CLIH, depending on the choice of register.
+ def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
+ Requires<[FeatureHighWord]>;
def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>;
+ def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GR32, uimm32>,
+ Requires<[FeatureHighWord]>;
def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
// Comparison with memory.
+ def CLMux : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+ Requires<[FeatureHighWord]>;
defm CL : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
- def CLGF : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, zextloadi32, 4>;
+ def CLHF : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+ Requires<[FeatureHighWord]>;
+ def CLGF : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
def CLG : CompareRXY<"clg", 0xE321, z_ucmp, GR64, load, 8>;
def CLHRL : CompareRILPC<"clhrl", 0xC67, z_ucmp, GR32,
- aligned_zextloadi16>;
+ aligned_azextloadi16>;
def CLRL : CompareRILPC<"clrl", 0xC6F, z_ucmp, GR32,
aligned_load>;
def CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
- aligned_zextloadi16>;
+ aligned_azextloadi16>;
def CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
- aligned_zextloadi32>;
+ aligned_azextloadi32>;
def CLGRL : CompareRILPC<"clgrl", 0xC6A, z_ucmp, GR64,
aligned_load>;
// Comparison between memory and an unsigned 8-bit immediate.
- defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, zextloadi8, imm32zx8>;
+ defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
// Comparison between memory and an unsigned 16-bit immediate.
- def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, zextloadi16, imm32zx16>;
- def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
- def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
+ def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
+ def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+ def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
}
defm : ZXB<z_ucmp, GR64, CLGFR>;
+// Memory-to-memory comparison.
+let mayLoad = 1, Defs = [CC] in
+ defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+
+// String comparison.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+ defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
+
+// Test under mask.
+let Defs = [CC] in {
+ // TMxMux expands to TM[LH]x, depending on the choice of register.
+ def TMLMux : CompareRIPseudo<z_tm_reg, GRX32, imm32ll16>,
+ Requires<[FeatureHighWord]>;
+ def TMHMux : CompareRIPseudo<z_tm_reg, GRX32, imm32lh16>,
+ Requires<[FeatureHighWord]>;
+ def TMLL : CompareRI<"tmll", 0xA71, z_tm_reg, GR32, imm32ll16>;
+ def TMLH : CompareRI<"tmlh", 0xA70, z_tm_reg, GR32, imm32lh16>;
+ def TMHL : CompareRI<"tmhl", 0xA73, z_tm_reg, GRH32, imm32ll16>;
+ def TMHH : CompareRI<"tmhh", 0xA72, z_tm_reg, GRH32, imm32lh16>;
+
+ def TMLL64 : CompareAliasRI<z_tm_reg, GR64, imm64ll16>;
+ def TMLH64 : CompareAliasRI<z_tm_reg, GR64, imm64lh16>;
+ def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
+ def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
+
+ defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+}
+
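TEST UNDER MASK gains both register forms (TM[LH][LH] plus the TM*Mux pseudos) and a memory form, distinguished in the patterns through z_tm_reg and z_tm_mem. As a reminder of what the instruction computes, here is a sketch of the 16-bit-field case using the condition-code convention I believe the architecture defines (0: selected bits all zero, 3: all one, 1/2: mixed, split on the leftmost selected bit); treat the exact CC encoding as an assumption, not a statement about the backend:

#include <cassert>
#include <cstdint>

// Illustration only: compute the CC a 16-bit TEST UNDER MASK would produce.
int testUnderMask(uint16_t field, uint16_t mask) {
  uint16_t selected = field & mask;
  if (mask == 0 || selected == 0) return 0;   // nothing selected, or all zero
  if (selected == mask) return 3;             // all selected bits are one
  // Mixed: CC1 if the leftmost selected bit is zero, CC2 if it is one.
  uint16_t leftmost = 0x8000;
  while (!(mask & leftmost)) leftmost >>= 1;
  return (field & leftmost) ? 2 : 1;
}

int main() {
  assert(testUnderMask(0x0000, 0x00ff) == 0);
  assert(testUnderMask(0x00ff, 0x00ff) == 3);
  assert(testUnderMask(0x0041, 0x00c3) == 1);  // leftmost tested bit is 0
  assert(testUnderMask(0x0081, 0x00c3) == 2);  // leftmost tested bit is 1
}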
+//===----------------------------------------------------------------------===//
+// Prefetch
+//===----------------------------------------------------------------------===//
+
+def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+
//===----------------------------------------------------------------------===//
// Atomic operations
//===----------------------------------------------------------------------===//
@@ -1024,60 +1215,60 @@ def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64<atomic_load_sub_64>;
def ATOMIC_LOADW_NR : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
def ATOMIC_LOAD_NR : AtomicLoadBinaryReg32<atomic_load_and_32>;
-def ATOMIC_LOAD_NILL32 : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
-def ATOMIC_LOAD_NILH32 : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
-def ATOMIC_LOAD_NILF32 : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
+def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
+def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
def ATOMIC_LOAD_NGR : AtomicLoadBinaryReg64<atomic_load_and_64>;
-def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
-def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
-def ATOMIC_LOAD_NIHL : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
-def ATOMIC_LOAD_NIHH : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
-def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
-def ATOMIC_LOAD_NIHF : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
+def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
+def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
+def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
+def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
+def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
+def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
def ATOMIC_LOADW_OR : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
def ATOMIC_LOADW_OILH : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
def ATOMIC_LOAD_OR : AtomicLoadBinaryReg32<atomic_load_or_32>;
-def ATOMIC_LOAD_OILL32 : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
-def ATOMIC_LOAD_OILH32 : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
-def ATOMIC_LOAD_OILF32 : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
def ATOMIC_LOAD_OGR : AtomicLoadBinaryReg64<atomic_load_or_64>;
-def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
-def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
-def ATOMIC_LOAD_OIHL : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
-def ATOMIC_LOAD_OIHH : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
-def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
-def ATOMIC_LOAD_OIHF : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
def ATOMIC_LOADW_XR : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
def ATOMIC_LOADW_XILF : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
def ATOMIC_LOAD_XR : AtomicLoadBinaryReg32<atomic_load_xor_32>;
-def ATOMIC_LOAD_XILF32 : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
def ATOMIC_LOAD_XGR : AtomicLoadBinaryReg64<atomic_load_xor_64>;
-def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
-def ATOMIC_LOAD_XIHF : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
def ATOMIC_LOADW_NRi : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
def ATOMIC_LOADW_NILHi : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
imm32lh16c>;
def ATOMIC_LOAD_NRi : AtomicLoadBinaryReg32<atomic_load_nand_32>;
-def ATOMIC_LOAD_NILL32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+def ATOMIC_LOAD_NILLi : AtomicLoadBinaryImm32<atomic_load_nand_32,
imm32ll16c>;
-def ATOMIC_LOAD_NILH32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+def ATOMIC_LOAD_NILHi : AtomicLoadBinaryImm32<atomic_load_nand_32,
imm32lh16c>;
-def ATOMIC_LOAD_NILF32i : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NILFi : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
def ATOMIC_LOAD_NGRi : AtomicLoadBinaryReg64<atomic_load_nand_64>;
-def ATOMIC_LOAD_NILLi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
imm64ll16c>;
-def ATOMIC_LOAD_NILHi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
imm64lh16c>;
-def ATOMIC_LOAD_NIHLi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
imm64hl16c>;
-def ATOMIC_LOAD_NIHHi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
imm64hh16c>;
-def ATOMIC_LOAD_NILFi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
imm64lf32c>;
-def ATOMIC_LOAD_NIHFi : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
imm64hf32c>;
def ATOMIC_LOADW_MIN : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
@@ -1119,6 +1310,10 @@ let Defs = [CC] in {
// Miscellaneous Instructions.
//===----------------------------------------------------------------------===//
+// Extract CC into bits 29 and 28 of a register.
+let Uses = [CC] in
+ def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;
+
// Read a 32-bit access register into a GR32. As with all GR32 operations,
// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
// when a 64-bit address is stored in a pair of access registers.
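The IPM definition added above says the condition code lands in bits 29 and 28 of the result. Numerically that is just cc << 28 in the 32-bit value (the program-mask bits sit below it and are ignored here); a tiny sketch:

#include <cassert>
#include <cstdint>

uint32_t ipmValue(unsigned cc) { return uint32_t(cc & 3) << 28; }  // bits 29-28
unsigned ccFromIPM(uint32_t v) { return (v >> 28) & 3; }

int main() {
  for (unsigned cc = 0; cc < 4; ++cc)
    assert(ccFromIPM(ipmValue(cc)) == cc);
}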
@@ -1134,19 +1329,11 @@ let Defs = [CC] in {
def FLOGR : UnaryRRE<"flog", 0xB983, null_frag, GR128, GR64>;
}
def : Pat<(ctlz GR64:$src),
- (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_high)>;
+ (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
def : Pat<(i64 (anyext GR32:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit)>;
-
-// There are no 32-bit equivalents of LLILL and LLILH, so use a full
-// 64-bit move followed by a subreg. This preserves the invariant that
-// all GR32 operations only modify the low 32 bits.
-def : Pat<(i32 imm32ll16:$src),
- (EXTRACT_SUBREG (LLILL (LL16 imm:$src)), subreg_32bit)>;
-def : Pat<(i32 imm32lh16:$src),
- (EXTRACT_SUBREG (LLILH (LH16 imm:$src)), subreg_32bit)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
// Extend GR32s and GR64s to GR128s.
let usesCustomInserter = 1 in {
@@ -1155,6 +1342,10 @@ let usesCustomInserter = 1 in {
def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
}
+// Search a block of memory for a character.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+ defm SRST : StringRRE<"srst", 0xB25E, z_search_string>;
+
//===----------------------------------------------------------------------===//
// Peepholes.
//===----------------------------------------------------------------------===//
@@ -1163,14 +1354,14 @@ let usesCustomInserter = 1 in {
defm : ZXB<add, GR64, ALGFR>;
def : Pat<(add GR64:$src1, imm64zx32:$src2),
(ALGFI GR64:$src1, imm64zx32:$src2)>;
-def : Pat<(add GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+def : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
(ALGF GR64:$src1, bdxaddr20only:$addr)>;
// Use SL* for GR64 subtractions of unsigned 32-bit values.
defm : ZXB<sub, GR64, SLGFR>;
def : Pat<(add GR64:$src1, imm64zx32n:$src2),
(SLGFI GR64:$src1, imm64zx32n:$src2)>;
-def : Pat<(sub GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+def : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
(SLGF GR64:$src1, bdxaddr20only:$addr)>;
// Optimize sign-extended 1/0 selects to -1/0 selects. This is important
@@ -1184,3 +1375,19 @@ def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid,
(i32 63)),
(i32 63)),
(Select64 (LGHI -1), (LGHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>;
+
+// Peepholes for turning scalar operations into block operations.
+defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 2>;
+defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 4>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
+ OCSequence, XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 2>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 4>;
+defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
+ XCSequence, 8>;
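The BlockLoadStore instantiations above wire each load width (1, 2, 4 or 8 bytes, with an i32 or i64 intermediate type) to MVC for plain copies and to NC/OC/XC for the and/or/xor read-modify-write forms. For the plain-copy half, the shape being matched is a load whose only use is a same-sized store; a rough source-level picture, assuming the storeLoadCanUseMVC-style predicate is satisfied:

#include <cstdint>

// The loaded value is only stored again, so a single MVC of length 4
// can replace the load/store pair.
void copy_word(uint32_t *dst, const uint32_t *src) { *dst = *src; }

int main() {
  uint32_t in = 42, out = 0;
  copy_word(&out, &in);
  return out == 42 ? 0 : 1;
}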
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 114f74e..ba027d4 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -225,11 +225,13 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
Terminator.ExtraRelaxSize = 6;
break;
case SystemZ::CRJ:
- // Relaxes to a CR/BRCL sequence, which is 2 bytes longer.
+ case SystemZ::CLRJ:
+ // Relaxes to a C(L)R/BRCL sequence, which is 2 bytes longer.
Terminator.ExtraRelaxSize = 2;
break;
case SystemZ::CGRJ:
- // Relaxes to a CGR/BRCL sequence, which is 4 bytes longer.
+ case SystemZ::CLGRJ:
+ // Relaxes to a C(L)GR/BRCL sequence, which is 4 bytes longer.
Terminator.ExtraRelaxSize = 4;
break;
case SystemZ::CIJ:
@@ -237,6 +239,11 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
// Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
Terminator.ExtraRelaxSize = 4;
break;
+ case SystemZ::CLIJ:
+ case SystemZ::CLGIJ:
+ // Relaxes to a CL(G)FI/BRCL sequence, which is 6 bytes longer.
+ Terminator.ExtraRelaxSize = 6;
+ break;
default:
llvm_unreachable("Unrecognized branch instruction");
}
@@ -401,6 +408,18 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
case SystemZ::CGIJ:
splitCompareBranch(Branch, SystemZ::CGHI);
break;
+ case SystemZ::CLRJ:
+ splitCompareBranch(Branch, SystemZ::CLR);
+ break;
+ case SystemZ::CLGRJ:
+ splitCompareBranch(Branch, SystemZ::CLGR);
+ break;
+ case SystemZ::CLIJ:
+ splitCompareBranch(Branch, SystemZ::CLFI);
+ break;
+ case SystemZ::CLGIJ:
+ splitCompareBranch(Branch, SystemZ::CLGFI);
+ break;
default:
llvm_unreachable("Unrecognized branch");
}
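The ExtraRelaxSize values used here are easy to sanity-check: a fused compare-and-branch is 6 bytes, and relaxing it produces a standalone compare followed by a 6-byte BRCL, so the growth equals the size of the standalone compare (RR = 2, RRE = 4, RI = 4, RIL = 6 bytes, as I read the z/Architecture encodings). A small check of that arithmetic:

#include <cassert>

constexpr int kFusedCompareBranchSize = 6;  // C(L)(G)RJ / C(L)(G)IJ
constexpr int kBRCLSize = 6;

// Growth when a fused compare-and-branch is split into compare + BRCL.
constexpr int extraRelaxSize(int compareSize) {
  return compareSize + kBRCLSize - kFusedCompareBranchSize;
}

int main() {
  assert(extraRelaxSize(2) == 2);  // CRJ/CLRJ    -> CR/CLR     + BRCL
  assert(extraRelaxSize(4) == 4);  // CGRJ/CLGRJ  -> CGR/CLGR   + BRCL
  assert(extraRelaxSize(4) == 4);  // CIJ/CGIJ    -> CHI/CGHI   + BRCL
  assert(extraRelaxSize(6) == 6);  // CLIJ/CLGIJ  -> CLFI/CLGFI + BRCL
}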
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 432a0d3..ff9a6c0 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -15,15 +15,6 @@
using namespace llvm;
-// If Opcode is an interprocedural reference that can be shortened,
-// return the short form, otherwise return 0.
-static unsigned getShortenedInstr(unsigned Opcode) {
- switch (Opcode) {
- case SystemZ::BRASL: return SystemZ::BRAS;
- }
- return Opcode;
-}
-
// Return the VK_* enumeration for MachineOperand target flags Flags.
static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
@@ -35,70 +26,71 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
}
-SystemZMCInstLower::SystemZMCInstLower(Mangler *mang, MCContext &ctx,
+SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
SystemZAsmPrinter &asmprinter)
- : Mang(mang), Ctx(ctx), AsmPrinter(asmprinter) {}
+ : Ctx(ctx), AsmPrinter(asmprinter) {}
-MCOperand SystemZMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
- const MCSymbol *Symbol,
- int64_t Offset) const {
- MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
- const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
- if (Offset) {
- const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
- Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+const MCExpr *
+SystemZMCInstLower::getExpr(const MachineOperand &MO,
+ MCSymbolRefExpr::VariantKind Kind) const {
+ const MCSymbol *Symbol;
+ bool HasOffset = true;
+ switch (MO.getType()) {
+ case MachineOperand::MO_MachineBasicBlock:
+ Symbol = MO.getMBB()->getSymbol();
+ HasOffset = false;
+ break;
+
+ case MachineOperand::MO_GlobalAddress:
+ Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+ break;
+
+ case MachineOperand::MO_ExternalSymbol:
+ Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+ break;
+
+ case MachineOperand::MO_JumpTableIndex:
+ Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+ HasOffset = false;
+ break;
+
+ case MachineOperand::MO_ConstantPoolIndex:
+ Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+ break;
+
+ case MachineOperand::MO_BlockAddress:
+ Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+ break;
+
+ default:
+ llvm_unreachable("unknown operand type");
}
- return MCOperand::CreateExpr(Expr);
+ const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
+ if (HasOffset)
+ if (int64_t Offset = MO.getOffset()) {
+ const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+ Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+ }
+ return Expr;
}
MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
switch (MO.getType()) {
- default:
- llvm_unreachable("unknown operand type");
-
case MachineOperand::MO_Register:
return MCOperand::CreateReg(MO.getReg());
case MachineOperand::MO_Immediate:
return MCOperand::CreateImm(MO.getImm());
- case MachineOperand::MO_MachineBasicBlock:
- return lowerSymbolOperand(MO, MO.getMBB()->getSymbol(),
- /* MO has no offset field */0);
-
- case MachineOperand::MO_GlobalAddress:
- return lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()),
- MO.getOffset());
-
- case MachineOperand::MO_ExternalSymbol: {
- StringRef Name = MO.getSymbolName();
- return lowerSymbolOperand(MO, AsmPrinter.GetExternalSymbolSymbol(Name),
- MO.getOffset());
- }
-
- case MachineOperand::MO_JumpTableIndex:
- return lowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()),
- /* MO has no offset field */0);
-
- case MachineOperand::MO_ConstantPoolIndex:
- return lowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()),
- MO.getOffset());
-
- case MachineOperand::MO_BlockAddress: {
- const BlockAddress *BA = MO.getBlockAddress();
- return lowerSymbolOperand(MO, AsmPrinter.GetBlockAddressSymbol(BA),
- MO.getOffset());
+ default: {
+ MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+ return MCOperand::CreateExpr(getExpr(MO, Kind));
}
}
}
void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
- unsigned Opcode = MI->getOpcode();
- // When emitting binary code, start with the shortest form of an instruction
- // and then relax it where necessary.
- if (!AsmPrinter.OutStreamer.hasRawTextSupport())
- Opcode = getShortenedInstr(Opcode);
- OutMI.setOpcode(Opcode);
+ OutMI.setOpcode(MI->getOpcode());
for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI->getOperand(I);
// Ignore all implicit register operands.
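The rewritten lowering funnels every symbolic operand through getExpr, which picks the symbol for the operand kind and, for kinds that carry an offset, wraps the reference in an add with that constant. A standalone model of that composition, using plain strings instead of MCExpr nodes; the names here are illustrative only:

#include <cassert>
#include <cstdint>
#include <string>

// Model: a symbol reference, optionally followed by "+offset".  Basic-block
// and jump-table operands have no offset field, so hasOffset is false there.
std::string lowerSymbolicOperand(const std::string &symbol,
                                 bool hasOffset, int64_t offset) {
  std::string expr = symbol;
  if (hasOffset && offset != 0)
    expr += "+" + std::to_string(offset);
  return expr;
}

int main() {
  assert(lowerSymbolicOperand("foo", true, 8) == "foo+8");
  assert(lowerSymbolicOperand("foo", true, 0) == "foo");
  assert(lowerSymbolicOperand(".LBB0_1", false, 0) == ".LBB0_1");
}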
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
index db5bdb0..f6d5ac8 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -10,27 +10,24 @@
#ifndef LLVM_SYSTEMZMCINSTLOWER_H
#define LLVM_SYSTEMZMCINSTLOWER_H
+#include "llvm/MC/MCExpr.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
-class MCContext;
class MCInst;
class MCOperand;
-class MCSymbol;
class MachineInstr;
class MachineOperand;
class Mangler;
class SystemZAsmPrinter;
class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
- Mangler *Mang;
MCContext &Ctx;
SystemZAsmPrinter &AsmPrinter;
public:
- SystemZMCInstLower(Mangler *mang, MCContext &ctx,
- SystemZAsmPrinter &asmPrinter);
+ SystemZMCInstLower(MCContext &ctx, SystemZAsmPrinter &asmPrinter);
// Lower MachineInstr MI to MCInst OutMI.
void lower(const MachineInstr *MI, MCInst &OutMI) const;
@@ -38,9 +35,9 @@ public:
// Return an MCOperand for MO.
MCOperand lowerOperand(const MachineOperand& MO) const;
- // Return an MCOperand for MO, given that it equals Symbol + Offset.
- MCOperand lowerSymbolOperand(const MachineOperand &MO,
- const MCSymbol *Symbol, int64_t Offset) const;
+ // Return an MCExpr for symbolic operand MO with variant kind Kind.
+ const MCExpr *getExpr(const MachineOperand &MO,
+ MCSymbolRefExpr::VariantKind Kind) const;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
new file mode 100644
index 0000000..00572d0
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -0,0 +1,17 @@
+//== SystemZMachineFunctionInfo.cpp - SystemZ machine function info -*- C++ -*-=//
+//== SystemZMachineFunctionInfo.cpp - SystemZ machine function info -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+
+using namespace llvm;
+
+
+// pin vtable to this file
+void SystemZMachineFunctionInfo::anchor() {}
+
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 69c2691..845291f 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -15,6 +15,7 @@
namespace llvm {
class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
unsigned LowSavedGPR;
unsigned HighSavedGPR;
unsigned VarArgsFirstGPR;
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 9d79439..3ad146c 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -46,7 +46,8 @@ class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
// address with address size VT. SELF is the name of the operand and
// ASMOP is the associated asm operand.
class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
- : ComplexPattern<vt, 1, "selectPCRelAddress", [z_pcrel_wrapper]>,
+ : ComplexPattern<vt, 1, "selectPCRelAddress",
+ [z_pcrel_wrapper, z_pcrel_offset]>,
PCRelOperand<vt, asmop> {
let MIOperandInfo = (ops !cast<Operand>(self));
}
@@ -219,11 +220,6 @@ def uimm8 : Immediate<i8, [{}], UIMM8, "U8Imm">;
// i32 immediates
//===----------------------------------------------------------------------===//
-// Immediates for 8-bit lengths.
-def imm32len8 : Immediate<i32, [{
- return isUInt<8>(N->getZExtValue() - 1);
-}], NOOP_SDNodeXForm, "U32Imm">;
-
// Immediates for the lower and upper 16 bits of an i32, with the other
// bits of the i32 being zero.
def imm32ll16 : Immediate<i32, [{
@@ -338,6 +334,10 @@ def imm64sx8 : Immediate<i64, [{
return isInt<8>(N->getSExtValue());
}], SIMM8, "S8Imm">;
+def imm64zx8 : Immediate<i64, [{
+ return isUInt<8>(N->getSExtValue());
+}], UIMM8, "U8Imm">;
+
def imm64sx16 : Immediate<i64, [{
return isInt<16>(N->getSExtValue());
}], SIMM16, "S16Imm">;
@@ -358,7 +358,7 @@ def imm64zx32n : Immediate<i64, [{
return isUInt<32>(-N->getSExtValue());
}], NEGIMM32, "U32Imm">;
-def imm64 : ImmLeaf<i64, [{}]>;
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
//===----------------------------------------------------------------------===//
// Floating-point immediates
@@ -396,19 +396,6 @@ def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
let DecoderMethod = "decodePC32DBLOperand";
}
-// A PC-relative offset of a global value when the value is used as a
-// call target. The offset is sign-extended and multiplied by 2.
-def pcrel16call : PCRelAddress<i64, "pcrel16call", PCRel16> {
- let PrintMethod = "printCallOperand";
- let EncoderMethod = "getPLT16DBLEncoding";
- let DecoderMethod = "decodePC16DBLOperand";
-}
-def pcrel32call : PCRelAddress<i64, "pcrel32call", PCRel32> {
- let PrintMethod = "printCallOperand";
- let EncoderMethod = "getPLT32DBLEncoding";
- let DecoderMethod = "decodePC32DBLOperand";
-}
-
//===----------------------------------------------------------------------===//
// Addressing modes
//===----------------------------------------------------------------------===//
@@ -435,6 +422,7 @@ def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">;
// <type> is one of:
// shift : base + displacement (32-bit)
// bdaddr : base + displacement
+// mviaddr : like bdaddr, but reject cases with a natural index
// bdxaddr : base + displacement + index
// laaddr : like bdxaddr, but used for Load Address operations
// dynalloc : base + displacement + index + ADJDYNALLOC
@@ -460,6 +448,8 @@ def bdaddr12only : BDMode <"BDAddr", "64", "12", "Only">;
def bdaddr12pair : BDMode <"BDAddr", "64", "12", "Pair">;
def bdaddr20only : BDMode <"BDAddr", "64", "20", "Only">;
def bdaddr20pair : BDMode <"BDAddr", "64", "20", "Pair">;
+def mviaddr12pair : BDMode <"MVIAddr", "64", "12", "Pair">;
+def mviaddr20pair : BDMode <"MVIAddr", "64", "20", "Pair">;
def bdxaddr12only : BDXMode<"BDXAddr", "64", "12", "Only">;
def bdxaddr12pair : BDXMode<"BDXAddr", "64", "12", "Pair">;
def bdxaddr20only : BDXMode<"BDXAddr", "64", "20", "Only">;
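The new mviaddr12pair/mviaddr20pair modes exist because MVI and MVIY only encode a base register plus a displacement, so an address that naturally wants an index register has to be kept away from them. A hedged source-level illustration; the fallback instruction choice is my assumption, not something the patch states:

#include <cstdint>

void store_flag(uint8_t *p) {
  *p = 1;          // base + displacement only: MVI/MVIY can encode this
}
void store_flag_at(uint8_t *p, long i) {
  p[i] = 1;        // wants base + index + displacement, so the value has to
                   // go through a register store instead (STC, for example)
}

int main() {
  uint8_t buf[4] = {0, 0, 0, 0};
  store_flag(buf);
  store_flag_at(buf, 2);
  return buf[0] + buf[2] == 2 ? 0 : 1;
}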
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 6a3af2b..31cabaa 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -15,6 +15,9 @@ def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
def SDT_ZCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ZICmp : SDTypeProfile<0, 3,
+ [SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
def SDT_ZBRCCMask : SDTypeProfile<0, 3,
[SDTCisVT<0, i8>,
SDTCisVT<1, i8>,
@@ -27,6 +30,10 @@ def SDT_ZSelectCCMask : SDTypeProfile<1, 4,
def SDT_ZWrapPtr : SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
+def SDT_ZWrapOffset : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisPtrTy<0>]>;
def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
def SDT_ZExtractAccess : SDTypeProfile<1, 1,
[SDTCisVT<0, i32>,
@@ -54,10 +61,24 @@ def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>,
SDTCisVT<6, i32>]>;
-def SDT_ZCopy : SDTypeProfile<0, 3,
+def SDT_ZMemMemLength : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop : SDTypeProfile<0, 4,
+ [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>,
+ SDTCisVT<3, i64>]>;
+def SDT_ZString : SDTypeProfile<1, 3,
+ [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisPtrTy<2>,
+ SDTCisVT<3, i32>]>;
+def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZPrefetch : SDTypeProfile<0, 2,
+ [SDTCisVT<0, i8>,
+ SDTCisPtrTy<1>]>;
//===----------------------------------------------------------------------===//
// Node definitions
@@ -76,9 +97,15 @@ def z_retflag : SDNode<"SystemZISD::RET_FLAG", SDTNone,
def z_call : SDNode<"SystemZISD::CALL", SDT_ZCall,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
+def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+ SDNPVariadic]>;
def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
-def z_cmp : SDNode<"SystemZISD::CMP", SDT_ZCmp, [SDNPOutGlue]>;
-def z_ucmp : SDNode<"SystemZISD::UCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET",
+ SDT_ZWrapOffset, []>;
+def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
+def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
def z_br_ccmask : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
[SDNPHasChain, SDNPInGlue]>;
def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
@@ -109,13 +136,56 @@ def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
-def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZCopy,
+def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc_loop : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc_loop : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc_loop : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZString,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+ [SDNPInGlue]>;
+def z_prefetch : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
+ [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+ SDNPMemOperand]>;
//===----------------------------------------------------------------------===//
// Pattern fragments
//===----------------------------------------------------------------------===//
+// Signed and unsigned comparisons.
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+ unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ return Type != SystemZICMP::UnsignedOnly;
+}]>;
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+ unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ return Type != SystemZICMP::SignedOnly;
+}]>;
+
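The ICMP node now carries a third operand describing which comparison instructions are acceptable, and z_scmp/z_ucmp are just filters over it: a comparison may be matched by a signed instruction unless it is UnsignedOnly, and by an unsigned instruction unless it is SignedOnly. A small model of that selection; the enum mirrors the SystemZICMP values the predicates test, with the name of the unrestricted value (Any) being an assumption:

#include <cassert>

enum class ICmpType { Any, UnsignedOnly, SignedOnly };

bool matchesSignedCompare(ICmpType t)   { return t != ICmpType::UnsignedOnly; }
bool matchesUnsignedCompare(ICmpType t) { return t != ICmpType::SignedOnly; }

int main() {
  assert(matchesSignedCompare(ICmpType::Any));     // either form works
  assert(matchesUnsignedCompare(ICmpType::Any));
  assert(!matchesSignedCompare(ICmpType::UnsignedOnly));
  assert(!matchesUnsignedCompare(ICmpType::SignedOnly));
}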
+// Register- and memory-based TEST UNDER MASK.
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>;
+def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>;
+
// Register sign-extend operations. Sub-32-bit values are represented as i32s.
def sext8 : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
@@ -130,6 +200,36 @@ def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>;
def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>;
+// Extending loads in which the extension type can be signed.
+def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+ unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+ return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+}]>;
+def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type can be unsigned.
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+ unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+ return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
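asextload/azextload capture the fact that an any-extending load can always be served by a sign- or zero-extending instruction, so both ISD::EXTLOAD and the matching explicit extension are acceptable. The predicate logic, restated as a standalone check with an enum standing in for ISD's load extension types:

#include <cassert>

enum class ExtType { NonExt, AnyExt, SignExt, ZeroExt };  // models ISD::*EXTLOAD

// A sign-extending instruction can serve any-extending loads too...
bool okForSignExtendingInsn(ExtType t) {
  return t == ExtType::AnyExt || t == ExtType::SignExt;
}
// ...and likewise for a zero-extending instruction.
bool okForZeroExtendingInsn(ExtType t) {
  return t == ExtType::AnyExt || t == ExtType::ZeroExt;
}

int main() {
  assert(okForSignExtendingInsn(ExtType::AnyExt));
  assert(okForZeroExtendingInsn(ExtType::AnyExt));
  assert(!okForSignExtendingInsn(ExtType::ZeroExt));
  assert(!okForZeroExtendingInsn(ExtType::SignExt));
}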
// Extending loads in which the extension type doesn't matter.
def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
@@ -150,11 +250,11 @@ class AlignedLoad<SDPatternOperator load>
LoadSDNode *Load = cast<LoadSDNode>(N);
return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
}]>;
-def aligned_load : AlignedLoad<load>;
-def aligned_sextloadi16 : AlignedLoad<sextloadi16>;
-def aligned_sextloadi32 : AlignedLoad<sextloadi32>;
-def aligned_zextloadi16 : AlignedLoad<zextloadi16>;
-def aligned_zextloadi32 : AlignedLoad<zextloadi32>;
+def aligned_load : AlignedLoad<load>;
+def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
+def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
+def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
+def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
// Aligned stores.
class AlignedStore<SDPatternOperator store>
@@ -189,6 +289,31 @@ def nonvolatile_truncstorei8 : NonvolatileStore<truncstorei8>;
def nonvolatile_truncstorei16 : NonvolatileStore<truncstorei16>;
def nonvolatile_truncstorei32 : NonvolatileStore<truncstorei32>;
+// A store of a load that can be implemented using MVC.
+def mvc_store : PatFrag<(ops node:$value, node:$addr),
+ (unindexedstore node:$value, node:$addr),
+ [{ return storeLoadCanUseMVC(N); }]>;
+
+// Binary read-modify-write operations on memory in which the other
+// operand is also memory and for which block operations like NC can
+// be used. There are two patterns for each operator, depending on
+// which operand contains the "other" load.
+multiclass block_op<SDPatternOperator operator> {
+ def "1" : PatFrag<(ops node:$value, node:$addr),
+ (unindexedstore (operator node:$value,
+ (unindexedload node:$addr)),
+ node:$addr),
+ [{ return storeLoadCanUseBlockBinary(N, 0); }]>;
+ def "2" : PatFrag<(ops node:$value, node:$addr),
+ (unindexedstore (operator (unindexedload node:$addr),
+ node:$value),
+ node:$addr),
+ [{ return storeLoadCanUseBlockBinary(N, 1); }]>;
+}
+defm block_and : block_op<and>;
+defm block_or : block_op<or>;
+defm block_xor : block_op<xor>;
+
// Insertions.
def inserti8 : PatFrag<(ops node:$src1, node:$src2),
(or (and node:$src1, -256), node:$src2)>;
@@ -221,6 +346,16 @@ def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
APInt::getLowBitsSet(BitWidth, 8));
}]>;
+// Integer absolute, matching the canonical form generated by DAGCombiner.
+def z_iabs32 : PatFrag<(ops node:$src),
+ (xor (add node:$src, (sra node:$src, (i32 31))),
+ (sra node:$src, (i32 31)))>;
+def z_iabs64 : PatFrag<(ops node:$src),
+ (xor (add node:$src, (sra node:$src, (i32 63))),
+ (sra node:$src, (i32 63)))>;
+def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
+def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+
// Fused multiply-add and multiply-subtract, but with the order of the
// operands matching SystemZ's MA and MS instructions.
def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
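z_iabs32/z_iabs64 match the form DAGCombiner canonicalizes integer absolute value into: broadcast the sign across the word, add it, then xor it back out. A quick check of the identity for the 32-bit case; the unsigned casts only keep the sketch free of signed-overflow undefined behaviour, and as with the hardware pattern INT_MIN maps to itself:

#include <cassert>
#include <cstdint>

int32_t iabs32(int32_t x) {
  int32_t sign = x >> 31;                      // 0 for x >= 0, -1 for x < 0
  uint32_t t = uint32_t(x) + uint32_t(sign);   // x + sign, wrapping
  return int32_t(t ^ uint32_t(sign));          // (x + sign) ^ sign
}

int main() {
  assert(iabs32(5) == 5);
  assert(iabs32(-5) == 5);
  assert(iabs32(0) == 0);
}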
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index c442ae0..7706351 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -13,7 +13,7 @@ multiclass SXU<SDPatternOperator operator, Instruction insn> {
def : Pat<(operator (sext (i32 GR32:$src))),
(insn GR32:$src)>;
def : Pat<(operator (sext_inreg GR64:$src, i32)),
- (insn (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+ (insn (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
}
// Record that INSN performs a 64-bit version of binary operator OPERATOR
@@ -24,7 +24,7 @@ multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
def : Pat<(operator cls:$src1, (sext GR32:$src2)),
(insn cls:$src1, GR32:$src2)>;
def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
- (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+ (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
}
// Like SXB, but for zero extension.
@@ -33,7 +33,7 @@ multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
def : Pat<(operator cls:$src1, (zext GR32:$src2)),
(insn cls:$src1, GR32:$src2)>;
def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
- (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+ (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
}
// Record that INSN performs a binary read-modify-write operation,
@@ -66,22 +66,87 @@ multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
(insn cls:$src1, mode:$src2)>;
}
-// Use MVC instruction INSN for a load of type LOAD followed by a store
-// of type STORE. VT is the type of the intermediate register and LENGTH
-// is the number of bytes to copy (which may be smaller than VT).
-multiclass MVCLoadStore<SDPatternOperator load, SDPatternOperator store,
- ValueType vt, Instruction insn, bits<5> length> {
- def Pat : PatFrag<(ops node:$dest, node:$src),
- (store (vt (load node:$src)), node:$dest),
- [{ return storeLoadCanUseMVC(N); }]>;
+// INSN stores the low 32 bits of a GPR to a memory with addressing mode MODE.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64<Instruction insn, SDPatternOperator operator,
+ AddressingMode mode>
+ : Pat<(operator GR64:$R1, mode:$XBD2),
+ (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), mode:$XBD2)>;
- def : Pat<(!cast<SDPatternOperator>(NAME##"Pat") bdaddr12only:$dest,
- bdaddr12only:$src),
+// INSN and INSNY are an RX/RXY pair of instructions that store the low
+// 32 bits of a GPR to memory. Record that they are equivalent to using
+// OPERATOR to store a GR64.
+multiclass StoreGR64Pair<Instruction insn, Instruction insny,
+ SDPatternOperator operator> {
+ def : StoreGR64<insn, operator, bdxaddr12pair>;
+ def : StoreGR64<insny, operator, bdxaddr20pair>;
+}
+
+// INSN stores the low 32 bits of a GPR using PC-relative addressing.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64PC<Instruction insn, SDPatternOperator operator>
+ : Pat<(operator GR64:$R1, pcrel32:$XBD2),
+ (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), pcrel32:$XBD2)> {
+ // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+ // However, BDXs have two extra operands and are therefore 6 units more
+ // complex.
+ let AddedComplexity = 7;
+}
+
+// INSN and INSNINV conditionally store the low 32 bits of a GPR to memory,
+// with INSN storing when the condition is true and INSNINV storing when the
+// condition is false. Record that they are equivalent to a LOAD/select/STORE
+// sequence for GR64s.
+multiclass CondStores64<Instruction insn, Instruction insninv,
+ SDPatternOperator store, SDPatternOperator load,
+ AddressingMode mode> {
+ def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
+ uimm8zx4:$valid, uimm8zx4:$cc),
+ mode:$addr),
+ (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+ uimm8zx4:$valid, uimm8zx4:$cc)>;
+ def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
+ uimm8zx4:$valid, uimm8zx4:$cc),
+ mode:$addr),
+ (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+ uimm8zx4:$valid, uimm8zx4:$cc)>;
+}
+
+// Try to use MVC instruction INSN for a load of type LOAD followed by a store
+// of the same size. VT is the type of the intermediate (legalized) value and
+// LENGTH is the number of bytes loaded by LOAD.
+multiclass MVCLoadStore<SDPatternOperator load, ValueType vt, Instruction insn,
+ bits<5> length> {
+ def : Pat<(mvc_store (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
(insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
}
+// Use NC-like instruction INSN for block_op operation OPERATOR.
+// The other operand is a load of type LOAD, which accesses LENGTH bytes.
+// VT is the intermediate legalized type in which the binary operation
+// is actually done.
+multiclass BinaryLoadStore<SDPatternOperator operator, SDPatternOperator load,
+ ValueType vt, Instruction insn, bits<5> length> {
+ def : Pat<(operator (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+ (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// A convenient way of generating all block peepholes for a particular
+// LOAD/VT/LENGTH combination.
+multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
+ Instruction mvc, Instruction nc, Instruction oc,
+ Instruction xc, bits<5> length> {
+ defm : MVCLoadStore<load, vt, mvc, length>;
+ defm : BinaryLoadStore<block_and1, load, vt, nc, length>;
+ defm : BinaryLoadStore<block_and2, load, vt, nc, length>;
+ defm : BinaryLoadStore<block_or1, load, vt, oc, length>;
+ defm : BinaryLoadStore<block_or2, load, vt, oc, length>;
+ defm : BinaryLoadStore<block_xor1, load, vt, xc, length>;
+ defm : BinaryLoadStore<block_xor2, load, vt, xc, length>;
+}
+
// Record that INSN is a LOAD AND TEST that can be used to compare
// registers in CLS against zero. The instruction has separate R1 and R2
// operands, but they must be the same when the instruction is used like this.
class CompareZeroFP<Instruction insn, RegisterOperand cls>
- : Pat<(z_cmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
+ : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
index 7e14aa7..f241fb0 100644
--- a/lib/Target/SystemZ/SystemZProcessors.td
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -31,8 +31,16 @@ def FeatureHighWord : SystemZFeature<
"Assume that the high-word facility is installed"
>;
-def : Processor<"z10", NoItineraries, []>;
-def : Processor<"z196", NoItineraries,
- [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord]>;
+def FeatureFPExtension : SystemZFeature<
+ "fp-extension", "FPExtension",
+ "Assume that the floating-point extension facility is installed"
+>;
+
+def : Processor<"generic", NoItineraries, []>;
+def : Processor<"z10", NoItineraries, []>;
+def : Processor<"z196", NoItineraries,
+ [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+ FeatureFPExtension]>;
def : Processor<"zEC12", NoItineraries,
- [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord]>;
+ [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+ FeatureFPExtension]>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 8ce6d6a..b61ae88 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -42,13 +42,15 @@ SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (TFI->hasFP(MF)) {
// R11D is the frame pointer. Reserve all aliases.
Reserved.set(SystemZ::R11D);
- Reserved.set(SystemZ::R11W);
+ Reserved.set(SystemZ::R11L);
+ Reserved.set(SystemZ::R11H);
Reserved.set(SystemZ::R10Q);
}
// R15D is the stack pointer. Reserve all aliases.
Reserved.set(SystemZ::R15D);
- Reserved.set(SystemZ::R15W);
+ Reserved.set(SystemZ::R15L);
+ Reserved.set(SystemZ::R15H);
Reserved.set(SystemZ::R14Q);
return Reserved;
}
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index c447e4d..13f45fa 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -22,10 +22,10 @@ namespace SystemZ {
// Return the subreg to use for referring to the even and odd registers
// in a GR128 pair. Is32Bit says whether we want a GR32 or GR64.
inline unsigned even128(bool Is32bit) {
- return Is32bit ? subreg_32bit : subreg_high;
+ return Is32bit ? subreg_hl32 : subreg_h64;
}
inline unsigned odd128(bool Is32bit) {
- return Is32bit ? subreg_low32 : subreg_low;
+ return Is32bit ? subreg_l32 : subreg_l64;
}
}
@@ -48,6 +48,10 @@ public:
LLVM_OVERRIDE {
return true;
}
+ virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const
+ LLVM_OVERRIDE {
+ return true;
+ }
virtual const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0)
const LLVM_OVERRIDE;
virtual BitVector getReservedRegs(const MachineFunction &MF)
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index ffffe72..93d7c83 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -21,11 +21,12 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
}
let Namespace = "SystemZ" in {
-def subreg_32bit : SubRegIndex<32>; // could also be named "subreg_high32"
-// Indices are used in a variety of ways, so don't set an Offset.
-def subreg_high : SubRegIndex<64, -1>;
-def subreg_low : SubRegIndex<64, -1>;
-def subreg_low32 : ComposedSubRegIndex<subreg_low, subreg_32bit>;
+def subreg_l32 : SubRegIndex<32, 0>; // Also acts as subreg_ll32.
+def subreg_h32 : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+def subreg_l64 : SubRegIndex<64, 0>;
+def subreg_h64 : SubRegIndex<64, 64>;
+def subreg_hh32 : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+def subreg_hl32 : ComposedSubRegIndex<subreg_h64, subreg_l32>;
}
// Define a register class that contains values of type TYPE and an
@@ -55,36 +56,49 @@ class GPR32<bits<16> num, string n> : SystemZReg<n> {
}
// One of the 16 64-bit general-purpose registers.
-class GPR64<bits<16> num, string n, GPR32 low>
- : SystemZRegWithSubregs<n, [low]> {
+class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
let HWEncoding = num;
- let SubRegIndices = [subreg_32bit];
+ let SubRegIndices = [subreg_l32, subreg_h32];
}
// 8 even-odd pairs of GPR64s.
-class GPR128<bits<16> num, string n, GPR64 high, GPR64 low>
- : SystemZRegWithSubregs<n, [high, low]> {
+class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
let HWEncoding = num;
- let SubRegIndices = [subreg_high, subreg_low];
+ let SubRegIndices = [subreg_l64, subreg_h64];
}
// General-purpose registers
foreach I = 0-15 in {
- def R#I#W : GPR32<I, "r"#I>;
- def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"W")>, DwarfRegNum<[I]>;
+ def R#I#L : GPR32<I, "r"#I>;
+ def R#I#H : GPR32<I, "r"#I>;
+ def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"L"), !cast<GPR32>("R"#I#"H")>,
+ DwarfRegNum<[I]>;
}
foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
- def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#I#"D"),
- !cast<GPR64>("R"#!add(I, 1)#"D")>;
+ def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#!add(I, 1)#"D"),
+ !cast<GPR64>("R"#I#"D")>;
}
/// Allocate the callee-saved R6-R13 backwards. That way they can be saved
/// together with R14 and R15 in one prolog instruction.
-defm GR32 : SystemZRegClass<"GR32", i32, 32, (add (sequence "R%uW", 0, 5),
- (sequence "R%uW", 15, 6))>;
-defm GR64 : SystemZRegClass<"GR64", i64, 64, (add (sequence "R%uD", 0, 5),
- (sequence "R%uD", 15, 6))>;
+defm GR32 : SystemZRegClass<"GR32", i32, 32, (add (sequence "R%uL", 0, 5),
+ (sequence "R%uL", 15, 6))>;
+defm GRH32 : SystemZRegClass<"GRH32", i32, 32, (add (sequence "R%uH", 0, 5),
+ (sequence "R%uH", 15, 6))>;
+defm GR64 : SystemZRegClass<"GR64", i64, 64, (add (sequence "R%uD", 0, 5),
+ (sequence "R%uD", 15, 6))>;
+
+// Combine the low and high GR32s into a single class. This can only be
+// used for virtual registers if the high-word facility is available.
+defm GRX32 : SystemZRegClass<"GRX32", i32, 32,
+ (add (sequence "R%uL", 0, 5),
+ (sequence "R%uH", 0, 5),
+ R15L, R15H, R14L, R14H, R13L, R13H,
+ R12L, R12H, R11L, R11H, R10L, R10H,
+ R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
// The architecture doesn't really have any i128 support, so model the
// register pairs as untyped instead.
@@ -94,7 +108,7 @@ defm GR128 : SystemZRegClass<"GR128", untyped, 128, (add R0Q, R2Q, R4Q,
// Base and index registers. Everything except R0, which in an address
// context evaluates as 0.
-defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0W)>;
+defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0L)>;
defm ADDR64 : SystemZRegClass<"ADDR64", i64, 64, (sub GR64Bit, R0D)>;
// Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
@@ -114,14 +128,14 @@ class FPR32<bits<16> num, string n> : SystemZReg<n> {
class FPR64<bits<16> num, string n, FPR32 low>
: SystemZRegWithSubregs<n, [low]> {
let HWEncoding = num;
- let SubRegIndices = [subreg_32bit];
+ let SubRegIndices = [subreg_h32];
}
// 8 pairs of FPR64s, with a one-register gap in between.
-class FPR128<bits<16> num, string n, FPR64 high, FPR64 low>
- : SystemZRegWithSubregs<n, [high, low]> {
+class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
let HWEncoding = num;
- let SubRegIndices = [subreg_high, subreg_low];
+ let SubRegIndices = [subreg_l64, subreg_h64];
}
// Floating-point registers
@@ -132,8 +146,8 @@ foreach I = 0-15 in {
}
foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
- def F#I#Q : FPR128<I, "f"#I, !cast<FPR64>("F"#I#"D"),
- !cast<FPR64>("F"#!add(I, 2)#"D")>;
+ def F#I#Q : FPR128<I, "f"#I, !cast<FPR64>("F"#!add(I, 2)#"D"),
+ !cast<FPR64>("F"#I#"D")>;
}
// There's no store-multiple instruction for FPRs, so we're not fussy
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 4ca9292..c7ebb5d 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -25,6 +25,34 @@ SystemZSelectionDAGInfo(const SystemZTargetMachine &TM)
SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
}
+// Decide whether it is best to use a loop or straight-line code for
+// a block operation of Size bytes with source address Src and destination
+// address Dest. Sequence is the opcode to use for straight-line code
+// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
+// Return the chain for the completed operation.
+static SDValue emitMemMem(SelectionDAG &DAG, SDLoc DL, unsigned Sequence,
+ unsigned Loop, SDValue Chain, SDValue Dst,
+ SDValue Src, uint64_t Size) {
+ EVT PtrVT = Src.getValueType();
+ // The heuristic we use is to prefer loops for anything that would
+ // require 7 or more MVCs. With these kinds of sizes there isn't
+ // much to choose between straight-line code and looping code,
+ // since the time will be dominated by the MVCs themselves.
+ // However, the loop has 4 or 5 instructions (depending on whether
+ // the base addresses can be proved equal), so there doesn't seem
+ // much point using a loop for 5 * 256 bytes or fewer. Anything in
+ // the range (5 * 256, 6 * 256) will need another instruction after
+ // the loop, so it doesn't seem worth using a loop then either.
+ // The next value up, 6 * 256, can be implemented in the same
+ // number of straight-line MVCs as 6 * 256 - 1.
+ if (Size > 6 * 256)
+ return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
+ DAG.getConstant(Size, PtrVT),
+ DAG.getConstant(Size / 256, PtrVT));
+ return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
+ DAG.getConstant(Size, PtrVT));
+}
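To make the heuristic above concrete: MVC moves at most 256 bytes per instruction, so the Size > 6 * 256 test separates blocks that still fit in six straight-line MVCs from blocks that go through MVC_LOOP. A minimal sketch of the same decision (the helper name is illustrative and not part of this patch):

#include <cstdint>

// Mirrors the threshold used by emitMemMem; purely illustrative.
static bool useLoopForBlockOp(uint64_t Size) {
  // 6 * 256 = 1536 bytes can still be covered by six straight-line MVCs.
  return Size > 6 * 256;
}
// useLoopForBlockOp(1536) -> false: emit six MVCs inline.
// useLoopForBlockOp(1537) -> true:  emit MVC_LOOP with 1537 / 256 = 6 full
//                                   256-byte iterations, with the leftover
//                                   bytes handled after the loop.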
+
SDValue SystemZSelectionDAGInfo::
EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
@@ -34,14 +62,9 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
if (IsVolatile)
return SDValue();
- if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
- uint64_t Bytes = CSize->getZExtValue();
- if (Bytes >= 1 && Bytes <= 0x100) {
- // A single MVC.
- return DAG.getNode(SystemZISD::MVC, DL, MVT::Other,
- Chain, Dst, Src, Size);
- }
- }
+ if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size))
+ return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+ Chain, Dst, Src, CSize->getZExtValue());
return SDValue();
}
@@ -65,7 +88,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Dst, SDValue Byte, SDValue Size,
unsigned Align, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const {
- EVT DstVT = Dst.getValueType();
+ EVT PtrVT = Dst.getValueType();
if (IsVolatile)
return SDValue();
@@ -89,8 +112,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
Align, DstPtrInfo);
if (Size2 == 0)
return Chain1;
- Dst = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
- DAG.getConstant(Size1, DstVT));
+ Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(Size1, PtrVT));
DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
std::min(Align, Size1), DstPtrInfo);
@@ -103,8 +126,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
false, false, Align);
if (Bytes == 1)
return Chain1;
- SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
- DAG.getConstant(1, DstVT));
+ SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(1, PtrVT));
SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
DstPtrInfo.getWithOffset(1),
false, false, 1);
@@ -112,16 +135,159 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
}
}
assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
- if (Bytes <= 0x101) {
- // Copy the byte to the first location and then use MVC to copy
- // it to the rest.
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
- false, false, Align);
- SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
- DAG.getConstant(1, DstVT));
- return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst2, Dst,
- DAG.getConstant(Bytes - 1, MVT::i32));
- }
+
+ // Handle the special case of a memset of 0, which can use XC.
+ ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte);
+ if (CByte && CByte->getZExtValue() == 0)
+ return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
+ Chain, Dst, Dst, Bytes);
+
+ // Copy the byte to the first location and then use MVC to copy
+ // it to the rest.
+ Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+ false, false, Align);
+ SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(1, PtrVT));
+ return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+ Chain, DstPlus1, Dst, Bytes - 1);
}
return SDValue();
}
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
+// deciding whether to use a loop or straight-line code.
+static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2, uint64_t Size) {
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ EVT PtrVT = Src1.getValueType();
+ // A two-CLC sequence is a clear win over a loop, not least because it
+ // needs only one branch. A three-CLC sequence needs the same number
+ // of branches as a loop (i.e. 2), but is shorter. That brings us to
+ // lengths greater than 768 bytes. It seems relatively likely that
+ // a difference will be found within the first 768 bytes, so we just
+ // optimize for the smallest number of branch instructions, in order
+ // to avoid polluting the prediction buffer too much. A loop only ever
+ // needs 2 branches, whereas a straight-line sequence would need 3 or more.
+ if (Size > 3 * 256)
+ return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
+ DAG.getConstant(Size, PtrVT),
+ DAG.getConstant(Size / 256, PtrVT));
+ return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+ DAG.getConstant(Size, PtrVT));
+}
+
+// Convert the current CC value into an integer that is 0 if CC == 0,
+// less than zero if CC == 1 and greater than zero if CC >= 2.
+// The sequence starts with IPM, which puts CC into bits 29 and 28
+// of an integer and clears bits 30 and 31.
+static SDValue addIPMSequence(SDLoc DL, SDValue Glue, SelectionDAG &DAG) {
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+ DAG.getConstant(SystemZ::IPM_CC, MVT::i32));
+ SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
+ DAG.getConstant(31, MVT::i32));
+ return ROTL;
+}
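Working through the sequence for the condition codes a CLC comparison can produce (a sketch of the arithmetic only): IPM leaves CC in bits 29-28, the SRL by SystemZ::IPM_CC brings it down to bits 1-0, and the ROTL by 31 is effectively a rotate right by one bit:

//   CC == 0: 0b00 -> rotate right by 1 -> 0x00000000  (result == 0)
//   CC == 1: 0b01 -> rotate right by 1 -> 0x80000000  (result <  0 as i32)
//   CC == 2: 0b10 -> rotate right by 1 -> 0x00000001  (result >  0)

which is exactly the sign behaviour memcmp and strcmp expect.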
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2, SDValue Size,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const {
+ if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+ uint64_t Bytes = CSize->getZExtValue();
+ assert(Bytes > 0 && "Caller should have handled 0-size case");
+ Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+ SDValue Glue = Chain.getValue(1);
+ return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+ }
+ return std::make_pair(SDValue(), SDValue());
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src, SDValue Char, SDValue Length,
+ MachinePointerInfo SrcPtrInfo) const {
+ // Use SRST to find the character. End is its address on success.
+ EVT PtrVT = Src.getValueType();
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+ Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
+ Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
+ Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
+ DAG.getConstant(255, MVT::i32));
+ SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
+ SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+ Limit, Src, Char);
+ Chain = End.getValue(1);
+ SDValue Glue = End.getValue(2);
+
+ // Now select between End and null, depending on whether the character
+ // was found.
+ SmallVector<SDValue, 5> Ops;
+ Ops.push_back(End);
+ Ops.push_back(DAG.getConstant(0, PtrVT));
+ Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST, MVT::i32));
+ Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32));
+ Ops.push_back(Glue);
+ VTs = DAG.getVTList(PtrVT, MVT::Glue);
+ End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
+ return std::make_pair(End, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Dest, SDValue Src,
+ MachinePointerInfo DestPtrInfo,
+ MachinePointerInfo SrcPtrInfo, bool isStpcpy) const {
+ SDVTList VTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
+ SDValue EndDest = DAG.getNode(SystemZISD::STPCPY, DL, VTs, Chain, Dest, Src,
+ DAG.getConstant(0, MVT::i32));
+ return std::make_pair(isStpcpy ? EndDest : Dest, EndDest.getValue(1));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const {
+ SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+ SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+ DAG.getConstant(0, MVT::i32));
+ Chain = Unused.getValue(1);
+ SDValue Glue = Chain.getValue(2);
+ return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+}
+
+// Search from Src for a null character, stopping once Src reaches Limit.
+// Return a pair of values, the first being the number of nonnull characters
+// and the second being the out chain.
+//
+// This can be used for strlen by setting Limit to 0.
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG, SDLoc DL,
+ SDValue Chain, SDValue Src,
+ SDValue Limit) {
+ EVT PtrVT = Src.getValueType();
+ SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+ SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+ Limit, Src, DAG.getConstant(0, MVT::i32));
+ Chain = End.getValue(1);
+ SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
+ return std::make_pair(Len, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src, MachinePointerInfo SrcPtrInfo) const {
+ EVT PtrVT = Src.getValueType();
+ return getBoundedStrlen(DAG, DL, Chain, Src, DAG.getConstant(0, PtrVT));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src, SDValue MaxLength,
+ MachinePointerInfo SrcPtrInfo) const {
+ EVT PtrVT = Src.getValueType();
+ MaxLength = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
+ SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, MaxLength);
+ return getBoundedStrlen(DAG, DL, Chain, Src, Limit);
+}
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 9138a9c..281d1e2 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -38,7 +38,41 @@ public:
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
SDValue Chain, SDValue Dst, SDValue Byte,
SDValue Size, unsigned Align, bool IsVolatile,
- MachinePointerInfo DstPtrInfo) const;
+ MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2, SDValue Size,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src, SDValue Char, SDValue Length,
+ MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Dest, SDValue Src,
+ MachinePointerInfo DestPtrInfo,
+ MachinePointerInfo SrcPtrInfo,
+ bool isStpcpy) const LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src, MachinePointerInfo SrcPtrInfo) const
+ LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src, SDValue MaxLength,
+ MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
};
}
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
new file mode 100644
index 0000000..537a545
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -0,0 +1,163 @@
+//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to replace instructions with shorter forms. For example,
+// IILF can be replaced with LLILL or LLILH if the constant fits and if the
+// other 32 bits of the GR64 destination are not live.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-shorten-inst"
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+ class SystemZShortenInst : public MachineFunctionPass {
+ public:
+ static char ID;
+ SystemZShortenInst(const SystemZTargetMachine &tm);
+
+ virtual const char *getPassName() const {
+ return "SystemZ Instruction Shortening";
+ }
+
+ bool processBlock(MachineBasicBlock *MBB);
+ bool runOnMachineFunction(MachineFunction &F);
+
+ private:
+ bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
+ unsigned LLIxL, unsigned LLIxH);
+
+ const SystemZInstrInfo *TII;
+
+ // LowGPRs[I] has bit N set if LLVM register I includes the low
+ // word of GPR N. HighGPRs is the same for the high word.
+ unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
+ unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
+ };
+
+ char SystemZShortenInst::ID = 0;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
+ return new SystemZShortenInst(TM);
+}
+
+SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
+ : MachineFunctionPass(ID), TII(0), LowGPRs(), HighGPRs() {
+ // Set up LowGPRs and HighGPRs.
+ for (unsigned I = 0; I < 16; ++I) {
+ LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I;
+ LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+ HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I;
+ HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+ if (unsigned GR128 = SystemZMC::GR128Regs[I]) {
+ LowGPRs[GR128] |= 3 << I;
+ HighGPRs[GR128] |= 3 << I;
+ }
+ }
+}
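For example, with the register numbering established earlier in this patch (values shown only for illustration): GPR 3 contributes bit 3, so LowGPRs picks it up from both R3L and R3D, HighGPRs picks it up from both R3H and R3D, and the containing pair R2Q records bits 2 and 3 (3 << 2) in both maps, since that 128-bit register overlaps GPRs 2 and 3.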
+
+// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
+// are the halfword immediate loads for the same word. Try to use one of them
+// instead of IIxF. If MI loads the high word, GPRMap[X] is the set of high
+// words referenced by LLVM register X while LiveOther is the mask of low
+// words that are currently live, and vice versa.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
+ unsigned LiveOther, unsigned LLIxL,
+ unsigned LLIxH) {
+ unsigned Reg = MI.getOperand(0).getReg();
+ assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+ unsigned GPRs = GPRMap[Reg];
+ assert(GPRs != 0 && "Register must be a GPR");
+ if (GPRs & LiveOther)
+ return false;
+
+ uint64_t Imm = MI.getOperand(1).getImm();
+ if (SystemZ::isImmLL(Imm)) {
+ MI.setDesc(TII->get(LLIxL));
+ MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+ return true;
+ }
+ if (SystemZ::isImmLH(Imm)) {
+ MI.setDesc(TII->get(LLIxH));
+ MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+ MI.getOperand(1).setImm(Imm >> 16);
+ return true;
+ }
+ return false;
+}
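A sketch of the rewrite this enables (registers and immediates purely illustrative):

//   iilf %r2, 0x1234      ->  llill %r2, 0x1234   (isImmLL: fits in the low halfword)
//   iilf %r2, 0x12340000  ->  llilh %r2, 0x1234   (isImmLH: immediate shifted right by 16)
// The LLILL/LLILH forms zero the rest of the 64-bit register, which is why the
// transformation is only applied once the other word has been proved dead.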
+
+// Process all instructions in MBB. Return true if something changed.
+bool SystemZShortenInst::processBlock(MachineBasicBlock *MBB) {
+ bool Changed = false;
+
+ // Work out which words are live on exit from the block.
+ unsigned LiveLow = 0;
+ unsigned LiveHigh = 0;
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
+ LE = (*SI)->livein_end(); LI != LE; ++LI) {
+ unsigned Reg = *LI;
+ assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+ LiveLow |= LowGPRs[Reg];
+ LiveHigh |= HighGPRs[Reg];
+ }
+ }
+
+ // Iterate backwards through the block looking for instructions to change.
+ for (MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin(),
+ MBBE = MBB->rend(); MBBI != MBBE; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == SystemZ::IILF)
+ Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
+ SystemZ::LLILH);
+ else if (Opcode == SystemZ::IIHF)
+ Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
+ SystemZ::LLIHH);
+ unsigned UsedLow = 0;
+ unsigned UsedHigh = 0;
+ for (MachineInstr::mop_iterator MOI = MI.operands_begin(),
+ MOE = MI.operands_end(); MOI != MOE; ++MOI) {
+ MachineOperand &MO = *MOI;
+ if (MO.isReg()) {
+ if (unsigned Reg = MO.getReg()) {
+ assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+ if (MO.isDef()) {
+ LiveLow &= ~LowGPRs[Reg];
+ LiveHigh &= ~HighGPRs[Reg];
+ } else if (!MO.isUndef()) {
+ UsedLow |= LowGPRs[Reg];
+ UsedHigh |= HighGPRs[Reg];
+ }
+ }
+ }
+ }
+ LiveLow |= UsedLow;
+ LiveHigh |= UsedHigh;
+ }
+
+ return Changed;
+}
+
+bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+ TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+
+ bool Changed = false;
+ for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
+ MFI != MFE; ++MFI)
+ Changed |= processBlock(MFI);
+
+ return Changed;
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 036ec05..3971d5e 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -9,6 +9,7 @@
#include "SystemZSubtarget.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Host.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#define GET_SUBTARGETINFO_TARGET_DESC
@@ -17,14 +18,22 @@
using namespace llvm;
+// Pin the vtable to this file.
+void SystemZSubtarget::anchor() {}
+
SystemZSubtarget::SystemZSubtarget(const std::string &TT,
const std::string &CPU,
const std::string &FS)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
- HasLoadStoreOnCond(false), HasHighWord(false), TargetTriple(TT) {
+ HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+ TargetTriple(TT) {
std::string CPUName = CPU;
if (CPUName.empty())
- CPUName = "z10";
+ CPUName = "generic";
+#if defined(__linux__) && defined(__s390x__)
+ if (CPUName == "generic")
+ CPUName = sys::getHostCPUName();
+#endif
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 4efb58d..5817491 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -26,10 +26,12 @@ class GlobalValue;
class StringRef;
class SystemZSubtarget : public SystemZGenSubtargetInfo {
+ virtual void anchor();
protected:
bool HasDistinctOps;
bool HasLoadStoreOnCond;
bool HasHighWord;
+ bool HasFPExtension;
private:
Triple TargetTriple;
@@ -38,6 +40,9 @@ public:
SystemZSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS);
+ // This is important for reducing register pressure in vector code.
+ virtual bool useAA() const LLVM_OVERRIDE { return true; }
+
// Automatically generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -50,6 +55,9 @@ public:
// Return true if the target has the high-word facility.
bool hasHighWord() const { return HasHighWord; }
+ // Return true if the target has the floating-point extension facility.
+ bool hasFPExtension() const { return HasFPExtension; }
+
// Return true if GV can be accessed using LARL for reloc model RM
// and code model CM.
bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 856183c..dee92e9 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -10,6 +10,7 @@
#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -47,12 +48,18 @@ public:
return getTM<SystemZTargetMachine>();
}
+ virtual void addIRPasses() LLVM_OVERRIDE;
virtual bool addInstSelector() LLVM_OVERRIDE;
virtual bool addPreSched2() LLVM_OVERRIDE;
virtual bool addPreEmitPass() LLVM_OVERRIDE;
};
} // end anonymous namespace
+void SystemZPassConfig::addIRPasses() {
+ TargetPassConfig::addIRPasses();
+ addPass(createPartiallyInlineLibCallsPass());
+}
+
bool SystemZPassConfig::addInstSelector() {
addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
return false;
@@ -90,6 +97,8 @@ bool SystemZPassConfig::addPreEmitPass() {
// preventing that would be a win or not.
if (getOptLevel() != CodeGenOpt::None)
addPass(createSystemZElimComparePass(getSystemZTargetMachine()));
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createSystemZShortenInstPass(getSystemZTargetMachine()));
addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
return true;
}
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 3d92f29..2190198 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -88,6 +88,14 @@ LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
}
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
+ return wrap(unwrap(TD)->getIntPtrType(*unwrap(C)));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD, unsigned AS) {
+ return wrap(unwrap(TD)->getIntPtrType(*unwrap(C), AS));
+}
+
unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
}
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 8696b57..3e68fe1 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -38,6 +38,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"_ZnwjRKSt9nothrow_t",
"_Znwm",
"_ZnwmRKSt9nothrow_t",
+ "__cospi",
+ "__cospif",
"__cxa_atexit",
"__cxa_guard_abort",
"__cxa_guard_acquire",
@@ -45,6 +47,10 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"__isoc99_scanf",
"__isoc99_sscanf",
"__memcpy_chk",
+ "__sincospi_stret",
+ "__sincospi_stretf",
+ "__sinpi",
+ "__sinpif",
"__sqrt_finite",
"__sqrtf_finite",
"__sqrtl_finite",
@@ -331,6 +337,24 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"write"
};
+static bool hasSinCosPiStret(const Triple &T) {
+ // Only Darwin variants have _stret versions of combined trig functions.
+ if (!T.isMacOSX() && T.getOS() != Triple::IOS)
+ return false;
+
+ // The ABI is rather complicated on x86, so don't do anything special there.
+ if (T.getArch() == Triple::x86)
+ return false;
+
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
+ return false;
+
+ if (T.getOS() == Triple::IOS && T.isOSVersionLT(7, 0))
+ return false;
+
+ return true;
+}
+
/// initialize - Initialize the set of available library functions based on the
/// specified target triple. This should be carefully written so that a missing
/// target triple gets a sane set of defaults.
@@ -350,13 +374,22 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
if (T.isMacOSX()) {
if (T.isMacOSXVersionLT(10, 5))
TLI.setUnavailable(LibFunc::memset_pattern16);
- } else if (T.getOS() == Triple::IOS) {
+ } else if (T.isiOS()) {
if (T.isOSVersionLT(3, 0))
TLI.setUnavailable(LibFunc::memset_pattern16);
} else {
TLI.setUnavailable(LibFunc::memset_pattern16);
}
+ if (!hasSinCosPiStret(T)) {
+ TLI.setUnavailable(LibFunc::sinpi);
+ TLI.setUnavailable(LibFunc::sinpif);
+ TLI.setUnavailable(LibFunc::cospi);
+ TLI.setUnavailable(LibFunc::cospif);
+ TLI.setUnavailable(LibFunc::sincospi_stret);
+ TLI.setUnavailable(LibFunc::sincospi_stretf);
+ }
+
if (T.isMacOSX() && T.getArch() == Triple::x86 &&
!T.isMacOSXVersionLT(10, 7)) {
// x86-32 OSX has a scheme where fwrite and fputs (and some other functions
@@ -562,7 +595,7 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
}
// The following functions are available on at least Linux:
- if (T.getOS() != Triple::Linux) {
+ if (!T.isOSLinux()) {
TLI.setUnavailable(LibFunc::dunder_strdup);
TLI.setUnavailable(LibFunc::dunder_strtok_r);
TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index cd810b6..7b8d110 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -97,10 +97,20 @@ static bool IsNullTerminatedString(const Constant *C) {
return false;
}
+/// Return the MCSymbol for the specified global value. This
+/// symbol is the main label that is the address of the global.
+MCSymbol *TargetLoweringObjectFile::getSymbol(Mangler &M,
+ const GlobalValue *GV) const {
+ SmallString<60> NameStr;
+ M.getNameWithPrefix(NameStr, GV, false);
+ return Ctx->GetOrCreateSymbol(NameStr.str());
+}
+
+
MCSymbol *TargetLoweringObjectFile::
getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI) const {
- return Mang->getSymbol(GV);
+ return getSymbol(*Mang, GV);
}
void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
@@ -293,7 +303,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI, unsigned Encoding,
MCStreamer &Streamer) const {
const MCSymbolRefExpr *Ref =
- MCSymbolRefExpr::Create(Mang->getSymbol(GV), getContext());
+ MCSymbolRefExpr::Create(getSymbol(*Mang, GV), getContext());
return getTTypeReference(Ref, Encoding, Streamer);
}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index df4a03c..cb42e83 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -164,6 +164,11 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const {
return CodeGenInfo->getOptLevel();
}
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
+ if (CodeGenInfo)
+ CodeGenInfo->setOptLevel(Level);
+}
+
bool TargetMachine::getAsmVerbosityDefault() {
return AsmVerbosityDefault;
}
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 7419122..3d5f827 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -21,6 +21,7 @@
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Host.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdlib>
@@ -60,13 +61,44 @@ inline LLVMTargetRef wrap(const Target * P) {
}
LLVMTargetRef LLVMGetFirstTarget() {
- const Target* target = &*TargetRegistry::begin();
- return wrap(target);
+ if(TargetRegistry::begin() == TargetRegistry::end()) {
+ return NULL;
+ }
+
+ const Target* target = &*TargetRegistry::begin();
+ return wrap(target);
}
LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
return wrap(unwrap(T)->getNext());
}
+LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
+ StringRef NameRef = Name;
+ for (TargetRegistry::iterator IT = TargetRegistry::begin(),
+ IE = TargetRegistry::end(); IT != IE; ++IT) {
+ if (IT->getName() == NameRef)
+ return wrap(&*IT);
+ }
+
+ return NULL;
+}
+
+LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
+ char **ErrorMessage) {
+ std::string Error;
+
+ *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+
+ if (!*T) {
+ if (ErrorMessage)
+ *ErrorMessage = strdup(Error.c_str());
+
+ return 1;
+ }
+
+ return 0;
+}
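A minimal caller of the new C API entry point might look like this (illustrative only; it assumes the relevant targets were registered beforehand, e.g. through LLVMInitializeAllTargetInfos()):

#include <llvm-c/TargetMachine.h>
#include <stdio.h>
#include <stdlib.h>

// Look up a target by triple and report any error; the helper name is
// arbitrary and not part of the patch.
static LLVMTargetRef lookupTarget(const char *Triple) {
  LLVMTargetRef T;
  char *Error = NULL;
  if (LLVMGetTargetFromTriple(Triple, &T, &Error)) {
    fprintf(stderr, "no target for %s: %s\n", Triple, Error);
    free(Error); /* the message is strdup'd above, so plain free() releases it */
    return NULL;
  }
  return T;
}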
+
const char * LLVMGetTargetName(LLVMTargetRef T) {
return unwrap(T)->getName();
}
@@ -87,9 +119,10 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
return unwrap(T)->hasMCAsmBackend();
}
-LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, char* Triple,
- char* CPU, char* Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
- LLVMCodeModel CodeModel) {
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+ const char* Triple, const char* CPU, const char* Features,
+ LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+ LLVMCodeModel CodeModel) {
Reloc::Model RM;
switch (Reloc){
case LLVMRelocStatic:
@@ -158,6 +191,11 @@ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
return wrap(unwrap(T)->getDataLayout());
}
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+ LLVMBool VerboseAsm) {
+ unwrap(T)->setAsmVerbosityDefault(VerboseAsm);
+}
+
static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
formatted_raw_ostream &OS, LLVMCodeGenFileType codegen, char **ErrorMessage) {
TargetMachine* TM = unwrap(T);
@@ -201,11 +239,11 @@ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
std::string error;
raw_fd_ostream dest(Filename, error, sys::fs::F_Binary);
- formatted_raw_ostream destf(dest);
if (!error.empty()) {
*ErrorMessage = strdup(error.c_str());
return true;
}
+ formatted_raw_ostream destf(dest);
bool Result = LLVMTargetMachineEmit(T, M, destf, codegen, ErrorMessage);
dest.flush();
return Result;
@@ -225,3 +263,7 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
Data.length(), "");
return Result;
}
+
+char *LLVMGetDefaultTargetTriple(void) {
+ return strdup(sys::getDefaultTargetTriple().c_str());
+}
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index af0cef6..10e8db5 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
@@ -22,6 +23,21 @@ TargetSubtargetInfo::TargetSubtargetInfo() {}
TargetSubtargetInfo::~TargetSubtargetInfo() {}
+// Temporary option to compare overall performance change when moving from the
+// SD scheduler to the MachineScheduler pass pipeline. It should be removed
+// before 3.4. The normal way to enable/disable the MachineScheduling pass
+// itself is by using -enable-misched. For targets that already use MI sched
+// (via MySubTarget::enableMachineScheduler()) -misched-bench=false negates the
+// subtarget hook.
+static cl::opt<bool> BenchMachineSched("misched-bench", cl::Hidden,
+ cl::desc("Migrate from the target's default SD scheduler to MI scheduler"));
+
+bool TargetSubtargetInfo::useMachineScheduler() const {
+ if (BenchMachineSched.getNumOccurrences())
+ return BenchMachineSched;
+ return enableMachineScheduler();
+}
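For instance, on a target whose subtarget hook already returns true, passing -misched-bench=false to llc flips a comparison run back to the SelectionDAG scheduler, while -misched-bench=true requests the MachineScheduler path on a target that has not opted in; leaving the option unset defers to enableMachineScheduler() as before.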
+
bool TargetSubtargetInfo::enableMachineScheduler() const {
return false;
}
@@ -35,3 +51,6 @@ bool TargetSubtargetInfo::enablePostRAScheduler(
return false;
}
+bool TargetSubtargetInfo::useAA() const {
+ return false;
+}
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index ad83d97..bc8f367 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -80,7 +80,7 @@ private:
PostfixStack.push_back(std::make_pair(Op, Val));
}
- void popOperator() { InfixOperatorStack.pop_back_val(); }
+ void popOperator() { InfixOperatorStack.pop_back(); }
void pushOperator(InfixCalculatorTok Op) {
// Push the new operator if the stack is empty.
if (InfixOperatorStack.empty()) {
@@ -118,12 +118,12 @@ private:
if (StackOp == IC_RPAREN) {
++ParenCount;
- InfixOperatorStack.pop_back_val();
+ InfixOperatorStack.pop_back();
} else if (StackOp == IC_LPAREN) {
--ParenCount;
- InfixOperatorStack.pop_back_val();
+ InfixOperatorStack.pop_back();
} else {
- InfixOperatorStack.pop_back_val();
+ InfixOperatorStack.pop_back();
PostfixStack.push_back(std::make_pair(StackOp, 0));
}
}
@@ -495,16 +495,17 @@ private:
X86Operand *ParseATTOperand();
X86Operand *ParseIntelOperand();
X86Operand *ParseIntelOffsetOfOperator();
- X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+ bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
X86Operand *ParseIntelOperator(unsigned OpKind);
- X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp,
- SMLoc StartLoc);
- X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+ X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+ X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc,
+ unsigned Size);
+ bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
int64_t ImmDisp, unsigned Size);
- X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
- InlineAsmIdentifierInfo &Info,
- bool IsUnevaluatedOperand, SMLoc &End);
+ bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
@@ -555,8 +556,9 @@ private:
/// }
public:
- X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+ X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII)
+ : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -814,6 +816,9 @@ struct X86Operand : public MCParsedAsmOperand {
bool isMem256() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 256);
}
+ bool isMem512() const {
+ return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+ }
bool isMemVX32() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
@@ -840,17 +845,36 @@ struct X86Operand : public MCParsedAsmOperand {
getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
}
- bool isMem512() const {
- return Kind == Memory && (!Mem.Size || Mem.Size == 512);
- }
-
bool isAbsMem() const {
return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
!getMemIndexReg() && getMemScale() == 1;
}
+ bool isMemOffs8() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
+ }
+ bool isMemOffs16() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
+ }
+ bool isMemOffs32() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
+ }
+ bool isMemOffs64() const {
+ return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+ !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
+ }
+
bool isReg() const { return Kind == Register; }
+ bool isGR32orGR64() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+ }
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible.
if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
@@ -864,53 +888,40 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::CreateReg(getReg()));
}
- void addImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- addExpr(Inst, getImm());
+ static unsigned getGR32FromGR64(unsigned RegNo) {
+ switch (RegNo) {
+ default: llvm_unreachable("Unexpected register");
+ case X86::RAX: return X86::EAX;
+ case X86::RCX: return X86::ECX;
+ case X86::RDX: return X86::EDX;
+ case X86::RBX: return X86::EBX;
+ case X86::RBP: return X86::EBP;
+ case X86::RSP: return X86::ESP;
+ case X86::RSI: return X86::ESI;
+ case X86::RDI: return X86::EDI;
+ case X86::R8: return X86::R8D;
+ case X86::R9: return X86::R9D;
+ case X86::R10: return X86::R10D;
+ case X86::R11: return X86::R11D;
+ case X86::R12: return X86::R12D;
+ case X86::R13: return X86::R13D;
+ case X86::R14: return X86::R14D;
+ case X86::R15: return X86::R15D;
+ case X86::RIP: return X86::EIP;
+ }
}
- void addMem8Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem16Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem32Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem64Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem80Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem128Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem256Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMemVX32Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMemVY32Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMemVX64Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMemVY64Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ unsigned RegNo = getReg();
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ RegNo = getGR32FromGR64(RegNo);
+ Inst.addOperand(MCOperand::CreateReg(RegNo));
}
- void addMemVZ32Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMemVZ64Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
- }
- void addMem512Operands(MCInst &Inst, unsigned N) const {
- addMemOperands(Inst, N);
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
}
void addMemOperands(MCInst &Inst, unsigned N) const {
@@ -931,6 +942,15 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
}
+ void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+ assert((N == 1) && "Invalid number of operands!");
+ // Add as immediates when possible.
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+ }
+
static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
@@ -1249,8 +1269,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
}
}
-X86Operand *
-X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
const AsmToken &Tok = Parser.getTok();
bool Done = false;
@@ -1272,7 +1291,7 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
Done = true;
break;
}
- return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+ return Error(Tok.getLoc(), "unknown token in expression");
}
case AsmToken::EndOfStatement: {
Done = true;
@@ -1291,18 +1310,18 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
} else {
if (!isParsingInlineAsm()) {
if (getParser().parsePrimaryExpr(Val, End))
- return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+ return Error(Tok.getLoc(), "Unexpected identifier!");
} else {
InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated*/ false, End))
- return Err;
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return true;
}
SM.onIdentifierExpr(Val, Identifier);
UpdateLocLex = false;
break;
}
- return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+ return Error(Tok.getLoc(), "Unexpected identifier!");
}
case AsmToken::Integer:
if (isParsingInlineAsm() && SM.getAddImmPrefix())
@@ -1320,14 +1339,14 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
case AsmToken::RParen: SM.onRParen(); break;
}
if (SM.hadError())
- return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+ return Error(Tok.getLoc(), "unknown token in expression");
if (!Done && UpdateLocLex) {
End = Tok.getLoc();
Parser.Lex(); // Consume the token.
}
}
- return 0;
+ return false;
}
X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
@@ -1344,8 +1363,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
// may have already parsed an immediate displacement before the bracketed
// expression.
IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
- if (X86Operand *Err = ParseIntelExpression(SM, End))
- return Err;
+ if (ParseIntelExpression(SM, End))
+ return 0;
const MCExpr *Disp;
if (const MCExpr *Sym = SM.getSym()) {
@@ -1363,8 +1382,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
// Parse the dot operator (e.g., [ebx].foo.bar).
if (Tok.getString().startswith(".")) {
const MCExpr *NewDisp;
- if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp))
- return Err;
+ if (ParseIntelDotOperator(Disp, NewDisp))
+ return 0;
End = Tok.getEndLoc();
Parser.Lex(); // Eat the field.
@@ -1392,11 +1411,10 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
}
// Inline assembly may use variable names with namespace alias qualifiers.
-X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
- StringRef &Identifier,
- InlineAsmIdentifierInfo &Info,
- bool IsUnevaluatedOperand,
- SMLoc &End) {
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+ StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End) {
assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
Val = 0;
@@ -1421,68 +1439,89 @@ X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
- return 0;
+ return false;
}
-/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
- int64_t ImmDisp,
- SMLoc Start) {
- const AsmToken &Tok = Parser.getTok();
- SMLoc End;
-
- unsigned Size = getIntelMemOperandSize(Tok.getString());
- if (Size) {
- Parser.Lex(); // Eat operand size (e.g., byte, word).
- if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
- return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
- Parser.Lex(); // Eat ptr.
- }
-
- // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+/// \brief Parse intel style segment override.
+X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
+ SMLoc Start,
+ unsigned Size) {
+ assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+ const AsmToken &Tok = Parser.getTok(); // Eat colon.
+ if (Tok.isNot(AsmToken::Colon))
+ return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+ Parser.Lex(); // Eat ':'
+
+ int64_t ImmDisp = 0;
if (getLexer().is(AsmToken::Integer)) {
+ ImmDisp = Tok.getIntVal();
+ AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
+
if (isParsingInlineAsm())
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
- Tok.getLoc()));
- int64_t ImmDisp = Tok.getIntVal();
- Parser.Lex(); // Eat the integer.
- if (getLexer().isNot(AsmToken::LBrac))
- return ErrorOperand(Start, "Expected '[' token!");
- return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+ InstInfo->AsmRewrites->push_back(
+ AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc()));
+
+ if (getLexer().isNot(AsmToken::LBrac)) {
+ // An immediate following a 'segment register', 'colon' token sequence can
+ // be followed by a bracketed expression. If it isn't we know we have our
+ // final segment override.
+ const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext());
+ return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0,
+ /*Scale=*/1, Start, ImmDispToken.getEndLoc(),
+ Size);
+ }
}
if (getLexer().is(AsmToken::LBrac))
return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
- if (!ParseRegister(SegReg, Start, End)) {
- // Handel SegReg : [ ... ]
- if (getLexer().isNot(AsmToken::Colon))
- return ErrorOperand(Start, "Expected ':' token!");
- Parser.Lex(); // Eat :
- if (getLexer().isNot(AsmToken::LBrac))
- return ErrorOperand(Start, "Expected '[' token!");
- return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+ const MCExpr *Val;
+ SMLoc End;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+ return X86Operand::CreateMem(Val, Start, End, Size);
}
+ InlineAsmIdentifierInfo Info;
+ StringRef Identifier = Tok.getString();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return 0;
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+ /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+/// ParseIntelMemOperand - Parse intel style memory operand.
+X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
+ unsigned Size) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc End;
+
+ // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+ if (getLexer().is(AsmToken::LBrac))
+ return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
+
const MCExpr *Val;
if (!isParsingInlineAsm()) {
if (getParser().parsePrimaryExpr(Val, End))
- return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+ return ErrorOperand(Tok.getLoc(), "unknown token in expression");
return X86Operand::CreateMem(Val, Start, End, Size);
}
InlineAsmIdentifierInfo Info;
StringRef Identifier = Tok.getString();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated*/ false, End))
- return Err;
- return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return 0;
+ return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
/*Scale=*/1, Start, End, Size, Identifier, Info);
}
/// Parse the '.' operator.
-X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
const MCExpr *&NewDisp) {
const AsmToken &Tok = Parser.getTok();
int64_t OrigDispVal, DotDispVal;
@@ -1491,7 +1530,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
OrigDispVal = OrigDisp->getValue();
else
- return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!");
+ return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
// Drop the '.'.
StringRef DotDispStr = Tok.getString().drop_front(1);
@@ -1506,10 +1545,10 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
DotDisp))
- return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!");
+ return Error(Tok.getLoc(), "Unable to lookup field reference!");
DotDispVal = DotDisp;
} else
- return ErrorOperand(Tok.getLoc(), "Unexpected token type!");
+ return Error(Tok.getLoc(), "Unexpected token type!");
if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
@@ -1520,7 +1559,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
}
NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
- return 0;
+ return false;
}
/// Parse the 'offset' operator. This operator is used to specify the
@@ -1534,9 +1573,9 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated*/ false, End))
- return Err;
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return 0;
// Don't emit the offset operator.
InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1570,9 +1609,12 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
- if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated*/ true, End))
- return Err;
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/true, End))
+ return 0;
+
+ if (!Info.OpDecl)
+ return ErrorOperand(Start, "unable to lookup expression");
unsigned CVal = 0;
switch(OpKind) {
@@ -1593,7 +1635,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
X86Operand *X86AsmParser::ParseIntelOperand() {
const AsmToken &Tok = Parser.getTok();
- SMLoc Start = Tok.getLoc(), End;
+ SMLoc Start, End;
// Offset, length, type and size operators.
if (isParsingInlineAsm()) {
@@ -1608,14 +1650,23 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
return ParseIntelOperator(IOK_TYPE);
}
+ unsigned Size = getIntelMemOperandSize(Tok.getString());
+ if (Size) {
+ Parser.Lex(); // Eat operand size (e.g., byte, word).
+ if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+ return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+ Parser.Lex(); // Eat ptr.
+ }
+ Start = Tok.getLoc();
+
// Immediate.
if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
getLexer().is(AsmToken::LParen)) {
AsmToken StartTok = Tok;
IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
/*AddImmPrefix=*/false);
- if (X86Operand *Err = ParseIntelExpression(SM, End))
- return Err;
+ if (ParseIntelExpression(SM, End))
+ return 0;
int64_t Imm = SM.getImm();
if (isParsingInlineAsm()) {
@@ -1639,23 +1690,22 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
"before bracketed expr.");
// Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
- return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start);
+ return ParseIntelMemOperand(Imm, Start, Size);
}
// Register.
unsigned RegNo = 0;
if (!ParseRegister(RegNo, Start, End)) {
// If this is a segment register followed by a ':', then this is the start
- // of a memory reference, otherwise this is a normal register reference.
+ // of a segment override, otherwise this is a normal register reference.
if (getLexer().isNot(AsmToken::Colon))
return X86Operand::CreateReg(RegNo, Start, End);
- getParser().Lex(); // Eat the colon.
- return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start);
+ return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
}
// Memory operand.
- return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start);
+ return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
}
X86Operand *X86AsmParser::ParseATTOperand() {
@@ -1967,6 +2017,47 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
}
}
+ if (STI.getFeatureBits() & X86::FeatureAVX512) {
+ // Parse mask register {%k1}
+ if (getLexer().is(AsmToken::LCurly)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+ Operands.push_back(X86Operand::CreateToken("{", Loc));
+ Parser.Lex(); // Eat the {
+ if (X86Operand *Op = ParseOperand()) {
+ Operands.push_back(Op);
+ if (!getLexer().is(AsmToken::RCurly)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "Expected } at this point");
+ }
+ Loc = Parser.getTok().getLoc();
+ Operands.push_back(X86Operand::CreateToken("}", Loc));
+ Parser.Lex(); // Eat the }
+ } else {
+ Parser.eatToEndOfStatement();
+ return true;
+ }
+ }
+ // Parse "zeroing non-masked" semantic {z}
+ if (getLexer().is(AsmToken::LCurly)) {
+ SMLoc Loc = Parser.getTok().getLoc();
+ Operands.push_back(X86Operand::CreateToken("{z}", Loc));
+ Parser.Lex(); // Eat the {
+ if (!getLexer().is(AsmToken::Identifier) || getLexer().getTok().getIdentifier() != "z") {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "Expected z at this point");
+ }
+ Parser.Lex(); // Eat the z
+ if (!getLexer().is(AsmToken::RCurly)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.eatToEndOfStatement();
+ return Error(Loc, "Expected } at this point");
+ }
+ Parser.Lex(); // Eat the }
+ }
+ }
+
if (getLexer().isNot(AsmToken::EndOfStatement)) {
SMLoc Loc = getLexer().getLoc();
Parser.eatToEndOfStatement();
@@ -2218,6 +2309,55 @@ processInstruction(MCInst &Inst,
case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+ case X86::VMOVAPDrr:
+ case X86::VMOVAPDYrr:
+ case X86::VMOVAPSrr:
+ case X86::VMOVAPSYrr:
+ case X86::VMOVDQArr:
+ case X86::VMOVDQAYrr:
+ case X86::VMOVDQUrr:
+ case X86::VMOVDQUYrr:
+ case X86::VMOVUPDrr:
+ case X86::VMOVUPDYrr:
+ case X86::VMOVUPSrr:
+ case X86::VMOVUPSYrr: {
+ if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+ !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg()))
+ return false;
+
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
+ case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+ case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
+ case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+ case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
+ case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+ case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
+ case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+ case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
+ case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+ case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
+ case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
+ case X86::VMOVSDrr:
+ case X86::VMOVSSrr: {
+ if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+ !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg()))
+ return false;
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
+ case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
+ }
+ Inst.setOpcode(NewOpc);
+ return true;
+ }
}
}
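Note on the new VMOV* register-to-register cases above: the swap to the _REV (store) form is only taken when the destination is not an x86-64 extended register but the source is. The likely motivation is encoding size: in the reversed form the extended register moves out of ModRM.rm (whose extension bit, VEX.B, exists only in the 3-byte VEX prefix) and into ModRM.reg (covered by VEX.R, which the 2-byte prefix also carries). A minimal sketch of just that operand test, using plain register numbers as a hypothetical stand-in for MCInst operands:

    #include <cassert>

    // Stand-in for X86II::isX86_64ExtendedReg: the register needs a fourth
    // register-number bit, e.g. XMM8-XMM15 or YMM8-YMM15.
    static bool isExtended(unsigned RegNum) { return RegNum >= 8; }

    // Mirror of the two-operand check above: prefer the reversed (store) form
    // only when the destination fits in three bits but the source does not, so
    // the extended register lands in ModRM.reg and the shorter 2-byte VEX
    // prefix can still be used.
    static bool shouldUseReversedMove(unsigned DstReg, unsigned SrcReg) {
      return !isExtended(DstReg) && isExtended(SrcReg);
    }

    int main() {
      assert(shouldUseReversedMove(1, 9));   // vmovaps %xmm9, %xmm1: swapping pays off
      assert(!shouldUseReversedMove(9, 1));  // destination already covered by VEX.R; keep the normal form
      assert(!shouldUseReversedMove(9, 10)); // both extended: the 3-byte prefix is needed either way
      return 0;
    }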
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 82af6fa..903e36c 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -231,16 +231,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
default:
break;
case 1:
- type = TYPE_MOFFS8;
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
break;
case 2:
- type = TYPE_MOFFS16;
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
break;
case 4:
- type = TYPE_MOFFS32;
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
break;
case 8:
- type = TYPE_MOFFS64;
break;
}
}
@@ -263,16 +265,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
Opcode != X86::VINSERTPSrr)
- type = TYPE_MOFFS8;
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
break;
case ENCODING_IW:
- type = TYPE_MOFFS16;
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
break;
case ENCODING_ID:
- type = TYPE_MOFFS32;
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
break;
case ENCODING_IO:
- type = TYPE_MOFFS64;
break;
}
}
@@ -292,30 +296,21 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
case TYPE_REL8:
isBranch = true;
pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- // fall through to sign extend the immediate if needed.
- case TYPE_MOFFS8:
if(immediate & 0x80)
immediate |= ~(0xffull);
break;
- case TYPE_MOFFS16:
- if(immediate & 0x8000)
- immediate |= ~(0xffffull);
- break;
case TYPE_REL32:
case TYPE_REL64:
isBranch = true;
pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- // fall through to sign extend the immediate if needed.
- case TYPE_MOFFS32:
if(immediate & 0x80000000)
immediate |= ~(0xffffffffull);
break;
- case TYPE_MOFFS64:
default:
// operand is 64 bits wide. Do nothing.
break;
}
-
+
if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
insn.immediateOffset, insn.immediateSize,
mcInst, Dis))
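The translateImmediate() hunks above drop the TYPE_MOFFS* retagging and instead sign-extend the immediate in place according to its encoded width. A self-contained sketch of that widening step, assuming nothing beyond the bit manipulation the three cases repeat (this is not the disassembler's API):

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low `size` bytes of `imm` to the full 64 bits, mirroring
    // the `immediate |= ~(0xffull)` pattern used in translateImmediate().
    static uint64_t signExtendImmediate(uint64_t imm, unsigned size) {
      switch (size) {
      case 1: if (imm & 0x80)       imm |= ~0xffull;       break;
      case 2: if (imm & 0x8000)     imm |= ~0xffffull;     break;
      case 4: if (imm & 0x80000000) imm |= ~0xffffffffull; break;
      default: break; // 8-byte immediates are already full width
      }
      return imm;
    }

    int main() {
      assert(signExtendImmediate(0xfe, 1) == 0xfffffffffffffffeULL);   // -2 as imm8
      assert(signExtendImmediate(0x7f, 1) == 0x7fULL);                 // positive imm8 is unchanged
      assert(signExtendImmediate(0x8000, 2) == 0xffffffffffff8000ULL); // -32768 as imm16
      return 0;
    }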
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index bb195ee..c81a857 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -25,8 +25,6 @@
#define TRUE 1
#define FALSE 0
-typedef int8_t bool;
-
#ifndef NDEBUG
#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
#else
@@ -81,6 +79,15 @@ static int modRMRequired(OpcodeType type,
case THREEBYTE_A7:
decision = &THREEBYTEA7_SYM;
break;
+ case XOP8_MAP:
+ decision = &XOP8_MAP_SYM;
+ break;
+ case XOP9_MAP:
+ decision = &XOP9_MAP_SYM;
+ break;
+ case XOPA_MAP:
+ decision = &XOPA_MAP_SYM;
+ break;
}
return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
@@ -122,6 +129,15 @@ static InstrUID decode(OpcodeType type,
case THREEBYTE_A7:
dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
break;
+ case XOP8_MAP:
+ dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP9_MAP:
+ dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOPA_MAP:
+ dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
}
switch (dec->modrm_type) {
@@ -305,6 +321,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
BOOL prefixGroups[4] = { FALSE };
uint64_t prefixLocation;
uint8_t byte = 0;
+ uint8_t nextByte;
BOOL hasAdSize = FALSE;
BOOL hasOpSize = FALSE;
@@ -314,20 +331,21 @@ static int readPrefixes(struct InternalInstruction* insn) {
while (isPrefix) {
prefixLocation = insn->readerCursor;
+ /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
if (consumeByte(insn, &byte))
- return -1;
+ break;
/*
* If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
* break and let it be disassembled as a normal "instruction".
*/
+ if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+ break;
+
if (insn->readerCursor - 1 == insn->startLocation
- && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) {
- uint8_t nextByte;
- if (byte == 0xf0)
- break;
- if (lookAtByte(insn, &nextByte))
- return -1;
+ && (byte == 0xf2 || byte == 0xf3)
+ && !lookAtByte(insn, &nextByte))
+ {
/*
* If the byte is 0xf2 or 0xf3, and any of the following conditions are
* met:
@@ -426,7 +444,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
dbgprintf(insn, "Found prefix 0x%hhx", byte);
}
- insn->vexSize = 0;
+ insn->vexXopType = TYPE_NO_VEX_XOP;
if (byte == 0xc4) {
uint8_t byte1;
@@ -437,7 +455,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
- insn->vexSize = 3;
+ insn->vexXopType = TYPE_VEX_3B;
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
else {
@@ -445,22 +463,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
- if (insn->vexSize == 3) {
- insn->vexPrefix[0] = byte;
- consumeByte(insn, &insn->vexPrefix[1]);
- consumeByte(insn, &insn->vexPrefix[2]);
+ if (insn->vexXopType == TYPE_VEX_3B) {
+ insn->vexXopPrefix[0] = byte;
+ consumeByte(insn, &insn->vexXopPrefix[1]);
+ consumeByte(insn, &insn->vexXopPrefix[2]);
/* We simulate the REX prefix for simplicity's sake */
if (insn->mode == MODE_64BIT) {
insn->rexPrefix = 0x40
- | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
- | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
- | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
- | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+ | (wFromVEX3of3(insn->vexXopPrefix[2]) << 3)
+ | (rFromVEX2of3(insn->vexXopPrefix[1]) << 2)
+ | (xFromVEX2of3(insn->vexXopPrefix[1]) << 1)
+ | (bFromVEX2of3(insn->vexXopPrefix[1]) << 0);
}
- switch (ppFromVEX3of3(insn->vexPrefix[2]))
+ switch (ppFromVEX3of3(insn->vexXopPrefix[2]))
{
default:
break;
@@ -469,7 +487,9 @@ static int readPrefixes(struct InternalInstruction* insn) {
break;
}
- dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]);
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vexXopPrefix[0], insn->vexXopPrefix[1],
+ insn->vexXopPrefix[2]);
}
}
else if (byte == 0xc5) {
@@ -481,22 +501,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
}
if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
- insn->vexSize = 2;
+ insn->vexXopType = TYPE_VEX_2B;
}
else {
unconsumeByte(insn);
}
- if (insn->vexSize == 2) {
- insn->vexPrefix[0] = byte;
- consumeByte(insn, &insn->vexPrefix[1]);
+ if (insn->vexXopType == TYPE_VEX_2B) {
+ insn->vexXopPrefix[0] = byte;
+ consumeByte(insn, &insn->vexXopPrefix[1]);
if (insn->mode == MODE_64BIT) {
insn->rexPrefix = 0x40
- | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+ | (rFromVEX2of2(insn->vexXopPrefix[1]) << 2);
}
- switch (ppFromVEX2of2(insn->vexPrefix[1]))
+ switch (ppFromVEX2of2(insn->vexXopPrefix[1]))
{
default:
break;
@@ -505,7 +525,53 @@ static int readPrefixes(struct InternalInstruction* insn) {
break;
}
- dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]);
+ dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexXopPrefix[0], insn->vexXopPrefix[1]);
+ }
+ }
+ else if (byte == 0x8f) {
+ uint8_t byte1;
+
+ if (lookAtByte(insn, &byte1)) {
+ dbgprintf(insn, "Couldn't read second byte of XOP");
+ return -1;
+ }
+
+ if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+ insn->vexXopType = TYPE_XOP;
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+ else {
+ unconsumeByte(insn);
+ insn->necessaryPrefixLocation = insn->readerCursor - 1;
+ }
+
+ if (insn->vexXopType == TYPE_XOP) {
+ insn->vexXopPrefix[0] = byte;
+ consumeByte(insn, &insn->vexXopPrefix[1]);
+ consumeByte(insn, &insn->vexXopPrefix[2]);
+
+ /* We simulate the REX prefix for simplicity's sake */
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromXOP3of3(insn->vexXopPrefix[2]) << 3)
+ | (rFromXOP2of3(insn->vexXopPrefix[1]) << 2)
+ | (xFromXOP2of3(insn->vexXopPrefix[1]) << 1)
+ | (bFromXOP2of3(insn->vexXopPrefix[1]) << 0);
+ }
+
+ switch (ppFromXOP3of3(insn->vexXopPrefix[2]))
+ {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ hasOpSize = TRUE;
+ break;
+ }
+
+ dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vexXopPrefix[0], insn->vexXopPrefix[1],
+ insn->vexXopPrefix[2]);
}
}
else {
@@ -580,37 +646,49 @@ static int readOpcode(struct InternalInstruction* insn) {
insn->opcodeType = ONEBYTE;
- if (insn->vexSize == 3)
+ if (insn->vexXopType == TYPE_VEX_3B)
{
- switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
+ switch (mmmmmFromVEX2of3(insn->vexXopPrefix[1]))
{
default:
- dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1]));
+ dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
return -1;
- case 0:
- break;
case VEX_LOB_0F:
- insn->twoByteEscape = 0x0f;
insn->opcodeType = TWOBYTE;
return consumeByte(insn, &insn->opcode);
case VEX_LOB_0F38:
- insn->twoByteEscape = 0x0f;
- insn->threeByteEscape = 0x38;
insn->opcodeType = THREEBYTE_38;
return consumeByte(insn, &insn->opcode);
case VEX_LOB_0F3A:
- insn->twoByteEscape = 0x0f;
- insn->threeByteEscape = 0x3a;
insn->opcodeType = THREEBYTE_3A;
return consumeByte(insn, &insn->opcode);
}
}
- else if (insn->vexSize == 2)
+ else if (insn->vexXopType == TYPE_VEX_2B)
{
- insn->twoByteEscape = 0x0f;
insn->opcodeType = TWOBYTE;
return consumeByte(insn, &insn->opcode);
}
+ else if (insn->vexXopType == TYPE_XOP)
+ {
+ switch (mmmmmFromXOP2of3(insn->vexXopPrefix[1]))
+ {
+ default:
+ dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
+ return -1;
+ case XOP_MAP_SELECT_8:
+ insn->opcodeType = XOP8_MAP;
+ return consumeByte(insn, &insn->opcode);
+ case XOP_MAP_SELECT_9:
+ insn->opcodeType = XOP9_MAP;
+ return consumeByte(insn, &insn->opcode);
+ case XOP_MAP_SELECT_A:
+ insn->opcodeType = XOPA_MAP;
+ return consumeByte(insn, &insn->opcode);
+ }
+ }
if (consumeByte(insn, &current))
return -1;
@@ -618,16 +696,12 @@ static int readOpcode(struct InternalInstruction* insn) {
if (current == 0x0f) {
dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
- insn->twoByteEscape = current;
-
if (consumeByte(insn, &current))
return -1;
if (current == 0x38) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
- insn->threeByteEscape = current;
-
if (consumeByte(insn, &current))
return -1;
@@ -635,8 +709,6 @@ static int readOpcode(struct InternalInstruction* insn) {
} else if (current == 0x3a) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
- insn->threeByteEscape = current;
-
if (consumeByte(insn, &current))
return -1;
@@ -644,8 +716,6 @@ static int readOpcode(struct InternalInstruction* insn) {
} else if (current == 0xa6) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
- insn->threeByteEscape = current;
-
if (consumeByte(insn, &current))
return -1;
@@ -653,8 +723,6 @@ static int readOpcode(struct InternalInstruction* insn) {
} else if (current == 0xa7) {
dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
- insn->threeByteEscape = current;
-
if (consumeByte(insn, &current))
return -1;
@@ -768,11 +836,27 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
if (insn->mode == MODE_64BIT)
attrMask |= ATTR_64BIT;
- if (insn->vexSize) {
+ if (insn->vexXopType != TYPE_NO_VEX_XOP) {
attrMask |= ATTR_VEX;
- if (insn->vexSize == 3) {
- switch (ppFromVEX3of3(insn->vexPrefix[2])) {
+ if (insn->vexXopType == TYPE_VEX_3B) {
+ switch (ppFromVEX3of3(insn->vexXopPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vexXopPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ }
+ else if (insn->vexXopType == TYPE_VEX_2B) {
+ switch (ppFromVEX2of2(insn->vexXopPrefix[1])) {
case VEX_PREFIX_66:
attrMask |= ATTR_OPSIZE;
break;
@@ -784,11 +868,11 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
break;
}
- if (lFromVEX3of3(insn->vexPrefix[2]))
+ if (lFromVEX2of2(insn->vexXopPrefix[1]))
attrMask |= ATTR_VEXL;
}
- else if (insn->vexSize == 2) {
- switch (ppFromVEX2of2(insn->vexPrefix[1])) {
+ else if (insn->vexXopType == TYPE_XOP) {
+ switch (ppFromXOP3of3(insn->vexXopPrefix[2])) {
case VEX_PREFIX_66:
attrMask |= ATTR_OPSIZE;
break;
@@ -800,7 +884,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
break;
}
- if (lFromVEX2of2(insn->vexPrefix[1]))
+ if (lFromXOP3of3(insn->vexXopPrefix[2]))
attrMask |= ATTR_VEXL;
}
else {
@@ -826,42 +910,6 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
/* The following clauses compensate for limitations of the tables. */
- if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) &&
- !(attrMask & ATTR_OPSIZE)) {
- /*
- * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit
- * has precedence since there are no L-bit with W-bit entries in the tables.
- * So if the L-bit isn't significant we should use the W-bit instead.
- * We only need to do this if the instruction doesn't specify OpSize since
- * there is a VEX_L_W_OPSIZE table.
- */
-
- const struct InstructionSpecifier *spec;
- uint16_t instructionIDWithWBit;
- const struct InstructionSpecifier *specWithWBit;
-
- spec = specifierForUID(instructionID);
-
- if (getIDWithAttrMask(&instructionIDWithWBit,
- insn,
- (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
- insn->instructionID = instructionID;
- insn->spec = spec;
- return 0;
- }
-
- specWithWBit = specifierForUID(instructionIDWithWBit);
-
- if (instructionID != instructionIDWithWBit) {
- insn->instructionID = instructionIDWithWBit;
- insn->spec = specWithWBit;
- } else {
- insn->instructionID = instructionID;
- insn->spec = spec;
- }
- return 0;
- }
-
if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
/*
* The instruction tables make no distinction between instructions that
@@ -1502,10 +1550,12 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
static int readVVVV(struct InternalInstruction* insn) {
dbgprintf(insn, "readVVVV()");
- if (insn->vexSize == 3)
- insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
- else if (insn->vexSize == 2)
- insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]);
+ if (insn->vexXopType == TYPE_VEX_3B)
+ insn->vvvv = vvvvFromVEX3of3(insn->vexXopPrefix[2]);
+ else if (insn->vexXopType == TYPE_VEX_2B)
+ insn->vvvv = vvvvFromVEX2of2(insn->vexXopPrefix[1]);
+ else if (insn->vexXopType == TYPE_XOP)
+ insn->vvvv = vvvvFromXOP3of3(insn->vexXopPrefix[2]);
else
return -1;
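On the 0x8F handling added to readPrefixes(): 0x8F is also the opcode of POP r/m, which is encoded as /0, so a zero ModRM.reg field in the byte after 0x8F means POP rather than an XOP escape. That is what the `(byte1 & 0x38) != 0x0` test checks before committing to TYPE_XOP and consuming the two payload bytes. A small stand-alone sketch of the same disambiguation (plain C++, illustrative bytes, not the decoder's API):

    #include <cassert>
    #include <cstdint>

    // Returns true if a byte sequence starting with 0x8F should be decoded as an
    // XOP-prefixed instruction rather than POP r/m. POP r/m is /0, i.e. the
    // ModRM.reg field (bits 5:3 of the byte following 0x8F) is zero; any other
    // value in those bits selects one of the XOP opcode maps.
    static bool isXopEscape(uint8_t firstByte, uint8_t secondByte) {
      return firstByte == 0x8f && (secondByte & 0x38) != 0;
    }

    int main() {
      assert(!isXopEscape(0x8f, 0xc0)); // 8F C0: a pop r/m encoding (reg field is 0)
      assert(isXopEscape(0x8f, 0xe9));  // 8F E9 ...: low five bits select XOP map 9
      return 0;
    }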
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index dcb6aad..6d03d5c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -59,6 +59,15 @@ extern "C" {
#define lFromVEX2of2(vex) (((vex) & 0x4) >> 2)
#define ppFromVEX2of2(vex) ((vex) & 0x3)
+#define rFromXOP2of3(xop) (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop) (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop) (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop) ((xop) & 0x1f)
+#define wFromXOP3of3(xop) (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex) (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop) (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop) ((xop) & 0x3)
+
/*
* These enums represent Intel registers for use by the decoder.
*/
@@ -447,6 +456,12 @@ typedef enum {
VEX_LOB_0F3A = 0x3
} VEXLeadingOpcodeByte;
+typedef enum {
+ XOP_MAP_SELECT_8 = 0x8,
+ XOP_MAP_SELECT_9 = 0x9,
+ XOP_MAP_SELECT_A = 0xA
+} XOPMapSelect;
+
/*
* VEXPrefixCode - Possible values for the VEX.pp field
*/
@@ -458,6 +473,13 @@ typedef enum {
VEX_PREFIX_F2 = 0x3
} VEXPrefixCode;
+typedef enum {
+ TYPE_NO_VEX_XOP = 0x0,
+ TYPE_VEX_2B = 0x1,
+ TYPE_VEX_3B = 0x2,
+ TYPE_XOP = 0x3
+} VEXXOPType;
+
typedef uint8_t BOOL;
/*
@@ -514,10 +536,10 @@ struct InternalInstruction {
uint8_t prefixPresent[0x100];
/* contains the location (for use with the reader) of the prefix byte */
uint64_t prefixLocations[0x100];
- /* The value of the VEX prefix, if present */
- uint8_t vexPrefix[3];
+ /* The value of the VEX/XOP prefix, if present */
+ uint8_t vexXopPrefix[3];
/* The length of the VEX prefix (0 if not present) */
- uint8_t vexSize;
+ VEXXOPType vexXopType;
/* The value of the REX prefix, if present */
uint8_t rexPrefix;
/* The location where a mandatory prefix would have to be (i.e., right before
@@ -541,10 +563,6 @@ struct InternalInstruction {
/* opcode state */
- /* The value of the two-byte escape prefix (usually 0x0f) */
- uint8_t twoByteEscape;
- /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
- uint8_t threeByteEscape;
/* The last byte of the opcode, not counting any ModR/M extension */
uint8_t opcode;
/* The ModR/M byte of the instruction, if it is an opcode extension */
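The new *FromXOP* macros above follow the 3-byte VEX layout: the second prefix byte carries R, X and B (stored inverted) plus the 5-bit map select, and the third carries W, vvvv (also inverted), L and pp. A minimal decoding sketch using a hypothetical struct, with prefix bytes chosen only to exercise the field extraction:

    #include <cassert>
    #include <cstdint>

    // Hypothetical container for the fields of a 3-byte XOP prefix: 8F <b1> <b2>.
    struct XopFields {
      uint8_t R, X, B;    // stored one's-complemented in b1, as in VEX
      uint8_t MapSelect;  // b1 bits 4:0 (0x8, 0x9 or 0xA)
      uint8_t W;          // b2 bit 7
      uint8_t VVVV;       // b2 bits 6:3, stored one's-complemented
      uint8_t L;          // b2 bit 2
      uint8_t PP;         // b2 bits 1:0
    };

    static XopFields decodeXopPrefix(uint8_t b1, uint8_t b2) {
      XopFields F;
      F.R         = (~b1 & 0x80) >> 7;
      F.X         = (~b1 & 0x40) >> 6;
      F.B         = (~b1 & 0x20) >> 5;
      F.MapSelect =   b1 & 0x1f;
      F.W         =  (b2 & 0x80) >> 7;
      F.VVVV      = (~b2 & 0x78) >> 3;
      F.L         =  (b2 & 0x04) >> 2;
      F.PP        =   b2 & 0x03;
      return F;
    }

    int main() {
      // b1 = 0xE9: inverted R/X/B bits all set (logical values 0), map select 9.
      // b2 = 0x78: W = 0, stored vvvv = 0b1111 (logical 0), L = 0, pp = 0.
      XopFields F = decodeXopPrefix(0xE9, 0x78);
      assert(F.R == 0 && F.X == 0 && F.B == 0 && F.MapSelect == 0x9);
      assert(F.W == 0 && F.VVVV == 0 && F.L == 0 && F.PP == 0);
      return 0;
    }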
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index d291441..dd1719c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -32,6 +32,9 @@
#define THREEBYTE3A_SYM x86DisassemblerThreeByte3AOpcodes
#define THREEBYTEA6_SYM x86DisassemblerThreeByteA6Opcodes
#define THREEBYTEA7_SYM x86DisassemblerThreeByteA7Opcodes
+#define XOP8_MAP_SYM x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes
#define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers"
#define CONTEXTS_STR "x86DisassemblerContexts"
@@ -41,6 +44,9 @@
#define THREEBYTE3A_STR "x86DisassemblerThreeByte3AOpcodes"
#define THREEBYTEA6_STR "x86DisassemblerThreeByteA6Opcodes"
#define THREEBYTEA7_STR "x86DisassemblerThreeByteA7Opcodes"
+#define XOP8_MAP_STR "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes"
/*
* Attributes of an instruction that must be known before the opcode can be
@@ -116,10 +122,10 @@ enum attributeBits {
ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize") \
- ENUM_ENTRY(IC_VEX_L_W, 3, "requires VEX, L and W") \
- ENUM_ENTRY(IC_VEX_L_W_XS, 4, "requires VEX, L, W and XS prefix") \
- ENUM_ENTRY(IC_VEX_L_W_XD, 4, "requires VEX, L, W and XD prefix") \
- ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 4, "requires VEX, L, W and OpSize") \
+ ENUM_ENTRY(IC_VEX_L_W, 4, "requires VEX, L and W") \
+ ENUM_ENTRY(IC_VEX_L_W_XS, 5, "requires VEX, L, W and XS prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_XD, 5, "requires VEX, L, W and XD prefix") \
+ ENUM_ENTRY(IC_VEX_L_W_OPSIZE, 5, "requires VEX, L, W and OpSize") \
ENUM_ENTRY(IC_EVEX, 1, "requires an EVEX prefix") \
ENUM_ENTRY(IC_EVEX_XS, 2, "requires EVEX and the XS prefix") \
ENUM_ENTRY(IC_EVEX_XD, 2, "requires EVEX and the XD prefix") \
@@ -215,7 +221,55 @@ enum attributeBits {
ENUM_ENTRY(IC_EVEX_L2_W_K_B, 3, "requires EVEX_B, EVEX_K, L2 and W") \
ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XS prefix") \
ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and XD prefix") \
- ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize")
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4, "requires EVEX_B, EVEX_K, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ_B, 1, "requires EVEX_B and EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B, 2, "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_KZ_B, 3, "requires EVEX_B, EVEX_KZ and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_KZ_B, 3, "requires EVEX_B, EVEX_KZ, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4, "requires EVEX_B, EVEX_KZ, L2, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_KZ, 1, "requires an EVEX_KZ prefix") \
+ ENUM_ENTRY(IC_EVEX_XS_KZ, 2, "requires EVEX_KZ and the XS prefix") \
+ ENUM_ENTRY(IC_EVEX_XD_KZ, 2, "requires EVEX_KZ and the XD prefix") \
+ ENUM_ENTRY(IC_EVEX_OPSIZE_KZ, 2, "requires EVEX_KZ and the OpSize prefix") \
+ ENUM_ENTRY(IC_EVEX_W_KZ, 3, "requires EVEX_KZ and the W prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XS_KZ, 4, "requires EVEX_KZ, W, and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_W_XD_KZ, 4, "requires EVEX_KZ, W, and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ, 4, "requires EVEX_KZ, W, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_KZ, 3, "requires EVEX_KZ and the L prefix") \
+ ENUM_ENTRY(IC_EVEX_L_XS_KZ, 4, "requires EVEX_KZ and the L and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L_XD_KZ, 4, "requires EVEX_KZ and the L and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ, 4, "requires EVEX_KZ, L, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L_W_KZ, 3, "requires EVEX_KZ, L and W") \
+ ENUM_ENTRY(IC_EVEX_L_W_XS_KZ, 4, "requires EVEX_KZ, L, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_XD_KZ, 4, "requires EVEX_KZ, L, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L, W and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_KZ, 3, "requires EVEX_KZ and the L2 prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_XS_KZ, 4, "requires EVEX_KZ and the L2 and XS prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_XD_KZ, 4, "requires EVEX_KZ and the L2 and XD prefix")\
+ ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, and OpSize") \
+ ENUM_ENTRY(IC_EVEX_L2_W_KZ, 3, "requires EVEX_KZ, L2 and W") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ, 4, "requires EVEX_KZ, L2, W and XS prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ, 4, "requires EVEX_KZ, L2, W and XD prefix") \
+ ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4, "requires EVEX_KZ, L2, W and OpSize")
#define ENUM_ENTRY(n, r, d) n,
typedef enum {
@@ -234,7 +288,10 @@ typedef enum {
THREEBYTE_38 = 2,
THREEBYTE_3A = 3,
THREEBYTE_A6 = 4,
- THREEBYTE_A7 = 5
+ THREEBYTE_A7 = 5,
+ XOP8_MAP = 6,
+ XOP9_MAP = 7,
+ XOPA_MAP = 8
} OpcodeType;
/*
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index b9d0082..4439311 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -215,3 +215,19 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
O << markup(">");
}
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ O << markup("<mem:");
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ O << *DispSpec.getExpr();
+ }
+
+ O << markup(">");
+}
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 8d05256..a8fab72 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -42,7 +42,8 @@ public:
void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
-
+ void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
@@ -86,6 +87,19 @@ public:
void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
+
+ void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ printMemOffset(MI, OpNo, O);
+ }
};
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 9dfc9a9..e7e7b15 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -200,3 +200,19 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
O << ']';
}
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &DispSpec = MI->getOperand(Op);
+
+ O << '[';
+
+ if (DispSpec.isImm()) {
+ O << formatImm(DispSpec.getImm());
+ } else {
+ assert(DispSpec.isExpr() && "non-immediate displacement?");
+ O << *DispSpec.getExpr();
+ }
+
+ O << ']';
+}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 45beeda..590bf68 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -39,7 +39,8 @@ public:
void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-
+ void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "opaque ptr ";
printMemReference(MI, OpNo, O);
@@ -97,6 +98,23 @@ public:
O << "zmmword ptr ";
printMemReference(MI, OpNo, O);
}
+
+ void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "byte ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "word ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "dword ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
+ void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+ O << "qword ptr ";
+ printMemOffset(MI, OpNo, O);
+ }
};
}
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 598ddee..f8e359b 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -9,6 +9,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -19,10 +20,10 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -67,9 +68,16 @@ public:
class X86AsmBackend : public MCAsmBackend {
StringRef CPU;
+ bool HasNopl;
public:
X86AsmBackend(const Target &T, StringRef _CPU)
- : MCAsmBackend(), CPU(_CPU) {}
+ : MCAsmBackend(), CPU(_CPU) {
+ HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
+ CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
+ CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
+ CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
+ CPU != "c3" && CPU != "c3-2";
+ }
unsigned getNumFixupKinds() const {
return X86::NumTargetFixupKinds;
@@ -308,8 +316,8 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
// This CPU doesnt support long nops. If needed add more.
// FIXME: Can we get this from the subtarget somehow?
- if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
- CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+ // FIXME: We could generate something better than plain 0x90.
+ if (!HasNopl) {
for (uint64_t i = 0; i < Count; ++i)
OW->Write8(0x90);
return true;
@@ -334,6 +342,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
/* *** */
namespace {
+
class ELFX86AsmBackend : public X86AsmBackend {
public:
uint8_t OSABI;
@@ -382,35 +391,368 @@ public:
}
};
+namespace CU {
+
+ /// Compact unwind encoding values.
+ enum CompactUnwindEncodings {
+ /// [RE]BP-based frame where [RE]BP is pushed on the stack immediately after
+ /// the return address, then [RE]SP is moved to [RE]BP.
+ UNWIND_MODE_BP_FRAME = 0x01000000,
+
+ /// A frameless function with a small constant stack size.
+ UNWIND_MODE_STACK_IMMD = 0x02000000,
+
+ /// A frameless function with a large constant stack size.
+ UNWIND_MODE_STACK_IND = 0x03000000,
+
+ /// No compact unwind encoding is available.
+ UNWIND_MODE_DWARF = 0x04000000,
+
+ /// Mask for encoding the frame registers.
+ UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
+
+ /// Mask for encoding the frameless registers.
+ UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+ };
+
+} // end CU namespace
+
class DarwinX86AsmBackend : public X86AsmBackend {
+ const MCRegisterInfo &MRI;
+
+ /// \brief Number of registers that can be saved in a compact unwind encoding.
+ enum { CU_NUM_SAVED_REGS = 6 };
+
+ mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+ bool Is64Bit;
+
+ unsigned OffsetSize; ///< Offset of a "push" instruction.
+ unsigned PushInstrSize; ///< Size of a "push" instruction.
+ unsigned MoveInstrSize; ///< Size of a "move" instruction.
+ unsigned StackDivide; ///< Amount to adjust stack size by.
+protected:
+ /// \brief Implementation of algorithm to generate the compact unwind encoding
+ /// for the CFI instructions.
+ uint32_t
+ generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+ if (Instrs.empty()) return 0;
+
+ // Reset the saved registers.
+ unsigned SavedRegIdx = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+
+ bool HasFP = false;
+
+ // Encode that we are using EBP/RBP as the frame pointer.
+ uint32_t CompactUnwindEncoding = 0;
+
+ unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+ unsigned InstrOffset = 0;
+ unsigned StackAdjust = 0;
+ unsigned StackSize = 0;
+ unsigned PrevStackSize = 0;
+ unsigned NumDefCFAOffsets = 0;
+
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ const MCCFIInstruction &Inst = Instrs[i];
+
+ switch (Inst.getOperation()) {
+ default:
+ // Any other CFI directives indicate a frame that we aren't prepared
+ // to represent via compact unwind, so just bail out.
+ return 0;
+ case MCCFIInstruction::OpDefCfaRegister: {
+ // Defines a frame pointer. E.g.
+ //
+ // movq %rsp, %rbp
+ // L0:
+ // .cfi_def_cfa_register %rbp
+ //
+ HasFP = true;
+ assert(MRI.getLLVMRegNum(Inst.getRegister(), true) ==
+ (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!");
+
+ // Reset the counts.
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ StackAdjust = 0;
+ SavedRegIdx = 0;
+ InstrOffset += MoveInstrSize;
+ break;
+ }
+ case MCCFIInstruction::OpDefCfaOffset: {
+ // Defines a new offset for the CFA. E.g.
+ //
+ // With frame:
+ //
+ // pushq %rbp
+ // L0:
+ // .cfi_def_cfa_offset 16
+ //
+ // Without frame:
+ //
+ // subq $72, %rsp
+ // L0:
+ // .cfi_def_cfa_offset 80
+ //
+ PrevStackSize = StackSize;
+ StackSize = std::abs(Inst.getOffset()) / StackDivide;
+ ++NumDefCFAOffsets;
+ break;
+ }
+ case MCCFIInstruction::OpOffset: {
+ // Defines a "push" of a callee-saved register. E.g.
+ //
+ // pushq %r15
+ // pushq %r14
+ // pushq %rbx
+ // L0:
+ // subq $120, %rsp
+ // L1:
+ // .cfi_offset %rbx, -40
+ // .cfi_offset %r14, -32
+ // .cfi_offset %r15, -24
+ //
+ if (SavedRegIdx == CU_NUM_SAVED_REGS)
+ // If there are too many saved registers, we cannot use a compact
+ // unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+ SavedRegs[SavedRegIdx++] = Reg;
+ StackAdjust += OffsetSize;
+ InstrOffset += PushInstrSize;
+ break;
+ }
+ }
+ }
+
+ StackAdjust /= StackDivide;
+
+ if (HasFP) {
+ if ((StackAdjust & 0xFF) != StackAdjust)
+ // Offset was too big for a compact unwind encoding.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+ CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+ } else {
+ // If the amount of the stack allocation is the size of a register, then
+ // we "push" the RAX/EAX register onto the stack instead of adjusting the
+ // stack pointer with a SUB instruction. We don't support the push of the
+ // RAX/EAX register with compact unwind. So we check for that situation
+ // here.
+ if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+ StackSize - PrevStackSize == 1) ||
+ (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+ return CU::UNWIND_MODE_DWARF;
+
+ SubtractInstrIdx += InstrOffset;
+ ++StackAdjust;
+
+ if ((StackSize & 0xFF) == StackSize) {
+ // Frameless stack with a small stack size.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+ // Encode the stack size.
+ CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+ } else {
+ if ((StackAdjust & 0x7) != StackAdjust)
+ // The extra stack adjustments are too big for us to handle.
+ return CU::UNWIND_MODE_DWARF;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+ // Encode any extra stack adjustments (done via push
+ // instructions).
+ CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+ }
+
+ // Encode the number of registers saved. (Reverse the list first.)
+ std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+ CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+ if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+ // Encode the register encoding.
+ CompactUnwindEncoding |=
+ RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+ }
+
+ return CompactUnwindEncoding;
+ }
+
+private:
+ /// \brief Get the compact unwind number for a given register. The number
+ /// corresponds to the enum lists in compact_unwind_encoding.h.
+ int getCompactUnwindRegNum(unsigned Reg) const {
+ static const uint16_t CU32BitRegs[7] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const uint16_t CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+ }
+
+ /// \brief Return the registers encoded for a compact encoding with a frame
+ /// pointer.
+ uint32_t encodeCompactUnwindRegistersWithFrame() const {
+ // Encode the registers in the order they were saved --- 3-bits per
+ // register. The list of saved registers is assumed to be in reverse
+ // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ unsigned Reg = SavedRegs[i];
+ if (Reg == 0) break;
+
+ int CURegNum = getCompactUnwindRegNum(Reg);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for
+ // each register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc &&
+ "Invalid compact register encoding!");
+ return RegEnc;
+ }
+
+ /// \brief Create the permutation encoding used with frameless stacks. It is
+ /// passed the number of registers to be saved and an array of the registers
+ /// saved.
+ uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+ // The saved registers are numbered from 1 to 6. In order to encode the
+ // order in which they were saved, we re-number them according to their
+ // place in the register order. The re-numbering is relative to the last
+ // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+ // that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+ }
+
public:
- DarwinX86AsmBackend(const Target &T, StringRef CPU)
- : X86AsmBackend(T, CPU) { }
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+ bool Is64Bit)
+ : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ OffsetSize = Is64Bit ? 8 : 4;
+ MoveInstrSize = Is64Bit ? 3 : 2;
+ StackDivide = Is64Bit ? 8 : 4;
+ PushInstrSize = 1;
+ }
};
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+ bool SupportsCU;
public:
- DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
- : DarwinX86AsmBackend(T, CPU) {}
+ DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef CPU, bool SupportsCU)
+ : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
- object::mach::CTM_i386,
- object::mach::CSX86_ALL);
+ MachO::CPU_TYPE_I386,
+ MachO::CPU_SUBTYPE_I386_ALL);
+ }
+
+ /// \brief Generate the compact unwind encoding for the CFI instructions.
+ virtual uint32_t
+ generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+ return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
}
};
class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+ bool SupportsCU;
+ const MachO::CPUSubTypeX86 Subtype;
public:
- DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
- : DarwinX86AsmBackend(T, CPU) {
+ DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef CPU, bool SupportsCU,
+ MachO::CPUSubTypeX86 st)
+ : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU),
+ Subtype(st) {
HasReliableSymbolDifference = true;
}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
- object::mach::CTM_x86_64,
- object::mach::CSX86_ALL);
+ MachO::CPU_TYPE_X86_64, Subtype);
}
virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -445,15 +787,26 @@ public:
return false;
}
}
+
+ /// \brief Generate the compact unwind encoding for the CFI instructions.
+ virtual uint32_t
+ generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+ return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+ }
};
} // end anonymous namespace
-MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
+ StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return new DarwinX86_32AsmBackend(T, CPU);
+ return new DarwinX86_32AsmBackend(T, MRI, CPU,
+ TheTriple.isMacOSX() &&
+ !TheTriple.isMacOSXVersionLT(10, 7));
if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
return new WindowsX86AsmBackend(T, false, CPU);
@@ -462,11 +815,21 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, String
return new ELFX86_32AsmBackend(T, OSABI, CPU);
}
-MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+ const MCRegisterInfo &MRI,
+ StringRef TT,
+ StringRef CPU) {
Triple TheTriple(TT);
- if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return new DarwinX86_64AsmBackend(T, CPU);
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+ MachO::CPUSubTypeX86 CS =
+ StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+ .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+ .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+ return new DarwinX86_64AsmBackend(T, MRI, CPU,
+ TheTriple.isMacOSX() &&
+ !TheTriple.isMacOSXVersionLT(10, 7), CS);
+ }
if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
return new WindowsX86AsmBackend(T, true, CPU);
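For the frameless compact-unwind case added above, the saved-register order is packed into 10 bits by encodeCompactUnwindRegistersWithoutFrame(): each register is renumbered relative to the smaller registers pushed before it, and the results are combined in a mixed-radix (factorial-style) sum whose weights depend on how many registers were pushed. A self-contained sketch, assuming the input is already the list of compact-unwind register numbers (1..6) in push order and omitting the 6-slot SavedRegs bookkeeping and reversal the backend performs:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Encode the order of up to six saved registers as a 10-bit permutation
    // index, following the scheme in encodeCompactUnwindRegistersWithoutFrame().
    static uint32_t encodePermutation(const std::vector<unsigned> &regs) {
      const size_t n = regs.size();
      assert(n >= 1 && n <= 6);

      // Renumber each register relative to the smaller registers pushed before it.
      std::vector<unsigned> renum(n);
      for (size_t i = 0; i != n; ++i) {
        unsigned countless = 0;
        for (size_t j = 0; j != i; ++j)
          if (regs[j] < regs[i])
            ++countless;
        renum[i] = regs[i] - countless - 1;
      }

      switch (n) {
      case 6:
      case 5:  return 120 * renum[0] + 24 * renum[1] + 6 * renum[2] +
                        2 * renum[3] + renum[4];
      case 4:  return 60 * renum[0] + 12 * renum[1] + 3 * renum[2] + renum[3];
      case 3:  return 20 * renum[0] + 4 * renum[1] + renum[2];
      case 2:  return 5 * renum[0] + renum[1];
      default: return renum[0]; // n == 1
      }
    }

    int main() {
      // The {6, 2, 4, 5} ordering from the comment in the patch encodes to 320:
      // the renumbered values are {5, 1, 2, 2} and 60*5 + 12*1 + 3*2 + 2 == 320.
      assert(encodePermutation({6, 2, 4, 5}) == 320);
      assert(encodePermutation({1}) == 0); // a single saved register needs no ordering bits
      return 0;
    }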
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 25d1af3..1ef9814 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -354,6 +354,9 @@ namespace X86II {
// XOP9 - Prefix to exclude use of imm byte.
XOP9 = 21 << Op0Shift,
+ // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+ XOPA = 22 << Op0Shift,
+
//===------------------------------------------------------------------===//
// REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
// They are used to specify GPRs and SSE registers, 64-bit operand size,
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index b400b87..3ddd865 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -108,6 +108,15 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
case MCSymbolRefExpr::VK_None:
Type = ELF::R_X86_64_64;
break;
+ case MCSymbolRefExpr::VK_GOT:
+ Type = ELF::R_X86_64_GOT64;
+ break;
+ case MCSymbolRefExpr::VK_GOTOFF:
+ Type = ELF::R_X86_64_GOTOFF64;
+ break;
+ case MCSymbolRefExpr::VK_TPOFF:
+ Type = ELF::R_X86_64_TPOFF64;
+ break;
case MCSymbolRefExpr::VK_DTPOFF:
Type = ELF::R_X86_64_DTPOFF64;
break;
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index 8f4ab46..a3eb4fb 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -13,7 +13,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCRelocationInfo.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/ELF.h"
using namespace llvm;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 7815ae9..3861e1c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -59,10 +59,8 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
// for .S files on other systems. Perhaps this is because the file system
// wasn't always case preserving or something.
CommentString = "##";
- PCSymbol = ".";
SupportsDebugInformation = true;
- DwarfUsesInlineInfoSection = true;
UseDataRegionDirectives = MarkedJTDataRegions;
// Exceptions handling
@@ -92,8 +90,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
TextAlignFillValue = 0x90;
PrivateGlobalPrefix = ".L";
- WeakRefDirective = "\t.weak\t";
- PCSymbol = ".";
// Set up DWARF directives
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
@@ -139,6 +135,8 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
AssemblerDialect = AsmWriterFlavor;
TextAlignFillValue = 0x90;
+
+ AllowAtInName = true;
}
void X86MCAsmInfoGNUCOFF::anchor() { }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index b6b70fd..80979dd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -17,6 +17,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class Triple;
@@ -35,7 +36,7 @@ namespace llvm {
MCStreamer &Streamer) const;
};
- class X86ELFMCAsmInfo : public MCAsmInfo {
+ class X86ELFMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit X86ELFMCAsmInfo(const Triple &Triple);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8515879..7952607 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -564,7 +564,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned char VEX_W = 0;
// XOP: Use XOP prefix byte 0x8f instead of VEX.
- unsigned char XOP = 0;
+ bool XOP = false;
// VEX_5M (VEX m-mmmmm field):
//
@@ -574,7 +574,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b00011: implied 0F 3A leading opcode bytes
// 0b00100-0b11111: Reserved for future use
// 0b01000: XOP map select - 08h instructions with imm byte
- // 0b10001: XOP map select - 09h instructions with no imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ // 0b01010: XOP map select - 0Ah instructions with imm dword
unsigned char VEX_5M = 0x1;
// VEX_4V (VEX vvvv field): a register specifier
@@ -620,7 +621,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_W = 1;
if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
- XOP = 1;
+ XOP = true;
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
VEX_L = 1;
@@ -665,11 +666,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
case X86II::XOP9:
VEX_5M = 0x9;
break;
- case X86II::A6: // Bypass: Not used by VEX
- case X86II::A7: // Bypass: Not used by VEX
- case X86II::TB: // Bypass: Not used by VEX
- case 0:
- break; // No prefix!
+ case X86II::XOPA:
+ VEX_5M = 0xA;
+ break;
+ case X86II::TB: // VEX_5M/VEX_PP already correct
+ break;
}
@@ -786,8 +787,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_4V = getVEXRegisterEncoding(MI, CurOp);
if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
EVEX_V2 = 0x0;
+ CurOp++;
}
- CurOp++;
if (HasEVEX_K)
EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
@@ -868,11 +869,12 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
case X86II::MRM6r: case X86II::MRM7r:
// MRM0r-MRM7r instructions forms:
// dst(VEX_4V), src(ModR/M), imm8
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
- if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
- EVEX_V2 = 0x0;
- CurOp++;
-
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+ EVEX_V2 = 0x0;
+ CurOp++;
+ }
if (HasEVEX_K)
EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
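The EmitVEXOpcodePrefix() change above adds VEX_5M = 0xA for the new XOPA map. As the emitter-side counterpart to the decoder macros earlier in this patch, here is a hedged sketch of how the two payload bytes of a 3-byte XOP prefix are assembled from those fields (the escape byte is 0x8F; R/X/B and vvvv are written inverted; the function and parameter names are illustrative, not the emitter's API). It round-trips with the decoding sketch given after the X86DisassemblerDecoder.h hunk:

    #include <cassert>
    #include <cstdint>

    // Build bytes 2 and 3 of an XOP prefix. mapSelect is the 5-bit m-mmmmm value
    // (0x8, 0x9 or 0xA); r, x, b, w, l are single bits; vvvv is the 4-bit extra
    // register specifier; pp selects the implied 66/F3/F2 prefix (0-3).
    static void buildXopPrefix(uint8_t r, uint8_t x, uint8_t b, uint8_t mapSelect,
                               uint8_t w, uint8_t vvvv, uint8_t l, uint8_t pp,
                               uint8_t &byte1, uint8_t &byte2) {
      byte1 = ((~r & 1) << 7) | ((~x & 1) << 6) | ((~b & 1) << 5) | (mapSelect & 0x1f);
      byte2 = ((w & 1) << 7) | ((~vvvv & 0xf) << 3) | ((l & 1) << 2) | (pp & 3);
    }

    int main() {
      uint8_t b1, b2;
      // No extended registers, map 9, W = 0, vvvv unused (all ones once
      // inverted), L = 0, pp = 0.
      buildXopPrefix(0, 0, 0, 0x9, 0, 0, 0, 0, b1, b2);
      assert(b1 == 0xE9 && b2 == 0x78);
      return 0;
    }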
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index bd23ce4..1cbdafd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -368,7 +368,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
- return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+ return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
}
static MCInstPrinter *createX86MCInstPrinter(const Target &T,
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 2f459b4..41ae435 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -79,8 +79,10 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ StringRef TT, StringRef CPU);
/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index 75b5acf..209b1d0 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -17,7 +17,7 @@
using namespace llvm;
using namespace object;
-using namespace macho;
+using namespace MachO;
namespace {
class X86_64MachORelocationInfo : public MCRelocationInfo {
@@ -33,7 +33,7 @@ public:
StringRef SymName; SymI->getName(SymName);
uint64_t SymAddr; SymI->getAddress(SymAddr);
- RelocationEntry RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+ any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
bool isPCRel = Obj->getAnyRelocationPCRel(RE);
MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
@@ -43,44 +43,44 @@ public:
const MCExpr *Expr = 0;
switch(RelType) {
- case RIT_X86_64_TLV:
+ case X86_64_RELOC_TLV:
Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
break;
- case RIT_X86_64_Signed4:
+ case X86_64_RELOC_SIGNED_4:
Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
MCConstantExpr::Create(4, Ctx),
Ctx);
break;
- case RIT_X86_64_Signed2:
+ case X86_64_RELOC_SIGNED_2:
Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
MCConstantExpr::Create(2, Ctx),
Ctx);
break;
- case RIT_X86_64_Signed1:
+ case X86_64_RELOC_SIGNED_1:
Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
MCConstantExpr::Create(1, Ctx),
Ctx);
break;
- case RIT_X86_64_GOTLoad:
+ case X86_64_RELOC_GOT_LOAD:
Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
break;
- case RIT_X86_64_GOT:
+ case X86_64_RELOC_GOT:
Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
MCSymbolRefExpr::VK_GOTPCREL :
MCSymbolRefExpr::VK_GOT,
Ctx);
break;
- case RIT_X86_64_Subtractor:
+ case X86_64_RELOC_SUBTRACTOR:
{
RelocationRef RelNext;
Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext);
- RelocationEntry RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
+ any_relocation_info RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
// X86_64_SUBTRACTOR must be followed by a relocation of type
- // X86_64_RELOC_UNSIGNED .
+ // X86_64_RELOC_UNSIGNED.
// NOTE: Scattered relocations don't exist on x86_64.
unsigned RType = Obj->getAnyRelocationType(RENext);
- if (RType != RIT_X86_64_Unsigned)
+ if (RType != X86_64_RELOC_UNSIGNED)
report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
"X86_64_RELOC_SUBTRACTOR.");
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 64f005c..eb7c0b1 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -16,12 +16,11 @@
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
using namespace llvm;
-using namespace llvm::object;
namespace {
class X86MachObjectWriter : public MCMachObjectTargetWriter {
@@ -132,7 +131,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
if (Target.isAbsolute()) { // constant
// SymbolNum of 0 indicates the absolute section.
- Type = macho::RIT_X86_64_Unsigned;
+ Type = MachO::X86_64_RELOC_UNSIGNED;
Index = 0;
// FIXME: I believe this is broken, I don't think the linker can understand
@@ -141,26 +140,31 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
// is to use an absolute symbol (which we don't support yet).
if (IsPCRel) {
IsExtern = 1;
- Type = macho::RIT_X86_64_Branch;
+ Type = MachO::X86_64_RELOC_BRANCH;
}
} else if (Target.getSymB()) { // A - B + constant
const MCSymbol *A = &Target.getSymA()->getSymbol();
+ if (A->isTemporary())
+ A = &A->AliasedSymbol();
MCSymbolData &A_SD = Asm.getSymbolData(*A);
const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
const MCSymbol *B = &Target.getSymB()->getSymbol();
+ if (B->isTemporary())
+ B = &B->AliasedSymbol();
MCSymbolData &B_SD = Asm.getSymbolData(*B);
const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
// Neither symbol can be modified.
if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
- report_fatal_error("unsupported relocation of modified symbol");
+ report_fatal_error("unsupported relocation of modified symbol", false);
// We don't support PCrel relocations of differences. Darwin 'as' doesn't
// implement most of these correctly.
if (IsPCRel)
- report_fatal_error("unsupported pc-relative relocation of difference");
+ report_fatal_error("unsupported pc-relative relocation of difference",
+ false);
// The support for the situation where one or both of the symbols would
// require a local relocation is handled just like if the symbols were
@@ -173,7 +177,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
// single SIGNED relocation); reject it for now. Except the case where both
// symbols don't have a base, equal but both NULL.
if (A_Base == B_Base && A_Base)
- report_fatal_error("unsupported relocation with identical base");
+ report_fatal_error("unsupported relocation with identical base", false);
+
+ // A subtraction expression where both symbols are undefined is a
+ // non-relocatable expression.
+ if (A->isUndefined() && B->isUndefined())
+ report_fatal_error("unsupported relocation with subtraction expression",
+ false);
Value += Writer->getSymbolAddress(&A_SD, Layout) -
(A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
@@ -188,15 +198,15 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
IsExtern = 0;
}
- Type = macho::RIT_X86_64_Unsigned;
-
- macho::RelocationEntry MRE;
- MRE.Word0 = FixupOffset;
- MRE.Word1 = ((Index << 0) |
- (IsPCRel << 24) |
- (Log2Size << 25) |
- (IsExtern << 27) |
- (Type << 28));
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
Writer->addRelocation(Fragment->getParent(), MRE);
if (B_Base) {
@@ -207,7 +217,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
IsExtern = 0;
}
- Type = macho::RIT_X86_64_Subtractor;
+ Type = MachO::X86_64_RELOC_SUBTRACTOR;
} else {
const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
MCSymbolData &SD = Asm.getSymbolData(*Symbol);
@@ -252,11 +262,11 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
return;
} else {
report_fatal_error("unsupported relocation of variable '" +
- Symbol->getName() + "'");
+ Symbol->getName() + "'", false);
}
} else {
report_fatal_error("unsupported relocation of undefined symbol '" +
- Symbol->getName() + "'");
+ Symbol->getName() + "'", false);
}
MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
@@ -267,15 +277,16 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
// rewrite the movq to an leaq at link time if the symbol ends up in
// the same linkage unit.
if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
- Type = macho::RIT_X86_64_GOTLoad;
+ Type = MachO::X86_64_RELOC_GOT_LOAD;
else
- Type = macho::RIT_X86_64_GOT;
+ Type = MachO::X86_64_RELOC_GOT;
} else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
- Type = macho::RIT_X86_64_TLV;
+ Type = MachO::X86_64_RELOC_TLV;
} else if (Modifier != MCSymbolRefExpr::VK_None) {
- report_fatal_error("unsupported symbol modifier in relocation");
+ report_fatal_error("unsupported symbol modifier in relocation",
+ false);
} else {
- Type = macho::RIT_X86_64_Signed;
+ Type = MachO::X86_64_RELOC_SIGNED;
// The Darwin x86_64 relocation format has a problem where it cannot
// encode an address (L<foo> + <constant>) which is outside the atom
@@ -292,34 +303,40 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
// (the additional bias), but instead appear to just look at the final
// offset.
switch (-(Target.getConstant() + (1LL << Log2Size))) {
- case 1: Type = macho::RIT_X86_64_Signed1; break;
- case 2: Type = macho::RIT_X86_64_Signed2; break;
- case 4: Type = macho::RIT_X86_64_Signed4; break;
+ case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+ case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+ case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
}
}
} else {
if (Modifier != MCSymbolRefExpr::VK_None)
report_fatal_error("unsupported symbol modifier in branch "
- "relocation");
+ "relocation", false);
- Type = macho::RIT_X86_64_Branch;
+ Type = MachO::X86_64_RELOC_BRANCH;
}
} else {
if (Modifier == MCSymbolRefExpr::VK_GOT) {
- Type = macho::RIT_X86_64_GOT;
+ Type = MachO::X86_64_RELOC_GOT;
} else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
// GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
// case all we do is set the PCrel bit in the relocation entry; this is
// used with exception handling, for example. The source is required to
// include any necessary offset directly.
- Type = macho::RIT_X86_64_GOT;
+ Type = MachO::X86_64_RELOC_GOT;
IsPCRel = 1;
} else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
- report_fatal_error("TLVP symbol modifier should have been rip-rel");
+ report_fatal_error("TLVP symbol modifier should have been rip-rel",
+ false);
} else if (Modifier != MCSymbolRefExpr::VK_None)
- report_fatal_error("unsupported symbol modifier in relocation");
- else
- Type = macho::RIT_X86_64_Unsigned;
+ report_fatal_error("unsupported symbol modifier in relocation", false);
+ else {
+ Type = MachO::X86_64_RELOC_UNSIGNED;
+ unsigned Kind = Fixup.getKind();
+ if (Kind == X86::reloc_signed_4byte)
+ report_fatal_error("32-bit absolute addressing is not supported in "
+ "64-bit mode", false);
+ }
}
}
@@ -327,13 +344,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
FixedValue = Value;
// struct relocation_info (8 bytes)
- macho::RelocationEntry MRE;
- MRE.Word0 = FixupOffset;
- MRE.Word1 = ((Index << 0) |
- (IsPCRel << 24) |
- (Log2Size << 25) |
- (IsExtern << 27) |
- (Type << 28));
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
Writer->addRelocation(Fragment->getParent(), MRE);
}
@@ -347,7 +364,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
uint64_t &FixedValue) {
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
- unsigned Type = macho::RIT_Vanilla;
+ unsigned Type = MachO::GENERIC_RELOC_VANILLA;
// See <reloc.h>.
const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -355,7 +372,8 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
if (!A_SD->getFragment())
report_fatal_error("symbol '" + A->getName() +
- "' can not be undefined in a subtraction expression");
+ "' can not be undefined in a subtraction expression",
+ false);
uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
@@ -367,22 +385,23 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
if (!B_SD->getFragment())
report_fatal_error("symbol '" + B->getSymbol().getName() +
- "' can not be undefined in a subtraction expression");
+ "' can not be undefined in a subtraction expression",
+ false);
// Select the appropriate difference relocation type.
//
// Note that there is no longer any semantic difference between these two
// relocation types from the linkers point of view, this is done solely for
// pedantic compatibility with 'as'.
- Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
- (unsigned)macho::RIT_Generic_LocalDifference;
+ Type = A_SD->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF :
+ (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
Value2 = Writer->getSymbolAddress(B_SD, Layout);
FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
}
// Relocations are written out in reverse order, so the PAIR comes first.
- if (Type == macho::RIT_Difference ||
- Type == macho::RIT_Generic_LocalDifference) {
+ if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+ Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
// If the offset is too large to fit in a scattered relocation,
// we're hosed. It's an unfortunate limitation of the MachO format.
if (FixupOffset > 0xffffff) {
@@ -396,13 +415,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
llvm_unreachable("fatal error returned?!");
}
- macho::RelocationEntry MRE;
- MRE.Word0 = ((0 << 0) |
- (macho::RIT_Pair << 24) |
- (Log2Size << 28) |
- (IsPCRel << 30) |
- macho::RF_Scattered);
- MRE.Word1 = Value2;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((0 << 0) | // r_address
+ (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value2;
Writer->addRelocation(Fragment->getParent(), MRE);
} else {
// If the offset is more than 24-bits, it won't fit in a scattered
@@ -416,13 +435,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
return false;
}
- macho::RelocationEntry MRE;
- MRE.Word0 = ((FixupOffset << 0) |
- (Type << 24) |
- (Log2Size << 28) |
- (IsPCRel << 30) |
- macho::RF_Scattered);
- MRE.Word1 = Value;
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = ((FixupOffset << 0) |
+ (Type << 24) |
+ (Log2Size << 28) |
+ (IsPCRel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = Value;
Writer->addRelocation(Fragment->getParent(), MRE);
return true;
}
@@ -464,13 +483,13 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
}
// struct relocation_info (8 bytes)
- macho::RelocationEntry MRE;
- MRE.Word0 = Value;
- MRE.Word1 = ((Index << 0) |
- (IsPCRel << 24) |
- (Log2Size << 25) |
- (1 << 27) | // Extern
- (macho::RIT_Generic_TLV << 28)); // Type
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = Value;
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (1 << 27) | // r_extern
+ (MachO::GENERIC_RELOC_TLV << 28)); // r_type
Writer->addRelocation(Fragment->getParent(), MRE);
}
@@ -530,7 +549,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
//
// FIXME: Currently, these are never generated (see code below). I cannot
// find a case where they are actually emitted.
- Type = macho::RIT_Vanilla;
+ Type = MachO::GENERIC_RELOC_VANILLA;
} else {
// Resolve constant variables.
if (SD->getSymbol().isVariable()) {
@@ -561,17 +580,17 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
if (IsPCRel)
FixedValue -= Writer->getSectionAddress(Fragment->getParent());
- Type = macho::RIT_Vanilla;
+ Type = MachO::GENERIC_RELOC_VANILLA;
}
// struct relocation_info (8 bytes)
- macho::RelocationEntry MRE;
- MRE.Word0 = FixupOffset;
- MRE.Word1 = ((Index << 0) |
- (IsPCRel << 24) |
- (Log2Size << 25) |
- (IsExtern << 27) |
- (Type << 28));
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = FixupOffset;
+ MRE.r_word1 = ((Index << 0) |
+ (IsPCRel << 24) |
+ (Log2Size << 25) |
+ (IsExtern << 27) |
+ (Type << 28));
Writer->addRelocation(Fragment->getParent(), MRE);
}
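The r_word1 packing repeated throughout this file mirrors struct relocation_info from <mach-o/reloc.h>: 24 bits of symbol or section index, one pcrel bit, two length bits (log2 of the fixup size), one extern bit, and four type bits, while r_word0 simply carries the fixup offset. A minimal self-contained sketch of that packing; the helper name is made up, but the layout is exactly the shift pattern used above.

#include <cstdint>

// Pack a non-scattered Mach-O relocation word, mirroring the
// (Index | IsPCRel<<24 | Log2Size<<25 | IsExtern<<27 | Type<<28) pattern above.
inline uint32_t packRelocWord1(uint32_t SymbolNum, bool PCRel,
                               unsigned Log2Size, bool Extern, unsigned Type) {
  return (SymbolNum & 0xFFFFFF) |
         (uint32_t(PCRel)  << 24) |
         ((Log2Size & 0x3) << 25) |
         (uint32_t(Extern) << 27) |
         ((Type & 0xF)     << 28);
}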
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index ed64a32..6da4142 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -27,7 +27,7 @@ namespace {
public:
X86WinCOFFObjectWriter(bool Is64Bit_);
- ~X86WinCOFFObjectWriter();
+ virtual ~X86WinCOFFObjectWriter();
virtual unsigned getRelocType(const MCValue &Target,
const MCFixup &Fixup,
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 461ea9b..65c5552 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -50,10 +50,10 @@ def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
"Enable SSSE3 instructions",
[FeatureSSE3]>;
-def FeatureSSE41 : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
"Enable SSE 4.1 instructions",
[FeatureSSSE3]>;
-def FeatureSSE42 : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
"Enable SSE 4.2 instructions",
[FeatureSSE41]>;
def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
@@ -68,7 +68,7 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions",
[FeatureCMOV]>;
-def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
"64-bit with cmpxchg16b",
[Feature64Bit]>;
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
@@ -86,16 +86,16 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
-def FeatureAVX512 : SubtargetFeature<"avx-512", "X86SSELevel", "AVX512",
+def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
"Enable AVX-512 instructions",
[FeatureAVX2]>;
-def FeatureERI : SubtargetFeature<"avx-512-eri", "HasERI", "true",
+def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
"Enable AVX-512 Exponential and Reciprocal Instructions",
[FeatureAVX512]>;
-def FeatureCDI : SubtargetFeature<"avx-512-cdi", "HasCDI", "true",
+def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
-def FeaturePFI : SubtargetFeature<"avx-512-pfi", "HasPFI", "true",
+def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
@@ -117,12 +117,15 @@ def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
"Enable AES instructions",
[FeatureSSE2]>;
+def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
+ "Enable TBM instructions">;
def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
-def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
"Support RDRAND instruction">;
def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
- "Support 16-bit floating point conversion instructions">;
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
"Support FS/GS Base instructions">;
def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
@@ -137,6 +140,9 @@ def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
"Support HLE">;
def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
"Support ADX instructions">;
+def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
+ "Enable SHA instructions",
+ [FeatureSSE2]>;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
@@ -163,6 +169,8 @@ include "X86Schedule.td"
def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
"Intel Atom processors">;
+def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+ "Intel Silvermont processors">;
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
@@ -206,6 +214,14 @@ def : ProcessorModel<"atom", AtomModel,
FeatureLEAUsesAG,
FeaturePadShortFunctions]>;
+// Atom Silvermont.
+def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM,
+ FeatureSSE42, FeatureCMPXCHG16B,
+ FeatureMOVBE, FeaturePOPCNT,
+ FeaturePCLMUL, FeatureAES,
+ FeatureCallRegIndirect,
+ FeaturePRFCHW,
+ FeatureSlowBTMem]>;
// "Arrandale" along with corei3 and corei5
def : ProcessorModel<"corei7", SandyBridgeModel,
[FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
@@ -276,21 +292,30 @@ def : Proc<"amdfam10", [FeatureSSE4A,
FeaturePOPCNT, FeatureSlowBTMem]>;
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
// Jaguar
def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
- FeatureAES, FeaturePCLMUL, FeatureBMI,
- FeatureF16C, FeatureMOVBE, FeatureLZCNT,
- FeaturePOPCNT]>;
+ FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
+ FeatureBMI, FeatureF16C, FeatureMOVBE,
+ FeatureLZCNT, FeaturePOPCNT]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
- FeatureAES, FeaturePCLMUL,
+ FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
FeatureLZCNT, FeaturePOPCNT]>;
// Piledriver
def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
- FeatureAES, FeaturePCLMUL,
+ FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
- FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+ FeaturePOPCNT, FeatureBMI, FeatureTBM,
+ FeatureFMA]>;
+
+// Steamroller
+def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+ FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
+ FeatureF16C, FeatureLZCNT,
+ FeaturePOPCNT, FeatureBMI, FeatureTBM,
+ FeatureFMA, FeatureFSGSBase]>;
+
def : Proc<"geode", [Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 9e0ab82..1258441 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -96,7 +96,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
- GVSym = Mang->getSymbol(GV);
+ GVSym = getSymbol(GV);
// Handle dllimport linkage.
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
@@ -109,21 +109,21 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
} else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
MachineModuleInfoImpl::StubValueTy &StubSym =
MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
} else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
MachineModuleInfoImpl::StubValueTy &StubSym =
MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
if (StubSym.getPointer() == 0)
StubSym = MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+ StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
}
// If the name begins with a dollar-sign, enclose it in parens. We do this
@@ -333,21 +333,21 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
const MachineOperand &IndexReg = MI->getOperand(Op+2);
const MachineOperand &DispSpec = MI->getOperand(Op+3);
const MachineOperand &SegReg = MI->getOperand(Op+4);
-
+
// If this has a segment register, print it.
if (SegReg.getReg()) {
printOperand(MI, Op+4, O, Modifier, AsmVariant);
O << ':';
}
-
+
O << '[';
-
+
bool NeedPlus = false;
if (BaseReg.getReg()) {
printOperand(MI, Op, O, Modifier, AsmVariant);
NeedPlus = true;
}
-
+
if (IndexReg.getReg()) {
if (NeedPlus) O << " + ";
if (ScaleVal != 1)
@@ -394,6 +394,7 @@ bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
Reg = getX86SubSuperRegister(Reg, MVT::i32);
break;
case 'q': // Print DImode register
+ // FIXME: gcc will actually print e instead of r for 32-bit.
Reg = getX86SubSuperRegister(Reg, MVT::i64);
break;
}
@@ -518,6 +519,27 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
if (Subtarget->isTargetEnvMacho())
OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+ if (Subtarget->isTargetCOFF()) {
+ // Emit an absolute @feat.00 symbol. This appears to be some kind of
+ // compiler features bitfield read by link.exe.
+ if (!Subtarget->is64Bit()) {
+ MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00"));
+ OutStreamer.BeginCOFFSymbolDef(S);
+ OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+ OutStreamer.EndCOFFSymbolDef();
+ // According to the PE-COFF spec, the LSB of this value marks the object
+ // for "registered SEH". This means that all SEH handler entry points
+ // must be registered in .sxdata. Use of any unregistered handlers will
+ // cause the process to terminate immediately. LLVM does not know how to
+ // register any SEH handlers, so its object files should be safe.
+ S->setAbsolute();
+ OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+ OutStreamer.EmitAssignment(
+ S, MCConstantExpr::Create(int64_t(1), MMI->getContext()));
+ }
+ }
}
@@ -606,6 +628,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer.AddBlankLine();
}
+ SM.serializeToStackMapSection();
+
// Funny Darwin hack: This flag tells the linker that no global symbols
// contain code that falls through to other global symbols (e.g. the obvious
// implementation of multiple entry points). If this doesn't occur, the
@@ -645,12 +669,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
if (I->hasDLLExportLinkage())
- DLLExportedFns.push_back(Mang->getSymbol(I));
+ DLLExportedFns.push_back(getSymbol(I));
for (Module::const_global_iterator I = M.global_begin(),
E = M.global_end(); I != E; ++I)
if (I->hasDLLExportLinkage())
- DLLExportedGlobals.push_back(Mang->getSymbol(I));
+ DLLExportedGlobals.push_back(getSymbol(I));
// Output linker support code for dllexported globals on windows.
if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
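For the @feat.00 emission above, the only behavior the patch relies on is that bit 0 of the symbol's value marks the object as using registered ("safe") SEH. A tiny sketch of that flag word with a made-up constant name; every other bit is deliberately left clear, matching the EmitAssignment of the constant 1 above.

#include <cstdint>

// Hypothetical helper mirroring the value assigned to @feat.00 above.
// Bit 0 = registered/safe SEH, per the PE-COFF note in the patch; no other
// feature bits are claimed here.
inline uint64_t featDotZeroZeroValue() {
  const uint64_t RegisteredSEH = 1u << 0; // assumed name, value from the patch
  return RegisteredSEH;
}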
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 6eed5ce..24a768b 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -24,9 +25,21 @@ class MCStreamer;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget;
+ StackMaps SM;
+
+ // Parses operands of PATCHPOINT and STACKMAP to produce stack map Location
+ // structures. Returns a result location and an iterator to the operand
+ // immediately following the operands consumed.
+ //
+ // This method is implemented in X86MCInstLower.cpp.
+ static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+ stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE,
+ const TargetMachine &TM);
+
public:
explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer) {
+ : AsmPrinter(TM, Streamer), SM(*this, stackmapOperandParser) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
}
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 0000000..e76f9fd
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,35 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86CALLINGCONV_H
+#define X86CALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+ CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+ CCState &) {
+ llvm_unreachable("The AnyReg calling convention is only supported by the " \
+ "stackmap and patchpoint intrinsics.");
+  // Gracefully fall back to the X86 C calling convention on Release builds.
+ return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 38e2591..a78b5c0 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -151,6 +151,26 @@ def RetCC_X86_64_HiPE : CallingConv<[
CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
]>;
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+ // Promote all types to i64
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Return: RAX
+ CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fall back to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
// This is the root return-value convention for the X86-32 backend.
def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
@@ -167,6 +187,10 @@ def RetCC_X86_64 : CallingConv<[
// HiPE uses RetCC_X86_64_HiPE
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+ // Handle JavaScript calls.
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
// Handle explicit CC selection
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
@@ -279,10 +303,10 @@ def CC_X86_Win64_C : CallingConv<[
// The first 4 integer arguments are passed in integer registers.
CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
[XMM0, XMM1, XMM2, XMM3]>>,
-
+
// Do not pass the sret argument in RCX, the Win64 thiscall calling
- // convention requires "this" to be passed in RCX.
- CCIfCC<"CallingConv::X86_ThisCall",
+ // convention requires "this" to be passed in RCX.
+ CCIfCC<"CallingConv::X86_ThisCall",
CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ],
[XMM1, XMM2, XMM3]>>>>,
@@ -329,6 +353,25 @@ def CC_X86_64_HiPE : CallingConv<[
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
]>;
+def CC_X86_64_WebKit_JS : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // Integer/FP values are always stored in stack slots that are 8 bytes in size
+ // and 8-byte aligned.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fall back to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+ CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
//===----------------------------------------------------------------------===//
// X86 C Calling Convention
//===----------------------------------------------------------------------===//
@@ -354,7 +397,7 @@ def CC_X86_32_Common : CallingConv<[
// Integer/Float values get stored in stack slots that are 4 bytes in
// size and 4-byte aligned.
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-
+
// Doubles get 8-byte slots that are 4-byte aligned.
CCIfType<[f64], CCAssignToStack<8, 4>>,
@@ -520,6 +563,8 @@ def CC_X86_32 : CallingConv<[
def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+ CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
@@ -558,11 +603,11 @@ def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
// Standard C + YMM6-15
def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
- R13, R14, R15,
+ R13, R14, R15,
(sequence "YMM%u", 6, 15))>;
def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
- R12, R13, R14, R15,
+ R12, R13, R14, R15,
(sequence "ZMM%u", 6, 21),
K4, K5, K6, K7)>;
//Standard C + XMM 8-15
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 5d72b44..14385ed 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -840,7 +840,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
unsigned char VEX_W = 0;
// XOP: Use XOP prefix byte 0x8f instead of VEX.
- unsigned char XOP = 0;
+ bool XOP = false;
// VEX_5M (VEX m-mmmmm field):
//
@@ -850,7 +850,8 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
// 0b00011: implied 0F 3A leading opcode bytes
// 0b00100-0b11111: Reserved for future use
// 0b01000: XOP map select - 08h instructions with imm byte
- // 0b10001: XOP map select - 09h instructions with no imm byte
+ // 0b01001: XOP map select - 09h instructions with no imm byte
+ // 0b01010: XOP map select - 0Ah instructions with imm dword
unsigned char VEX_5M = 0x1;
// VEX_4V (VEX vvvv field): a register specifier
@@ -882,7 +883,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
VEX_W = 1;
if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
- XOP = 1;
+ XOP = true;
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
VEX_L = 1;
@@ -919,11 +920,11 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
case X86II::XOP9:
VEX_5M = 0x9;
break;
- case X86II::A6: // Bypass: Not used by VEX
- case X86II::A7: // Bypass: Not used by VEX
- case X86II::TB: // Bypass: Not used by VEX
- case 0:
- break; // No prefix!
+ case X86II::XOPA:
+ VEX_5M = 0xA;
+ break;
+ case X86II::TB: // VEX_5M/VEX_PP already correct
+ break;
}
@@ -982,11 +983,14 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
// FMA4:
// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
// dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
- if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
+ CurOp++;
- if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, 1);
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ CurOp++;
+ }
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -996,7 +1000,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
VEX_X = 0x0;
if (HasVEX_4VOp3)
- VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
break;
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
@@ -1006,7 +1010,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
// MemAddr
// src1(VEX_4V), MemAddr
if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, 0);
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
if (X86II::isX86_64ExtendedReg(
MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -1059,8 +1063,10 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
case X86II::MRM6r: case X86II::MRM7r:
// MRM0r-MRM7r instructions forms:
// dst(VEX_4V), src(ModR/M), imm8
- VEX_4V = getVEXRegisterEncoding(MI, 0);
- if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+ CurOp++;
+
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_B = 0x0;
break;
default: // RawFrm
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 5bc3420..97f96ab 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "X86.h"
+#include "X86CallingConv.h"
#include "X86ISelLowering.h"
#include "X86InstrBuilder.h"
#include "X86RegisterInfo.h"
@@ -125,6 +126,8 @@ private:
return static_cast<const X86TargetMachine *>(&TM);
}
+ bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
unsigned TargetMaterializeConstant(const Constant *C);
unsigned TargetMaterializeAlloca(const AllocaInst *C);
@@ -344,9 +347,126 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
return true;
}
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+ // Handle constant address.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ // Can't handle TLS yet.
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+ // it works...).
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ if (const GlobalVariable *GVar =
+ dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+ if (GVar->isThreadLocal())
+ return false;
+
+ // RIP-relative addresses can't have additional register operands, so if
+ // we've already folded stuff into the addressing mode, just force the
+ // global value into its own register, which we can use as the basereg.
+ if (!Subtarget->isPICStyleRIPRel() ||
+ (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+ // Okay, we've committed to selecting this global. Set up the address.
+ AM.GV = GV;
+
+ // Allow the subtarget to classify the global.
+ unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+ // If this reference is relative to the pic base, set it now.
+ if (isGlobalRelativeToPICBase(GVFlags)) {
+ // FIXME: How do we know Base.Reg is free??
+ AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+ }
+
+ // Unless the ABI requires an extra load, return a direct reference to
+ // the global.
+ if (!isGlobalStubReference(GVFlags)) {
+ if (Subtarget->isPICStyleRIPRel()) {
+ // Use rip-relative addressing if we can. Above we verified that the
+ // base and index registers are unused.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+ AM.GVOpFlags = GVFlags;
+ return true;
+ }
+
+ // Ok, we need to do a load from a stub. If we've already loaded from
+ // this stub, reuse the loaded pointer, otherwise emit the load now.
+ DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+ unsigned LoadReg;
+ if (I != LocalValueMap.end() && I->second != 0) {
+ LoadReg = I->second;
+ } else {
+ // Issue load from stub.
+ unsigned Opc = 0;
+ const TargetRegisterClass *RC = NULL;
+ X86AddressMode StubAM;
+ StubAM.Base.Reg = AM.Base.Reg;
+ StubAM.GV = GV;
+ StubAM.GVOpFlags = GVFlags;
+
+ // Prepare for inserting code in the local-value area.
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ if (TLI.getPointerTy() == MVT::i64) {
+ Opc = X86::MOV64rm;
+ RC = &X86::GR64RegClass;
+
+ if (Subtarget->isPICStyleRIPRel())
+ StubAM.Base.Reg = X86::RIP;
+ } else {
+ Opc = X86::MOV32rm;
+ RC = &X86::GR32RegClass;
+ }
+
+ LoadReg = createResultReg(RC);
+ MachineInstrBuilder LoadMI =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+ addFullAddress(LoadMI, StubAM);
+
+ // Ok, back to normal mode.
+ leaveLocalValueArea(SaveInsertPt);
+
+ // Prevent loading GV stub multiple times in same MBB.
+ LocalValueMap[V] = LoadReg;
+ }
+
+ // Now construct the final address. Note that the Disp, Scale,
+ // and Index values may already be set here.
+ AM.Base.Reg = LoadReg;
+ AM.GV = 0;
+ return true;
+ }
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+ if (AM.Base.Reg == 0) {
+ AM.Base.Reg = getRegForValue(V);
+ return AM.Base.Reg != 0;
+ }
+ if (AM.IndexReg == 0) {
+ assert(AM.Scale == 1 && "Scale with no index!");
+ AM.IndexReg = getRegForValue(V);
+ return AM.IndexReg != 0;
+ }
+ }
+
+ return false;
+}
+
/// X86SelectAddress - Attempt to fill in an address from the given value.
///
bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+ SmallVector<const Value *, 32> GEPs;
+redo_gep:
const User *U = NULL;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(V)) {
@@ -441,13 +561,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
Disp += CI->getSExtValue() * S;
break;
}
- if (isa<AddOperator>(Op) &&
- (!isa<Instruction>(Op) ||
- FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
- == FuncInfo.MBB) &&
- isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
- // An add (in the same block) with a constant operand. Fold the
- // constant.
+ if (canFoldAddIntoGEP(U, Op)) {
+ // A compatible add with a constant operand. Fold the constant.
ConstantInt *CI =
cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
Disp += CI->getSExtValue() * S;
@@ -469,139 +584,43 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
goto unsupported_gep;
}
}
+
// Check for displacement overflow.
if (!isInt<32>(Disp))
break;
- // Ok, the GEP indices were covered by constant-offset and scaled-index
- // addressing. Update the address state and move on to examining the base.
+
AM.IndexReg = IndexReg;
AM.Scale = Scale;
AM.Disp = (uint32_t)Disp;
- if (X86SelectAddress(U->getOperand(0), AM))
+ GEPs.push_back(V);
+
+ if (const GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+ // Ok, the GEP indices were covered by constant-offset and scaled-index
+ // addressing. Update the address state and move on to examining the base.
+ V = GEP;
+ goto redo_gep;
+ } else if (X86SelectAddress(U->getOperand(0), AM)) {
return true;
+ }
// If we couldn't merge the gep value into this addr mode, revert back to
// our address and just match the value instead of completely failing.
AM = SavedAM;
- break;
- unsupported_gep:
- // Ok, the GEP indices weren't all covered.
- break;
- }
- }
-
- // Handle constant address.
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
- // Can't handle alternate code models yet.
- if (TM.getCodeModel() != CodeModel::Small)
- return false;
- // Can't handle TLS yet.
- if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- if (GVar->isThreadLocal())
- return false;
-
- // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
- // it works...).
- if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
- if (const GlobalVariable *GVar =
- dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
- if (GVar->isThreadLocal())
- return false;
-
- // RIP-relative addresses can't have additional register operands, so if
- // we've already folded stuff into the addressing mode, just force the
- // global value into its own register, which we can use as the basereg.
- if (!Subtarget->isPICStyleRIPRel() ||
- (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
- // Okay, we've committed to selecting this global. Set up the address.
- AM.GV = GV;
-
- // Allow the subtarget to classify the global.
- unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
-
- // If this reference is relative to the pic base, set it now.
- if (isGlobalRelativeToPICBase(GVFlags)) {
- // FIXME: How do we know Base.Reg is free??
- AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
- }
-
- // Unless the ABI requires an extra load, return a direct reference to
- // the global.
- if (!isGlobalStubReference(GVFlags)) {
- if (Subtarget->isPICStyleRIPRel()) {
- // Use rip-relative addressing if we can. Above we verified that the
- // base and index registers are unused.
- assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
- AM.Base.Reg = X86::RIP;
- }
- AM.GVOpFlags = GVFlags;
+ for (SmallVectorImpl<const Value *>::reverse_iterator
+ I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
+ if (handleConstantAddresses(*I, AM))
return true;
- }
-
- // Ok, we need to do a load from a stub. If we've already loaded from
- // this stub, reuse the loaded pointer, otherwise emit the load now.
- DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
- unsigned LoadReg;
- if (I != LocalValueMap.end() && I->second != 0) {
- LoadReg = I->second;
- } else {
- // Issue load from stub.
- unsigned Opc = 0;
- const TargetRegisterClass *RC = NULL;
- X86AddressMode StubAM;
- StubAM.Base.Reg = AM.Base.Reg;
- StubAM.GV = GV;
- StubAM.GVOpFlags = GVFlags;
-
- // Prepare for inserting code in the local-value area.
- SavePoint SaveInsertPt = enterLocalValueArea();
-
- if (TLI.getPointerTy() == MVT::i64) {
- Opc = X86::MOV64rm;
- RC = &X86::GR64RegClass;
- if (Subtarget->isPICStyleRIPRel())
- StubAM.Base.Reg = X86::RIP;
- } else {
- Opc = X86::MOV32rm;
- RC = &X86::GR32RegClass;
- }
-
- LoadReg = createResultReg(RC);
- MachineInstrBuilder LoadMI =
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
- addFullAddress(LoadMI, StubAM);
-
- // Ok, back to normal mode.
- leaveLocalValueArea(SaveInsertPt);
-
- // Prevent loading GV stub multiple times in same MBB.
- LocalValueMap[V] = LoadReg;
- }
-
- // Now construct the final address. Note that the Disp, Scale,
- // and Index values may already be set here.
- AM.Base.Reg = LoadReg;
- AM.GV = 0;
- return true;
- }
+ return false;
+ unsupported_gep:
+ // Ok, the GEP indices weren't all covered.
+ break;
}
-
- // If all else fails, try to materialize the value in a register.
- if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
- if (AM.Base.Reg == 0) {
- AM.Base.Reg = getRegForValue(V);
- return AM.Base.Reg != 0;
- }
- if (AM.IndexReg == 0) {
- assert(AM.Scale == 1 && "Scale with no index!");
- AM.IndexReg = getRegForValue(V);
- return AM.IndexReg != 0;
- }
}
- return false;
+ return handleConstantAddresses(V, AM);
}
/// X86SelectCallAddress - Attempt to fill in an address from the given value.
@@ -609,9 +628,35 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
const User *U = NULL;
unsigned Opcode = Instruction::UserOp1;
- if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ const Instruction *I = dyn_cast<Instruction>(V);
+ // Record if the value is defined in the same basic block.
+ //
+ // This information is crucial to know whether or not folding an
+ // operand is valid.
+ // Indeed, FastISel generates or reuses a virtual register for all
+ // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register; otherwise the produced
+ // code is incorrect.
+ // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+ // registers for values that are alive across basic blocks. This ensures
+  // that the values are consistently set across basic blocks, even
+ // if different instruction selection mechanisms are used (e.g., a mix of
+ // SDISel and FastISel).
+ // For values local to a basic block, the instruction selection process
+ // generates these virtual registers with whatever method is appropriate
+ // for its needs. In particular, FastISel and SDISel do not share the way
+ // local virtual registers are set.
+  // Therefore, it is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guaranteed for X86.
+  // Moreover, things like hasOneUse could not be used accurately if we
+  // allowed references to values across basic blocks, whereas they are not
+ // alive across basic blocks initially.
+ bool InMBB = true;
+ if (I) {
Opcode = I->getOpcode();
U = I;
+ InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
} else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
Opcode = C->getOpcode();
U = C;
@@ -620,18 +665,22 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
switch (Opcode) {
default: break;
case Instruction::BitCast:
- // Look past bitcasts.
- return X86SelectCallAddress(U->getOperand(0), AM);
+ // Look past bitcasts if its operand is in the same BB.
+ if (InMBB)
+ return X86SelectCallAddress(U->getOperand(0), AM);
+ break;
case Instruction::IntToPtr:
- // Look past no-op inttoptrs.
- if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ // Look past no-op inttoptrs if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
return X86SelectCallAddress(U->getOperand(0), AM);
break;
case Instruction::PtrToInt:
- // Look past no-op ptrtoints.
- if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ // Look past no-op ptrtoints if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(U->getType()) == TLI.getPointerTy())
return X86SelectCallAddress(U->getOperand(0), AM);
break;
}
@@ -1026,7 +1075,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
return false;
// Handle zero-extension from i1 to i8, which is common.
- MVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()).getSimpleVT();
+ MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
if (SrcVT.SimpleTy == MVT::i1) {
// Set the high bits to zero.
ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
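One way to read the X86SelectAddress restructuring above: nested GEPs are now peeled iteratively (the GEPs vector plus the redo_gep label) rather than by recursion, and if the innermost base cannot be selected, the recorded GEPs are retried innermost first through handleConstantAddresses. A schematic stand-alone sketch of that control flow with stand-in types; it is not the FastISel code itself.

#include <vector>

// Stand-ins for the real types: a node is "foldable" if its offsets can be
// absorbed into the address, and Base is the operand it is built on.
struct AddrNode {
  const AddrNode *Base;
  bool FoldableOffsets;
};

// handleConstant plays the role handleConstantAddresses plays in the patch:
// it is the fallback used when a node must become the base register itself.
bool selectAddressSketch(const AddrNode *N,
                         bool (*handleConstant)(const AddrNode *)) {
  std::vector<const AddrNode *> Peeled;
  // Peel foldable nodes iteratively; corresponds to GEPs.push_back + redo_gep.
  while (N && N->FoldableOffsets) {
    Peeled.push_back(N);
    N = N->Base;
  }
  // Try the innermost base first.
  if (N && handleConstant(N))
    return true;
  // Otherwise retry the peeled nodes, innermost first (reverse order).
  for (std::vector<const AddrNode *>::const_reverse_iterator
         I = Peeled.rbegin(), E = Peeled.rend(); I != E; ++I)
    if (handleConstant(*I))
      return true;
  return false;
}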
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index d21cb8a..38a8351 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -125,6 +125,15 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
// which requires isImm() to be true
return 0;
}
+ break;
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) {
+      // If src1 != src2, then convertToThreeAddress will
+      // need to create a virtual register, which we cannot do
+ // after register allocation.
+ return 0;
+ }
}
return TII->convertToThreeAddress(MFI, MBBI, 0);
}
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index b994e67..a06ba9d 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -365,270 +365,13 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
}
}
-/// getCompactUnwindRegNum - Get the compact unwind number for a given
-/// register. The number corresponds to the enum lists in
-/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
- for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
- if (*CURegs == Reg)
- return Idx;
-
- return -1;
-}
-
-// Number of registers that can be saved in a compact unwind encoding.
-#define CU_NUM_SAVED_REGS 6
-
-/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
-/// used with frameless stacks. It is passed the number of registers to be saved
-/// and an array of the registers saved.
-static uint32_t
-encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
- unsigned RegCount, bool Is64Bit) {
- // The saved registers are numbered from 1 to 6. In order to encode the order
- // in which they were saved, we re-number them according to their place in the
- // register order. The re-numbering is relative to the last re-numbered
- // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
- //
- // Orig Re-Num
- // ---- ------
- // 6 6
- // 2 2
- // 4 3
- // 5 3
- //
- for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
- int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
- if (CUReg == -1) return ~0U;
- SavedRegs[i] = CUReg;
- }
-
- // Reverse the list.
- std::swap(SavedRegs[0], SavedRegs[5]);
- std::swap(SavedRegs[1], SavedRegs[4]);
- std::swap(SavedRegs[2], SavedRegs[3]);
-
- uint32_t RenumRegs[CU_NUM_SAVED_REGS];
- for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) {
- unsigned Countless = 0;
- for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
- if (SavedRegs[j] < SavedRegs[i])
- ++Countless;
-
- RenumRegs[i] = SavedRegs[i] - Countless - 1;
- }
-
- // Take the renumbered values and encode them into a 10-bit number.
- uint32_t permutationEncoding = 0;
- switch (RegCount) {
- case 6:
- permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
- + 6 * RenumRegs[2] + 2 * RenumRegs[3]
- + RenumRegs[4];
- break;
- case 5:
- permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
- + 6 * RenumRegs[3] + 2 * RenumRegs[4]
- + RenumRegs[5];
- break;
- case 4:
- permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
- + 3 * RenumRegs[4] + RenumRegs[5];
- break;
- case 3:
- permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
- + RenumRegs[5];
- break;
- case 2:
- permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
- break;
- case 1:
- permutationEncoding |= RenumRegs[5];
- break;
- }
-
- assert((permutationEncoding & 0x3FF) == permutationEncoding &&
- "Invalid compact register encoding!");
- return permutationEncoding;
-}
-
-/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
-/// compact encoding with a frame pointer.
-static uint32_t
-encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
- bool Is64Bit) {
- // Encode the registers in the order they were saved, 3-bits per register. The
- // registers are numbered from 1 to CU_NUM_SAVED_REGS.
- uint32_t RegEnc = 0;
- for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
- unsigned Reg = SavedRegs[I];
- if (Reg == 0) continue;
-
- int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
- if (CURegNum == -1) return ~0U;
-
- // Encode the 3-bit register number in order, skipping over 3-bits for each
- // register.
- RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
- }
-
- assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
- return RegEnc;
-}
-
-uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
- const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
- unsigned FramePtr = RegInfo->getFrameRegister(MF);
- unsigned StackPtr = RegInfo->getStackRegister();
-
- bool Is64Bit = STI.is64Bit();
- bool HasFP = hasFP(MF);
-
- unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 };
- unsigned SavedRegIdx = 0;
-
- unsigned OffsetSize = (Is64Bit ? 8 : 4);
-
- unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
- unsigned PushInstrSize = 1;
- unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
- unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
- unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
-
- unsigned StackDivide = (Is64Bit ? 8 : 4);
-
- unsigned InstrOffset = 0;
- unsigned StackAdjust = 0;
- unsigned StackSize = 0;
-
- MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
- bool ExpectEnd = false;
- for (MachineBasicBlock::iterator
- MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
- MachineInstr &MI = *MBBI;
- unsigned Opc = MI.getOpcode();
- if (Opc == X86::PROLOG_LABEL) continue;
- if (!MI.getFlag(MachineInstr::FrameSetup)) break;
-
- // We don't expect any more prolog instructions.
- if (ExpectEnd) return CU::UNWIND_MODE_DWARF;
-
- if (Opc == PushInstr) {
- // If there are too many saved registers, we cannot use compact encoding.
- if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
-
- unsigned Reg = MI.getOperand(0).getReg();
- if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
- ExpectEnd = true;
- continue;
- }
-
- SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
- StackAdjust += OffsetSize;
- InstrOffset += PushInstrSize;
- } else if (Opc == MoveInstr) {
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
-
- if (DstReg != FramePtr || SrcReg != StackPtr)
- return CU::UNWIND_MODE_DWARF;
-
- StackAdjust = 0;
- memset(SavedRegs, 0, sizeof(SavedRegs));
- SavedRegIdx = 0;
- InstrOffset += MoveInstrSize;
- } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
- Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
- if (StackSize)
- // We already have a stack size.
- return CU::UNWIND_MODE_DWARF;
-
- if (!MI.getOperand(0).isReg() ||
- MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
- MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
- // We need this to be a stack adjustment pointer. Something like:
- //
- // %RSP<def> = SUB64ri8 %RSP, 48
- return CU::UNWIND_MODE_DWARF;
-
- StackSize = MI.getOperand(2).getImm() / StackDivide;
- SubtractInstrIdx += InstrOffset;
- ExpectEnd = true;
- }
- }
-
- // Encode that we are using EBP/RBP as the frame pointer.
- uint32_t CompactUnwindEncoding = 0;
- StackAdjust /= StackDivide;
- if (HasFP) {
- if ((StackAdjust & 0xFF) != StackAdjust)
- // Offset was too big for compact encoding.
- return CU::UNWIND_MODE_DWARF;
-
- // Get the encoding of the saved registers when we have a frame pointer.
- uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
- if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
-
- CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
- CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
- CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
- } else {
- ++StackAdjust;
- uint32_t TotalStackSize = StackAdjust + StackSize;
- if ((TotalStackSize & 0xFF) == TotalStackSize) {
- // Frameless stack with a small stack size.
- CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
-
- // Encode the stack size.
- CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
- } else {
- if ((StackAdjust & 0x7) != StackAdjust)
- // The extra stack adjustments are too big for us to handle.
- return CU::UNWIND_MODE_DWARF;
-
- // Frameless stack with an offset too large for us to encode compactly.
- CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
-
- // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
- // instruction.
- CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
-
- // Encode any extra stack adjustments (done via push instructions).
- CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
- }
-
- // Encode the number of registers saved.
- CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
-
- // Get the encoding of the saved registers when we don't have a frame
- // pointer.
- uint32_t RegEnc =
- encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
- Is64Bit);
- if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
-
- // Encode the register encoding.
- CompactUnwindEncoding |=
- RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
- }
-
- return CompactUnwindEncoding;
-}
-
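// A compilable sketch, assuming the CU::UNWIND_* constants from the header
// block removed further below, of how getCompactUnwindEncoding above packs the
// frameless "small stack" case: the mode lives in the top byte, the total
// stack size (in pointer-sized slots) in bits 23:16, the number of pushed
// registers in bits 12:10, and the 10-bit register permutation in bits 9:0.
// The function name is illustrative, not part of the LLVM API.
#include <cstdint>

static uint32_t buildFramelessStackImmdWord(uint32_t StackSlots,
                                            uint32_t SavedRegCount,
                                            uint32_t Permutation) {
  const uint32_t UNWIND_MODE_STACK_IMMD = 0x02000000;   // mode byte
  uint32_t Enc = UNWIND_MODE_STACK_IMMD;
  Enc |= (StackSlots & 0xFF) << 16;    // stack size / slot size, max 255 slots
  Enc |= (SavedRegCount & 0x7) << 10;  // up to six saved registers
  Enc |= Permutation & 0x3FF;          // permutation from the frameless encoder
  return Enc;
}
// In the frame-pointer case the same 23:16 field instead carries the stack
// adjustment, and the low 15 bits carry the with-frame register list, per the
// UNWIND_MODE_BP_FRAME branch above.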
/// usesTheStack - This function checks if any of the users of EFLAGS
/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
/// to use the stack, and if we don't adjust the stack we clobber the first
/// frame index.
/// See X86InstrInfo::copyPhysReg.
-static bool usesTheStack(MachineFunction &MF) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
+static bool usesTheStack(const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
re = MRI.reg_end(); ri != re; ++ri)
@@ -863,7 +606,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// responsible for adjusting the stack pointer. Touching the stack at 4K
// increments is necessary to ensure that the guard pages used by the OS
// virtual memory manager are allocated in correct sequence.
- if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) {
+ if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetEnvMacho()) {
const char *StackProbeSymbol;
bool isSPUpdateNeeded = false;
@@ -964,11 +707,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (PushedRegs)
emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
}
-
- // Darwin 10.7 and greater has support for compact unwind encoding.
- if (STI.getTargetTriple().isMacOSX() &&
- !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
- MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
}
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 6e309d8..3d3b011 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -20,32 +20,6 @@
namespace llvm {
-namespace CU {
-
- /// Compact unwind encoding values.
- enum CompactUnwindEncodings {
- /// [RE]BP based frame where [RE]BP is pushed on the stack immediately after
- /// the return address, then [RE]SP is moved to [RE]BP.
- UNWIND_MODE_BP_FRAME = 0x01000000,
-
- /// A frameless function with a small constant stack size.
- UNWIND_MODE_STACK_IMMD = 0x02000000,
-
- /// A frameless function with a large constant stack size.
- UNWIND_MODE_STACK_IND = 0x03000000,
-
- /// No compact unwind encoding is available.
- UNWIND_MODE_DWARF = 0x04000000,
-
- /// Mask for encoding the frame registers.
- UNWIND_BP_FRAME_REGISTERS = 0x00007FFF,
-
- /// Mask for encoding the frameless registers.
- UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
- };
-
-} // end CU namespace
-
class MCSymbol;
class X86TargetMachine;
@@ -91,7 +65,6 @@ public:
int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const;
- uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 9465420..36d1690 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -79,7 +79,8 @@ namespace {
}
bool hasBaseOrIndexReg() const {
- return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+ return BaseType == FrameIndexBase ||
+ IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
}
/// isRIPRelative - Return true if this addressing mode is already RIP
@@ -183,7 +184,7 @@ namespace {
SDNode *Select(SDNode *N);
SDNode *SelectGather(SDNode *N, unsigned Opc);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
- SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
+ SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -491,8 +492,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
continue;
- EVT SrcVT = N->getOperand(0).getValueType();
- EVT DstVT = N->getValueType(0);
+ MVT SrcVT = N->getOperand(0).getSimpleValueType();
+ MVT DstVT = N->getSimpleValueType(0);
// If any of the sources are vectors, no fp stack involved.
if (SrcVT.isVector() || DstVT.isVector())
@@ -519,7 +520,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
- EVT MemVT;
+ MVT MemVT;
if (N->getOpcode() == ISD::FP_ROUND)
MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
else
@@ -783,7 +784,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
Mask != (0xffu << ScaleLog))
return true;
- EVT VT = N.getValueType();
+ MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue Eight = DAG.getConstant(8, MVT::i8);
SDValue NewMask = DAG.getConstant(0xff, VT);
@@ -831,7 +832,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
return true;
- EVT VT = N.getValueType();
+ MVT VT = N.getSimpleValueType();
SDLoc DL(N);
SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
@@ -904,7 +905,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
// Scale the leading zero count down based on the actual size of the value.
// Also scale it down based on the size of the shift.
- MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
+ MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
// The final check is to ensure that any masked out high bits of X are
// already known to be zero. Otherwise, the mask has a semantic impact
@@ -914,23 +915,23 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
// replace them with zero extensions cheaply if necessary.
bool ReplacingAnyExtend = false;
if (X.getOpcode() == ISD::ANY_EXTEND) {
- unsigned ExtendBits =
- X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
+ unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+ X.getOperand(0).getSimpleValueType().getSizeInBits();
// Assume that we'll replace the any-extend with a zero-extend, and
// narrow the search to the extended value.
X = X.getOperand(0);
MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
ReplacingAnyExtend = true;
}
- APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
- MaskLZ);
+ APInt MaskedHighBits =
+ APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
APInt KnownZero, KnownOne;
DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
if (MaskedHighBits != KnownZero) return true;
// We've identified a pattern that can be transformed into a single shift
// and an addressing mode. Make it so.
- EVT VT = N.getValueType();
+ MVT VT = N.getSimpleValueType();
if (ReplacingAnyExtend) {
assert(X.getValueType() != VT);
// We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
@@ -1059,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
- if (X.getValueSizeInBits() > 64) break;
+ if (X.getSimpleValueType().getSizeInBits() > 64) break;
// The mask used for the transform is expected to be post-shift, but we
// found the shift first so just apply the shift to the mask before passing
@@ -1244,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// We only handle up to 64-bit values here as those are what matter for
// addressing mode optimizations.
- if (X.getValueSizeInBits() > 64) break;
+ if (X.getSimpleValueType().getSizeInBits() > 64) break;
if (!isa<ConstantSDNode>(N.getOperand(1)))
break;
@@ -1323,7 +1324,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (MatchAddress(N, AM))
return false;
- EVT VT = N.getValueType();
+ MVT VT = N.getSimpleValueType();
if (AM.BaseType == X86ISelAddressMode::RegBase) {
if (!AM.Base_Reg.getNode())
AM.Base_Reg = CurDAG->getRegister(0, VT);
@@ -1465,7 +1466,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
assert (T == AM.Segment);
AM.Segment = Copy;
- EVT VT = N.getValueType();
+ MVT VT = N.getSimpleValueType();
unsigned Complexity = 0;
if (AM.BaseType == X86ISelAddressMode::RegBase)
if (AM.Base_Reg.getNode())
@@ -1706,7 +1707,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
// + non-empty, otherwise.
static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
SDLoc dl,
- enum AtomicOpc &Op, EVT NVT,
+ enum AtomicOpc &Op, MVT NVT,
SDValue Val) {
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
int64_t CNVal = CN->getSExtValue();
@@ -1753,7 +1754,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
return Val;
}
-SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
if (Node->hasAnyUseOfValue(0))
return 0;
@@ -1793,7 +1794,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
unsigned Opc = 0;
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: return 0;
case MVT::i8:
if (isCN)
@@ -2047,7 +2048,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
}
SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
- EVT NVT = Node->getValueType(0);
+ MVT NVT = Node->getSimpleValueType(0);
unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
SDLoc dl(Node);
@@ -2056,6 +2057,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if (Node->isMachineOpcode()) {
DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ Node->setNodeId(-1);
return NULL; // Already selected.
}
@@ -2187,7 +2189,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
break;
unsigned ShlOp, Op;
- EVT CstVT = NVT;
+ MVT CstVT = NVT;
// Check the minimum bitwidth for the new constant.
// TODO: AND32ri is the same as AND64ri32 with zext imm.
@@ -2202,7 +2204,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
if (NVT == CstVT)
break;
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32:
assert(CstVT == MVT::i8);
@@ -2239,7 +2241,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N1 = Node->getOperand(1);
unsigned LoReg;
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break;
case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
@@ -2268,7 +2270,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
bool isSigned = Opcode == ISD::SMUL_LOHI;
bool hasBMI2 = Subtarget->hasBMI2();
if (!isSigned) {
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
@@ -2278,7 +2280,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
}
} else {
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
@@ -2415,7 +2417,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
bool isSigned = Opcode == ISD::SDIVREM;
if (!isSigned) {
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
@@ -2423,7 +2425,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
} else {
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
@@ -2434,7 +2436,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
unsigned LoReg, HiReg, ClrReg;
unsigned SExtOpcode;
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8:
LoReg = X86::AL; ClrReg = HiReg = X86::AH;
@@ -2489,7 +2491,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
} else {
// Zero out the high part, effectively zero extending the input.
SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
- switch (NVT.getSimpleVT().SimpleTy) {
+ switch (NVT.SimpleTy) {
case MVT::i16:
ClrNode =
SDValue(CurDAG->getMachineNode(
@@ -2609,7 +2611,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// On x86-32, only the ABCD registers have 8-bit subregisters.
if (!Subtarget->is64Bit()) {
const TargetRegisterClass *TRC;
- switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ switch (N0.getSimpleValueType().SimpleTy) {
case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
default: llvm_unreachable("Unsupported TEST operand type!");
@@ -2644,7 +2646,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Put the value in an ABCD register.
const TargetRegisterClass *TRC;
- switch (N0.getValueType().getSimpleVT().SimpleTy) {
+ switch (N0.getSimpleValueType().SimpleTy) {
case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ffe29f..081c558 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,6 +16,7 @@
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86.h"
+#include "X86CallingConv.h"
#include "X86InstrBuilder.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
@@ -91,7 +92,7 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
VecIdx);
return Result;
-
+
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
@@ -631,7 +632,7 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
MVT::i64 : MVT::i32, Custom);
else if (TM.Options.EnableSegmentedStacks)
@@ -1150,9 +1151,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
setOperationAction(ISD::FABS, MVT::v4f64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
-
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
@@ -1160,7 +1158,6 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
@@ -1193,10 +1190,16 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Legal);
@@ -1330,7 +1333,16 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::FMA, MVT::v16f32, Legal);
setOperationAction(ISD::SDIV, MVT::v16i32, Custom);
-
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+ }
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
@@ -1390,6 +1402,9 @@ void X86TargetLowering::resetOperationActions() {
setOperationAction(ISD::AND, MVT::v8i64, Legal);
setOperationAction(ISD::OR, MVT::v8i64, Legal);
setOperationAction(ISD::XOR, MVT::v8i64, Legal);
+ setOperationAction(ISD::AND, MVT::v16i32, Legal);
+ setOperationAction(ISD::OR, MVT::v16i32, Legal);
+ setOperationAction(ISD::XOR, MVT::v16i32, Legal);
// Custom lower several nodes.
for (int i = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1409,14 +1424,6 @@ void X86TargetLowering::resetOperationActions() {
if (!VT.is512BitVector())
continue;
- if (VT != MVT::v8i64) {
- setOperationAction(ISD::XOR, VT, Promote);
- AddPromotedToType (ISD::XOR, VT, MVT::v8i64);
- setOperationAction(ISD::OR, VT, Promote);
- AddPromotedToType (ISD::OR, VT, MVT::v8i64);
- setOperationAction(ISD::AND, VT, Promote);
- AddPromotedToType (ISD::AND, VT, MVT::v8i64);
- }
if ( EltSize >= 32) {
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
@@ -1434,8 +1441,6 @@ void X86TargetLowering::resetOperationActions() {
if (!VT.is512BitVector())
continue;
- setOperationAction(ISD::LOAD, VT, Promote);
- AddPromotedToType (ISD::LOAD, VT, MVT::v8i64);
setOperationAction(ISD::SELECT, VT, Promote);
AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
}
@@ -1452,6 +1457,7 @@ void X86TargetLowering::resetOperationActions() {
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
@@ -1541,7 +1547,16 @@ void X86TargetLowering::resetOperationActions() {
}
EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
- if (!VT.isVector()) return MVT::i8;
+ if (!VT.isVector())
+ return MVT::i8;
+
+ const TargetMachine &TM = getTargetMachine();
+ if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
+ switch(VT.getVectorNumElements()) {
+ case 8: return MVT::v8i1;
+ case 16: return MVT::v16i1;
+ }
+
return VT.changeVectorElementTypeToInteger();
}
@@ -1750,6 +1765,13 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
return true;
}
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+ unsigned DestAS) const {
+ assert(SrcAS != DestAS && "Expected different address spaces!");
+
+ return SrcAS < 256 && DestAS < 256;
+}
+
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -1767,6 +1789,11 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
return CCInfo.CheckReturn(Outs, RetCC_X86);
}
+const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+ static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+ return ScratchRegs;
+}
+
SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -3532,7 +3559,7 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
if (VT == MVT::v4f32 || VT == MVT::v4i32 )
return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3542,7 +3569,7 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
return false;
@@ -3571,7 +3598,7 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
return false;
@@ -3600,14 +3627,14 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
const X86Subtarget *Subtarget) {
if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
(VT.is256BitVector() && !Subtarget->hasInt256()))
return false;
unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
// Do not handle 64-bit element shuffles with palignr.
@@ -3690,10 +3717,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
- bool Commuted = false) {
- if (!HasFp256 && VT.is256BitVector())
- return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3702,6 +3726,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
if (NumLaneElems != 2 && NumLaneElems != 4)
return false;
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ bool symetricMaskRequired =
+ (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
// VSHUFPSY divides the resulting vector into 4 chunks.
// The sources are also split into 4 chunks, and each destination
// chunk must come from a different source chunk.
@@ -3721,6 +3749,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
//
// DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
//
+ SmallVector<int, 4> MaskVal(NumLaneElems, -1);
unsigned HalfLaneElems = NumLaneElems/2;
for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3731,9 +3760,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
// For VSHUFPSY, the mask of the second half must be the same as the
// first but with the appropriate offsets. This works in the same way as
// VPERMILPS works with masks.
- if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ if (!symetricMaskRequired || Idx < 0)
continue;
- if (!isUndefOrEqual(Idx, Mask[i]+l))
+ if (MaskVal[i] < 0) {
+ MaskVal[i] = Idx - l;
+ continue;
+ }
+ if ((signed)(Idx - l) != MaskVal[i])
return false;
}
}
@@ -3743,7 +3776,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
@@ -3762,7 +3795,7 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
@@ -3779,7 +3812,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
@@ -3801,7 +3834,7 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
@@ -3827,7 +3860,7 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
static
SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
SDLoc dl(SVOp);
if (VT != MVT::v8i32 && VT != MVT::v8f32)
@@ -3870,70 +3903,92 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
bool HasInt256, bool V2IsSplat = false) {
- unsigned NumElts = VT.getVectorNumElements();
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
- "Unsupported vector type for unpckh");
+ assert(VT.getSizeInBits() >= 128 &&
+ "Unsupported vector type for unpckl");
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ // AVX defines UNPCK* to operate independently on 128-bit lanes.
+ unsigned NumLanes;
+ unsigned NumOf256BitLanes;
+ unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is256BitVector()) {
+ if (NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
+ NumLanes = 2;
+ NumOf256BitLanes = 1;
+ } else if (VT.is512BitVector()) {
+ assert(VT.getScalarType().getSizeInBits() >= 32 &&
+ "Unsupported vector type for unpckl");
+ NumLanes = 2;
+ NumOf256BitLanes = 2;
+ } else {
+ NumLanes = 1;
+ NumOf256BitLanes = 1;
+ }
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
+ unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+ unsigned NumLaneElts = NumEltsInStride/NumLanes;
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (!isUndefOrEqual(BitI1, NumElts))
+ for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+ for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+ for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l256*NumEltsInStride+l+i];
+ int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+ if (!isUndefOrEqual(BitI, j+l256*NumElts))
return false;
- } else {
- if (!isUndefOrEqual(BitI1, j + NumElts))
+ if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+ return false;
+ if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
return false;
}
}
}
-
return true;
}
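// For reference, a self-contained sketch (not the LLVM helper) of the pattern
// the 128/256-bit path of isUNPCKLMask accepts: within every 128-bit lane the
// mask interleaves the lane's low-half elements of the first source with the
// matching elements of the second source, which sit NumElts further along in
// shuffle-mask numbering. Printing the v8i32 case gives 0 8 1 9 4 12 5 13.
#include <cstdio>
#include <vector>

static std::vector<int> unpcklReferenceMask(unsigned NumElts, unsigned EltBits) {
  unsigned NumLanes = (NumElts * EltBits) / 128;  // independent 128-bit lanes
  unsigned NumLaneElts = NumElts / NumLanes;
  std::vector<int> Mask;
  for (unsigned l = 0; l != NumElts; l += NumLaneElts)
    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
      Mask.push_back(int(j));            // element j of the first source
      Mask.push_back(int(j + NumElts));  // element j of the second source
    }
  return Mask;
}

int main() {
  for (int M : unpcklReferenceMask(8, 32))  // v8i32 under AVX
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}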
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
bool HasInt256, bool V2IsSplat = false) {
- unsigned NumElts = VT.getVectorNumElements();
-
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert(VT.getSizeInBits() >= 128 &&
"Unsupported vector type for unpckh");
- if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
- (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+ // AVX defines UNPCK* to operate independently on 128-bit lanes.
+ unsigned NumLanes;
+ unsigned NumOf256BitLanes;
+ unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is256BitVector()) {
+ if (NumElts != 4 && NumElts != 8 &&
+ (!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
+ NumLanes = 2;
+ NumOf256BitLanes = 1;
+ } else if (VT.is512BitVector()) {
+ assert(VT.getScalarType().getSizeInBits() >= 32 &&
+ "Unsupported vector type for unpckh");
+ NumLanes = 2;
+ NumOf256BitLanes = 2;
+ } else {
+ NumLanes = 1;
+ NumOf256BitLanes = 1;
+ }
- // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
- // independently on 128-bit lanes.
- unsigned NumLanes = VT.getSizeInBits()/128;
- unsigned NumLaneElts = NumElts/NumLanes;
+ unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+ unsigned NumLaneElts = NumEltsInStride/NumLanes;
- for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
- for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
- int BitI = Mask[l+i];
- int BitI1 = Mask[l+i+1];
- if (!isUndefOrEqual(BitI, j))
- return false;
- if (V2IsSplat) {
- if (isUndefOrEqual(BitI1, NumElts))
+ for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+ for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+ for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+ int BitI = Mask[l256*NumEltsInStride+l+i];
+ int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+ if (!isUndefOrEqual(BitI, j+l256*NumElts))
return false;
- } else {
- if (!isUndefOrEqual(BitI1, j+NumElts))
+ if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+ return false;
+ if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
return false;
}
}
@@ -3944,10 +3999,12 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
unsigned NumElts = VT.getVectorNumElements();
bool Is256BitVec = VT.is256BitVector();
+ if (VT.is512BitVector())
+ return false;
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -3985,9 +4042,12 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
unsigned NumElts = VT.getVectorNumElements();
+ if (VT.is512BitVector())
+ return false;
+
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
@@ -4040,7 +4100,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
/// The first half comes from the second half of V1 and the second half from the
/// second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
if (!HasFp256 || !VT.is256BitVector())
return false;
@@ -4072,7 +4132,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned HalfSize = VT.getVectorNumElements()/2;
@@ -4093,6 +4153,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
return (FstHalf | (SndHalf << 4));
}
+// Symmetric in-lane mask. Each lane has 4 elements (for imm8).
+static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (EltSize < 32)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+ Imm8 = 0;
+ if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ Imm8 |= Mask[i] << (i*2);
+ }
+ return true;
+ }
+
+ unsigned LaneSize = 4;
+ SmallVector<int, 4> MaskVal(LaneSize, -1);
+
+ for (unsigned l = 0; l != NumElts; l += LaneSize) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+ return false;
+ if (Mask[i+l] < 0)
+ continue;
+ if (MaskVal[i] < 0) {
+ MaskVal[i] = Mask[i+l] - l;
+ Imm8 |= MaskVal[i] << (i*2);
+ continue;
+ }
+ if (Mask[i+l] != (signed)(MaskVal[i]+l))
+ return false;
+ }
+ }
+ return true;
+}
+
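// A worked example, as a standalone sketch, of the immediate isPermImmMask
// builds for the symmetric 256-bit case: the v8f32 mask <1,0,3,2, 5,4,7,6>
// repeats the in-lane pattern <1,0,3,2> in both 128-bit lanes, so packing two
// bits per element gives 1<<0 | 0<<2 | 3<<4 | 2<<6 = 0xB1, the familiar
// "swap adjacent pairs" VPERMILPS immediate. The helper name is illustrative.
#include <cassert>

static unsigned packLanePattern(const int Pattern[4]) {
  unsigned Imm8 = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm8 |= unsigned(Pattern[i]) << (i * 2);  // two bits per lane element
  return Imm8;
}

int main() {
  const int SwapPairs[4] = {1, 0, 3, 2};
  assert(packLanePattern(SwapPairs) == 0xB1);
  return 0;
}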
/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
/// Note that VPERMIL mask matching is different depending on whether the underlying
@@ -4100,38 +4198,39 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
- if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (VT.getSizeInBits() < 256 || EltSize < 32)
return false;
-
+ bool symetricMaskRequired = (EltSize == 32);
unsigned NumElts = VT.getVectorNumElements();
- // Only match 256-bit with 32/64-bit types
- if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
- return false;
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned LaneSize = NumElts/NumLanes;
+ // 2 or 4 elements in one lane
+
+ SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
for (unsigned l = 0; l != NumElts; l += LaneSize) {
for (unsigned i = 0; i != LaneSize; ++i) {
if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
- if (NumElts != 8 || l == 0)
- continue;
- // VPERMILPS handling
- if (Mask[i] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
- return false;
+ if (symetricMaskRequired) {
+ if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+ ExpectedMaskVal[i] = Mask[i+l] - l;
+ continue;
+ }
+ if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+ return false;
+ }
}
}
-
return true;
}
/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
/// x86 movss wants: the lowest element must be the lowest element of vector 2,
/// and the other elements must come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
bool V2IsSplat = false, bool V2IsUndef = false) {
if (!VT.is128BitVector())
return false;
@@ -4155,7 +4254,7 @@ static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
const X86Subtarget *Subtarget) {
if (!Subtarget->hasSSE3())
return false;
@@ -4163,7 +4262,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
unsigned NumElems = VT.getVectorNumElements();
if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8))
+ (VT.is256BitVector() && NumElems != 8) ||
+ (VT.is512BitVector() && NumElems != 16))
return false;
// "i+1" is the value the indexed mask element must have
@@ -4178,7 +4278,7 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
const X86Subtarget *Subtarget) {
if (!Subtarget->hasSSE3())
return false;
@@ -4186,7 +4286,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
unsigned NumElems = VT.getVectorNumElements();
if ((VT.is128BitVector() && NumElems != 4) ||
- (VT.is256BitVector() && NumElems != 8))
+ (VT.is256BitVector() && NumElems != 8) ||
+ (VT.is512BitVector() && NumElems != 16))
return false;
// "i" is the value the indexed mask element must have
@@ -4201,7 +4302,7 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 256-bit
/// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
if (!HasFp256 || !VT.is256BitVector())
return false;
@@ -4221,7 +4322,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to 128-bit
/// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
@@ -4247,7 +4348,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
bool Result = (Index * ElSize) % vecWidth == 0;
@@ -4265,7 +4366,7 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getVectorElementType().getSizeInBits();
bool Result = (Index * ElSize) % vecWidth == 0;
@@ -4292,9 +4393,9 @@ bool X86::isVEXTRACT256Index(SDNode *N) {
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert((VT.getSizeInBits() >= 128) &&
"Unsupported vector type for PSHUF/SHUFP");
// Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
@@ -4303,10 +4404,10 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
- assert((NumLaneElts == 2 || NumLaneElts == 4) &&
- "Only supports 2 or 4 elements per lane");
+ assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+ "Only supports 2, 4 or 8 elements per lane");
- unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+ unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
unsigned Mask = 0;
for (unsigned i = 0; i != NumElts; ++i) {
int Elt = N->getMaskElt(i);
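// The loop above packs one field per destination element; for the common
// v4i32 case (four elements per lane, Shift == 1, so two bits per field) the
// arithmetic reduces to the sketch below. It assumes in-range mask values, and
// the helper name is illustrative rather than LLVM's.
#include <cassert>

static unsigned pshufdImmediate(const int Mask[4]) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Mask[i] < 0)
      continue;                               // undef elements contribute 0
    Imm |= unsigned(Mask[i] & 0x3) << (i * 2);
  }
  return Imm;
}

int main() {
  const int SwapHalves[4] = {2, 3, 0, 1};     // swap the two 64-bit halves
  assert(pshufdImmediate(SwapHalves) == 0x4E);
  return 0;
}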
@@ -4322,7 +4423,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
"Unsupported vector type for PSHUFHW");
@@ -4346,7 +4447,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
+ MVT VT = N->getSimpleValueType(0);
assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
"Unsupported vector type for PSHUFHW");
@@ -4370,11 +4471,12 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
- unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+ MVT VT = SVOp->getSimpleValueType(0);
+ unsigned EltSize = VT.is512BitVector() ? 1 :
+ VT.getVectorElementType().getSizeInBits() >> 3;
unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
int Val = 0;
@@ -4399,7 +4501,7 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+ MVT VecVT = N->getOperand(0).getSimpleValueType();
MVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
@@ -4414,7 +4516,7 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
- MVT VecVT = N->getValueType(0).getSimpleVT();
+ MVT VecVT = N->getSimpleValueType(0);
MVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
@@ -4449,27 +4551,6 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
return getInsertVINSERTImmediate(N, 256);
}
-/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
-/// Handles 256-bit.
-static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
- MVT VT = N->getValueType(0).getSimpleVT();
-
- unsigned NumElts = VT.getVectorNumElements();
-
- assert((VT.is256BitVector() && NumElts == 4) &&
- "Unsupported vector type for VPERMQ/VPERMPD");
-
- unsigned Mask = 0;
- for (unsigned i = 0; i != NumElts; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt < 0)
- continue;
- Mask |= Elt << (i*2);
- }
-
- return Mask;
-}
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
@@ -4484,7 +4565,7 @@ bool X86::isZeroNode(SDValue Elt) {
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> MaskVec;
@@ -4506,7 +4587,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
if (VT.getVectorNumElements() != 4)
@@ -4563,7 +4644,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
- ArrayRef<int> Mask, EVT VT) {
+ ArrayRef<int> Mask, MVT VT) {
if (!VT.is128BitVector())
return false;
@@ -4659,6 +4740,11 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
array_lengthof(Ops));
}
+ } else if (VT.is512BitVector()) { // AVX-512
+ SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+ Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
} else
llvm_unreachable("Unexpected vector type");
@@ -4715,7 +4801,7 @@ static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
}
/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4727,7 +4813,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
}
/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> Mask;
@@ -4743,7 +4829,7 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target supported shuffles.
static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
- EVT VT = V.getValueType();
+ MVT VT = V.getSimpleValueType();
int NumElems = VT.getVectorNumElements();
SDLoc dl(V);
@@ -4761,7 +4847,7 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
- EVT VT = V.getValueType();
+ MVT VT = V.getSimpleValueType();
SDLoc dl(V);
if (VT.is128BitVector()) {
@@ -4787,7 +4873,7 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
/// PromoteSplat - Splat is promoted to target supported vector shuffles.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- EVT SrcVT = SV->getValueType(0);
+ MVT SrcVT = SV->getSimpleValueType(0);
SDValue V1 = SV->getOperand(0);
SDLoc dl(SV);
@@ -4810,7 +4896,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// instruction because the target has no such instruction. Generate shuffles
// which repeat i16 and i8 several times until they fit in i32, and then can
// be manipulated by target supported shuffles.
- EVT EltVT = SrcVT.getVectorElementType();
+ MVT EltVT = SrcVT.getVectorElementType();
if (EltVT == MVT::i8 || EltVT == MVT::i16)
V1 = PromoteSplati8i16(V1, DAG, EltNo);
@@ -4832,7 +4918,7 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
bool IsZero,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- EVT VT = V2.getValueType();
+ MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
@@ -4950,7 +5036,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- MVT ShufVT = V.getValueType().getSimpleVT();
+ MVT ShufVT = V.getSimpleValueType();
unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
bool IsUnary;
@@ -5048,7 +5134,8 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
/// logical left shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+ unsigned NumElems =
+ SVOp->getSimpleValueType(0).getVectorNumElements();
unsigned NumZeros = getNumOfConsecutiveZeros(
SVOp, NumElems, false /* check zeros from right */, DAG,
SVOp->getMaskElt(0));
@@ -5082,7 +5169,8 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
- unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+ unsigned NumElems =
+ SVOp->getSimpleValueType(0).getVectorNumElements();
unsigned NumZeros = getNumOfConsecutiveZeros(
SVOp, NumElems, true /* check zeros from left */, DAG,
NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
@@ -5118,7 +5206,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
// Although the logic below supports any bitwidth, there are no
// shift instructions which handle more than 128-bit vectors.
- if (!SVOp->getValueType(0).is128BitVector())
+ if (!SVOp->getSimpleValueType(0).is128BitVector())
return false;
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -5223,9 +5311,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
}
-SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
- SelectionDAG &DAG) const {
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
// Check if the scalar load can be widened into a vector load. And if
// the address is "base + cst" see if the cst can be "absorbed" into
@@ -5288,7 +5375,10 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, false, 0);
- SmallVector<int, 8> Mask(NumElems, EltNo);
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != NumElems; ++i)
+ Mask.push_back(EltNo);
+
return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
}
@@ -5305,7 +5395,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
- SDLoc &DL, SelectionDAG &DAG) {
+ SDLoc &DL, SelectionDAG &DAG,
+ bool isAfterLegalize) {
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
@@ -5341,7 +5432,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
if (LastLoadedElt == NumElems - 1) {
+
+ if (isAfterLegalize &&
+ !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
SDValue NewLd = SDValue();
+
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
@@ -5398,12 +5495,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
/// a scalar load, or a constant.
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
-SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+ SelectionDAG &DAG) {
if (!Subtarget->hasFp256())
return SDValue();
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
@@ -5450,7 +5547,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
// Use the register form of the broadcast instruction available on AVX2.
- if (VT.is256BitVector())
+ if (VT.getSizeInBits() >= 256)
Sc = Extract128BitVector(Sc, 0, DAG, dl);
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
}
@@ -5492,7 +5589,8 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
assert(C && "Invalid constant type");
- SDValue CP = DAG.getConstantPool(C, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
MachinePointerInfo::getConstantPool(),
@@ -5528,12 +5626,12 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
// Skip if insert_vec_elt is not supported.
- if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
SDLoc DL(Op);
@@ -5606,7 +5704,7 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
"Unexpected type in LowerBUILD_VECTORvXi1!");
@@ -5645,13 +5743,16 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
DAG.getConstant(Immediate, MVT::i16));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
- DAG.getIntPtrConstant(0));
+ DAG.getIntPtrConstant(0));
}
- if (!isSplatVector(Op.getNode()))
- llvm_unreachable("Unsupported predicate operation");
-
+ // Splat vector (with undefs)
SDValue In = Op.getOperand(0);
+ for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
+ if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
+ llvm_unreachable("Unsupported predicate operation");
+ }
+
SDValue EFLAGS, X86CC;
if (In.getOpcode() == ISD::SETCC) {
SDValue Op0 = In.getOperand(0);
@@ -5679,7 +5780,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
// res = allOnes ### CMOVNE -1, %res
// else
// res = allZero
- MVT InVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
@@ -5708,7 +5809,7 @@ SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
@@ -5720,7 +5821,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ISD::isBuildVectorAllZeros(Op.getNode())) {
// Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
// and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
- if (VT == MVT::v4i32 || VT == MVT::v8i32)
+ if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
return Op;
return getZeroVector(VT, Subtarget, DAG, dl);
@@ -5733,10 +5834,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
return Op;
- return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+ if (!VT.is512BitVector())
+ return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
}
- SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+ SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
if (Broadcast.getNode())
return Broadcast;
@@ -5815,7 +5917,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
- if (VT.is256BitVector()) {
+ if (VT.is256BitVector() || VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));
@@ -5980,7 +6082,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
V[i] = Op.getOperand(i);
// Check for elements which are consecutive loads.
- SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+ SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
if (LD.getNode())
return LD;
@@ -6044,7 +6146,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
SDLoc dl(Op);
- MVT ResVT = Op.getValueType().getSimpleVT();
+ MVT ResVT = Op.getSimpleValueType();
assert((ResVT.is256BitVector() ||
ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
@@ -6073,10 +6175,14 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
+ // There is no blend with immediate in AVX-512.
+ if (VT.is512BitVector())
+ return SDValue();
+
if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
return SDValue();
if (!Subtarget->hasInt256() && VT == MVT::v16i16)
@@ -6382,10 +6488,10 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
-static
-SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
- SelectionDAG &DAG,
- const X86TargetLowering &TLI) {
+static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget* Subtarget,
+ SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
@@ -6401,7 +6507,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
// present, fall back to case 3.
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (TLI.getSubtarget()->hasSSSE3()) {
+ if (Subtarget->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
@@ -6514,7 +6620,7 @@ static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
@@ -6562,7 +6668,7 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
SDLoc dl(SVOp);
unsigned NumElems = VT.getVectorNumElements();
MVT NewVT;
@@ -6599,7 +6705,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
/// getVZextMovL - Return a zero-extending vector move low node.
///
-static SDValue getVZextMovL(MVT VT, EVT OpVT,
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, SDLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
@@ -6641,7 +6747,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
if (NewOp.getNode())
return NewOp;
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLaneElems = NumElems / 2;
@@ -6753,7 +6859,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
SDLoc dl(SVOp);
- MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT VT = SVOp->getSimpleValueType(0);
assert(VT.is128BitVector() && "Unsupported vector size");
@@ -6904,7 +7010,7 @@ static bool MayFoldVectorLoad(SDValue V) {
static
SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Canonicalize to v2f64.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
@@ -6918,7 +7024,7 @@ SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
@@ -6936,7 +7042,7 @@ static
SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
"unsupported shuffle type");
@@ -6952,7 +7058,7 @@ static
SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
unsigned NumElems = VT.getVectorNumElements();
// Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
@@ -7006,13 +7112,13 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
}
// Reduce a vector shuffle to zext.
-SDValue
-X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
// PMOVZX is only available from SSE41.
if (!Subtarget->hasSSE41())
return SDValue();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getSimpleValueType();
// Only AVX2 supports 256-bit vector integer extending.
if (!Subtarget->hasInt256() && VT.is256BitVector())
@@ -7051,12 +7157,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- LLVMContext *Context = DAG.getContext();
unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
- EVT NeVT = EVT::getIntegerVT(*Context, NBits);
- EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+ MVT NeVT = MVT::getIntegerVT(NBits);
+ MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
- if (!isTypeLegal(NVT))
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
return SDValue();
// Simplify the operand as it's prepared to be fed into shuffle.
@@ -7064,8 +7169,8 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
if (V1.getOpcode() == ISD::BITCAST &&
V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- V1.getOperand(0)
- .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+ V1.getOperand(0).getOperand(0)
+ .getSimpleValueType().getSizeInBits() == SignificantBits) {
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
ConstantSDNode *CIdx =
@@ -7074,19 +7179,19 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
// selection to fold it. Otherwise, we will short the conversion sequence.
if (CIdx && CIdx->getZExtValue() == 0 &&
(!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
- if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+ MVT FullVT = V.getSimpleValueType();
+ MVT V1VT = V1.getSimpleValueType();
+ if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
// The "ext_vec_elt" node is wider than the result node.
// In this case we should extract subvector from V.
// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
- unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
- EVT FullVT = V.getValueType();
- EVT SubVecVT = EVT::getVectorVT(*Context,
- FullVT.getVectorElementType(),
+ unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+ MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
FullVT.getVectorNumElements()/Ratio);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
DAG.getIntPtrConstant(0));
}
- V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+ V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
}
}
@@ -7094,10 +7199,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
}
-SDValue
-X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
+static SDValue
+NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
@@ -7108,13 +7214,13 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// Handle splat operations
if (SVOp->isSplat()) {
// Use vbroadcast whenever the splat comes from a foldable load
- SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+ SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
if (Broadcast.getNode())
return Broadcast;
}
// Check integer expanding shuffles.
- SDValue NewOp = LowerVectorIntExtend(Op, DAG);
+ SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
if (NewOp.getNode())
return NewOp;
@@ -7132,7 +7238,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
- MVT NewVT = NewOp.getValueType().getSimpleVT();
+ MVT NewVT = NewOp.getSimpleValueType();
if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
NewVT, true, false))
return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
@@ -7141,7 +7247,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
- MVT NewVT = NewOp.getValueType().getSimpleVT();
+ MVT NewVT = NewOp.getSimpleValueType();
if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
DAG, Subtarget, dl);
@@ -7156,7 +7262,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
unsigned NumElems = VT.getVectorNumElements();
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
@@ -7194,7 +7300,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Normalize the input vectors. Here splats, zeroed vectors, profitable
// narrowing and commutation of operands should be handled. The actual code
// doesn't include all of those, work in progress...
- SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
+ SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
if (NewOp.getNode())
return NewOp;
@@ -7354,7 +7460,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
// Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+ if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
return CommuteVectorShuffle(SVOp, DAG);
// The checks below are all present in isShuffleMaskLegal, but they are
@@ -7377,7 +7483,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT, HasFp256))
+ if (isSHUFPMask(M, VT))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
getShuffleSHUFImmediate(SVOp), DAG);
@@ -7396,8 +7502,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
// Handle VPERMILPS/D* permutations
- if (isVPERMILPMask(M, VT, HasFp256)) {
- if (HasInt256 && VT == MVT::v8i32)
+ if (isVPERMILPMask(M, VT)) {
+ if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -7413,21 +7519,28 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (BlendOp.getNode())
return BlendOp;
- if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
- SmallVector<SDValue, 8> permclMask;
- for (unsigned i = 0; i != 8; ++i) {
- permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+ unsigned Imm8;
+ if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+ return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
+
+ if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
+ VT.is512BitVector()) {
+ MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+ MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
+ SmallVector<SDValue, 16> permclMask;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
}
- SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
- &permclMask[0], 8);
- // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
- return DAG.getNode(X86ISD::VPERMV, dl, VT,
- DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
- }
- if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
- return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
- getShuffleCLImmediate(SVOp), DAG);
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
+ &permclMask[0], NumElems);
+ if (V2IsUndef)
+ // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+ return DAG.getNode(X86ISD::VPERMV, dl, VT,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+ return DAG.getNode(X86ISD::VPERMV3, dl, VT,
+ DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+ }
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
@@ -7443,7 +7556,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
if (VT == MVT::v16i8) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+ SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
if (NewOp.getNode())
return NewOp;
}
@@ -7467,10 +7580,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+ if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
@@ -7532,21 +7645,38 @@ SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
- if (!isa<ConstantSDNode>(Op.getOperand(1)))
- return SDValue();
-
SDValue Vec = Op.getOperand(0);
- MVT VecVT = Vec.getValueType().getSimpleVT();
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = Op.getOperand(1);
+ if (!isa<ConstantSDNode>(Idx)) {
+ if (VecVT.is512BitVector() ||
+ (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+ VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+ MVT MaskEltVT =
+ MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+ MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+ MaskEltVT.getSizeInBits());
+
+ Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+ SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+ getZeroVector(MaskVT, Subtarget, DAG, dl),
+ Idx, DAG.getConstant(0, getPointerTy()));
+ SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+ Perm, DAG.getConstant(0, getPointerTy()));
+ }
+ return SDValue();
+ }
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
- SDValue Idx = Op.getOperand(1);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
- EVT EltVT = VecVT.getVectorElementType();
+ MVT EltVT = VecVT.getVectorElementType();
unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
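// A minimal standalone sketch (not LLVM code) of what the new variable-index
// EXTRACT_VECTOR_ELT path above computes: the index is inserted into lane 0 of
// an otherwise-zero mask vector, VPERMV pulls the selected element into lane 0,
// and a constant lane-0 extract finishes the job.  The scalar model and names
// below are illustrative only.
#include <array>
#include <cstddef>
template <std::size_t N>
int extractVariableIndex(const std::array<int, N> &Vec, unsigned Idx) {
  std::array<unsigned, N> Mask{};          // zero vector ...
  Mask[0] = Idx;                           // ... with Idx placed in lane 0 (VINSERT)
  std::array<int, N> Perm{};
  for (std::size_t i = 0; i != N; ++i)     // VPERMV: Perm[i] = Vec[Mask[i]]
    Perm[i] = Vec[Mask[i] % N];
  return Perm[0];                          // EXTRACT_VECTOR_ELT of lane 0
}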
@@ -7565,7 +7695,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return Res;
}
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
SDValue Vec = Op.getOperand(0);
@@ -7592,7 +7722,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
- MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+ MVT VVT = Op.getOperand(0).getSimpleValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7611,7 +7741,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
- MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+ MVT VVT = Op.getOperand(0).getSimpleValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7622,7 +7752,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
}
static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
SDLoc dl(Op);
@@ -7676,7 +7806,7 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
SDLoc dl(Op);
@@ -7724,17 +7854,15 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
}
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
- LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
- MVT OpVT = Op.getValueType().getSimpleVT();
+ MVT OpVT = Op.getSimpleValueType();
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
unsigned SizeFactor = OpVT.getSizeInBits()/128;
- EVT VT128 = EVT::getVectorVT(*Context,
- OpVT.getVectorElementType(),
+ MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
@@ -7762,8 +7890,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SDValue In = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- EVT ResVT = Op.getValueType();
- EVT InVT = In.getValueType();
+ MVT ResVT = Op.getSimpleValueType();
+ MVT InVT = In.getSimpleValueType();
if (Subtarget->hasFp256()) {
if (ResVT.is128BitVector() &&
@@ -7790,16 +7918,16 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
SDValue SubVec = Op.getNode()->getOperand(1);
SDValue Idx = Op.getNode()->getOperand(2);
- if ((Op.getNode()->getValueType(0).is256BitVector() ||
- Op.getNode()->getValueType(0).is512BitVector()) &&
- SubVec.getNode()->getValueType(0).is128BitVector() &&
+ if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
+ Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
+ SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
}
- if (Op.getNode()->getValueType(0).is512BitVector() &&
- SubVec.getNode()->getValueType(0).is256BitVector() &&
+ if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
+ SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
isa<ConstantSDNode>(Idx)) {
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
@@ -8108,10 +8236,9 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
is64Bit ? 257 : 256));
- SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
- DAG.getIntPtrConstant(0),
- MachinePointerInfo(Ptr),
- false, false, false, 0);
+ SDValue ThreadPointer =
+ DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+ MachinePointerInfo(Ptr), false, false, false, 0);
unsigned char OperandFlags = 0;
// Most TLS accesses are not RIP relative, even on x86-64. One exception is
@@ -8133,21 +8260,20 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
// emit "addl x@ntpoff,%eax" (local exec)
// or "addl x@indntpoff,%eax" (initial exec)
// or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
- SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
- GA->getValueType(0),
- GA->getOffset(), OperandFlags);
+ SDValue TGA =
+ DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+ GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
if (isPIC && !is64Bit) {
Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
- DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
+ DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
Offset);
}
Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
- MachinePointerInfo::getGOT(), false, false, false,
- 0);
+ MachinePointerInfo::getGOT(), false, false, false, 0);
}
// The address of the thread local variable is the add of the thread
@@ -8305,6 +8431,11 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
+ // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+ // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+ // during isel.
+ SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+ DAG.getConstant(VTBits - 1, MVT::i8));
SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, MVT::i8))
: DAG.getConstant(0, VT);
@@ -8312,12 +8443,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
SDValue Tmp2, Tmp3;
if (Op.getOpcode() == ISD::SHL_PARTS) {
Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
- Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
} else {
Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
- Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
}
+ // If the shift amount is larger or equal than the width of a part we can't
+ // rely on the results of shld/shrd. Insert a test and select the appropriate
+ // values for large shift amounts.
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
DAG.getConstant(VTBits, MVT::i8));
SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
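// A minimal standalone sketch (not LLVM code) of the double-shift lowering this
// hunk hardens.  SHLD/SHRD only look at the shift amount modulo the part width,
// so the plain SHL of the low part uses the same masked amount (SafeShAmt), and
// a final select on the (Amt & VTBits) bit handles amounts that span a whole
// part.  Shown for 32-bit parts; names are illustrative.
#include <cstdint>
#include <utility>
std::pair<uint32_t, uint32_t> shl64Parts(uint32_t Lo, uint32_t Hi, unsigned Amt) {
  unsigned Safe = Amt & 31;                                        // SafeShAmt
  uint32_t Tmp2 = Safe ? (Hi << Safe) | (Lo >> (32 - Safe)) : Hi;  // SHLD(Hi, Lo, Amt)
  uint32_t Tmp3 = Lo << Safe;                                      // SHL(Lo, SafeShAmt)
  if (Amt & 32)              // amount >= part width: high part takes the shifted low part
    return {0u, Tmp3};       // {lo, hi}
  return {Tmp3, Tmp2};       // {lo, hi}
}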
@@ -8750,9 +8884,9 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
- MVT VT = Op->getValueType(0).getSimpleVT();
+ MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
- MVT InVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
// Optimize vectors in AVX mode:
@@ -8768,7 +8902,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
// Concat upper and lower parts.
//
- if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+ if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+ ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
return SDValue();
@@ -8790,8 +8925,39 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
-SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getValueType(0).getSimpleVT();
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getValueType().getSimpleVT();
+ SDLoc DL(Op);
+ unsigned int NumElts = VT.getVectorNumElements();
+ if (NumElts != 8 && NumElts != 16)
+ return SDValue();
+
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+ EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Now we have only mask extension
+ assert(InVT.getVectorElementType() == MVT::i1);
+ SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
+ const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+ SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+
+ SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+ if (VT.is512BitVector())
+ return Brcst;
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
if (Subtarget->hasFp256()) {
SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
if (Res.getNode())
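// A minimal standalone sketch (not LLVM code) of the per-lane effect of the
// VBROADCASTM-based LowerZERO_EXTEND_AVX512 path added above: the constant 1
// loaded from the constant pool is broadcast under the k-mask, so each wide
// lane becomes 1 where the mask bit is set and 0 elsewhere, with a VTRUNC back
// down when the result type is narrower than 512 bits.
#include <cstdint>
uint64_t zextMaskLane(bool MaskBit) {
  const uint64_t BroadcastVal = 1;       // scalar loaded from the constant pool
  return MaskBit ? BroadcastVal : 0;     // VBROADCASTM lane result
}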
@@ -8800,12 +8966,16 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
return SDValue();
}
-SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc DL(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
- MVT SVT = In.getValueType().getSimpleVT();
+ MVT SVT = In.getSimpleValueType();
+
+ if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+ return LowerZERO_EXTEND_AVX512(Op, DAG);
if (Subtarget->hasFp256()) {
SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
@@ -8813,33 +8983,44 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
return Res;
}
- if (!VT.is256BitVector() || !SVT.is128BitVector() ||
- VT.getVectorNumElements() != SVT.getVectorNumElements())
- return SDValue();
-
- assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
-
- // AVX2 has better support of integer extending.
- if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
-
- SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
- static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
- SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
- DAG.getVectorShuffle(MVT::v8i16, DL, In,
- DAG.getUNDEF(MVT::v8i16),
- &Mask[0]));
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+ assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+ VT.getVectorNumElements() != SVT.getVectorNumElements());
+ return SDValue();
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
- MVT SVT = In.getValueType().getSimpleVT();
-
- if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+ MVT InVT = In.getSimpleValueType();
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+ "Invalid TRUNCATE operation");
+
+ if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
+ if (VT.getVectorElementType().getSizeInBits() >=8)
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+
+ assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+ unsigned NumElts = InVT.getVectorNumElements();
+ assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
+ if (InVT.getSizeInBits() < 512) {
+ MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
+ In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+ InVT = ExtVT;
+ }
+ SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
+ const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+ SDValue CP = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+ SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+ SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
+ return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
+ }
+
+ if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget->hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
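// A minimal standalone sketch (not LLVM code) of the per-element semantics of
// the new truncate-to-i1 path in LowerTRUNCATE above: only bit 0 of each
// element survives, which the lowering realizes as AND with a broadcast 1
// followed by TESTM (the mask bit is set when the AND result is nonzero).
#include <cstdint>
bool truncElementToI1(uint64_t Elt) {
  uint64_t And = Elt & 1;     // AND with the broadcast constant 1
  return And != 0;            // TESTM: mask bit = (And != 0)
}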
@@ -8870,7 +9051,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
}
- if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+ if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
// On AVX2, v8i32 -> v8i16 becomed PSHUFB.
if (Subtarget->hasInt256()) {
In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
@@ -8928,11 +9109,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
// Handle truncation of V256 to V128 using shuffles.
- if (!VT.is128BitVector() || !SVT.is256BitVector())
+ if (!VT.is128BitVector() || !InVT.is256BitVector())
return SDValue();
- assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
- "Invalid op");
assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
@@ -8952,7 +9131,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
if (VT == MVT::v8i16)
return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
@@ -8996,9 +9175,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
- MVT SVT = In.getValueType().getSimpleVT();
+ MVT SVT = In.getSimpleValueType();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -9010,7 +9189,7 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
if (VT.isVector()) {
@@ -9044,7 +9223,7 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
if (VT.isVector()) {
@@ -9065,7 +9244,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo::getConstantPool(),
false, false, false, Alignment);
if (VT.isVector()) {
- MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::XOR, dl, XORVT,
DAG.getNode(ISD::BITCAST, dl, XORVT,
@@ -9081,8 +9260,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
- MVT SrcVT = Op1.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
+ MVT SrcVT = Op1.getSimpleValueType();
// If second operand is smaller, extend it first.
if (SrcVT.bitsLT(VT)) {
@@ -9158,7 +9337,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
// Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
@@ -9168,8 +9347,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
//
-SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
if (!Subtarget->hasSSE41())
@@ -9294,7 +9473,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
unsigned NumOperands = 0;
// Truncate operations may prevent the merge of the SETCC instruction
- // and the arithmetic intruction before it. Attempt to truncate the operands
+ // and the arithmetic instruction before it. Attempt to truncate the operands
// of the arithmetic instruction and use a reduced bit-width instruction.
bool NeedTruncation = false;
SDValue ArithOp = Op;
@@ -9402,7 +9581,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: {
if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
- SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+ SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
if (EFLAGS.getNode())
return EFLAGS;
}
@@ -9638,7 +9817,7 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
@@ -9665,25 +9844,62 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue CC = Op.getOperand(2);
+ MVT VT = Op.getSimpleValueType();
+
+ assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
+ Op.getValueType().getScalarType() == MVT::i1 &&
+ "Cannot set masked compare for this operation");
+
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ SDLoc dl(Op);
+
+ bool Unsigned = false;
+ unsigned SSECC;
+ switch (SetCCOpcode) {
+ default: llvm_unreachable("Unexpected SETCC condition");
+ case ISD::SETNE: SSECC = 4; break;
+ case ISD::SETEQ: SSECC = 0; break;
+ case ISD::SETUGT: Unsigned = true;
+ case ISD::SETGT: SSECC = 6; break; // NLE
+ case ISD::SETULT: Unsigned = true;
+ case ISD::SETLT: SSECC = 1; break;
+ case ISD::SETUGE: Unsigned = true;
+ case ISD::SETGE: SSECC = 5; break; // NLT
+ case ISD::SETULE: Unsigned = true;
+ case ISD::SETLE: SSECC = 2; break;
+ }
+ unsigned Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getConstant(SSECC, MVT::i8));
+
+}
+
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- SDValue Cond;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
+ bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
#ifndef NDEBUG
- MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
+ MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
-
+ unsigned Opc = X86ISD::CMPP;
+ if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ assert(VT.getVectorNumElements() <= 16);
+ Opc = X86ISD::CMPM;
+ }
// In the two special cases we can't handle, emit two comparisons.
if (SSECC == 8) {
unsigned CC0, CC1;
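// A minimal standalone sketch (not LLVM code) of the predicate mapping used by
// the LowerIntVSETCC_AVX512 switch above.  The immediates follow the VPCMP
// encoding (0=EQ, 1=LT, 2=LE, 4=NE, 5=NLT, 6=NLE); unsigned predicates pick the
// CMPMU form, signed ones CMPM.  The enum is only a stand-in for ISD::CondCode.
enum Cond { SETEQ, SETNE, SETGT, SETUGT, SETLT, SETULT, SETGE, SETUGE, SETLE, SETULE };
struct MaskedCmp { unsigned Imm; bool IsUnsigned; };
MaskedCmp mapToVPCMP(Cond C) {
  switch (C) {
  case SETEQ:  return {0, false};
  case SETNE:  return {4, false};
  case SETGT:  return {6, false};   case SETUGT: return {6, true};
  case SETLT:  return {1, false};   case SETULT: return {1, true};
  case SETGE:  return {5, false};   case SETUGE: return {5, true};
  case SETLE:  return {2, false};   case SETULE: return {2, true};
  }
  return {0, false};   // unreachable for the enumerated values
}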
@@ -9695,14 +9911,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
}
- SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC0, MVT::i8));
- SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CC1, MVT::i8));
return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
}
// Handle all other FP comparisons here.
- return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+ return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(SSECC, MVT::i8));
}
@@ -9710,6 +9926,24 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntVSETCC(Op, DAG);
+ bool MaskResult = (VT.getVectorElementType() == MVT::i1);
+ EVT OpVT = Op1.getValueType();
+ if (Subtarget->hasAVX512()) {
+ if (Op1.getValueType().is512BitVector() ||
+ (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
+ return LowerIntVSETCC_AVX512(Op, DAG);
+
+    // In the AVX-512 architecture setcc returns a mask with i1 elements,
+    // but there is no compare instruction for i8 and i16 elements.
+    // We are not talking about 512-bit operands in this case; these
+    // types are illegal.
+ if (MaskResult &&
+ (OpVT.getVectorElementType().getSizeInBits() < 32 &&
+ OpVT.getVectorElementType().getSizeInBits() >= 8))
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+ }
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
@@ -9719,15 +9953,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
switch (SetCCOpcode) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
+ case ISD::SETEQ: Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
+ case ISD::SETGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = X86ISD::PCMPGT; Invert = true; break;
+ case ISD::SETLE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ Invert = true; break;
case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
+ case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ FlipSigns = true; break;
case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
+ case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+ FlipSigns = true; Invert = true; break;
}
// Special case: Use min/max operations for SETULE/SETUGE
@@ -9841,7 +10078,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
@@ -9885,7 +10122,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
- bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
if (X86CC == X86::COND_INVALID)
return SDValue();
@@ -10040,7 +10277,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
- MVT VT = Op.getValueType().getSimpleVT();
+ MVT VT = Op.getSimpleValueType();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
@@ -10149,15 +10386,50 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}
-SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
- MVT VT = Op->getValueType(0).getSimpleVT();
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
- MVT InVT = In.getValueType().getSimpleVT();
+ MVT InVT = In.getSimpleValueType();
+ SDLoc dl(Op);
+
+ unsigned int NumElts = VT.getVectorNumElements();
+ if (NumElts != 8 && NumElts != 16)
+ return SDValue();
+
+ if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+
+ MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
+ Constant *C = ConstantInt::get(*DAG.getContext(),
+ APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
+
+ SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+ SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(),
+ false, false, false, Alignment);
+ SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+ if (VT.is512BitVector())
+ return Brcst;
+ return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
+ if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_AVX512(Op, DAG);
+
if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16))
+ (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i16 || InVT != MVT::v16i8))
return SDValue();
if (Subtarget->hasInt256())
@@ -10502,7 +10774,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- // FIXME: Ensure alignment here
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ EVT VT = Op.getNode()->getValueType(0);
bool Is64Bit = Subtarget->is64Bit();
EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
@@ -10540,14 +10813,20 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- Flag = Chain.getValue(1);
const X86RegisterInfo *RegInfo =
static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
- Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
- SPTy).getValue(1);
+ unsigned SPReg = RegInfo->getStackRegister();
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+ Chain = SP.getValue(1);
- SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ if (Align) {
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+ }
+
+ SDValue Ops1[2] = { SP, Chain };
return DAG.getMergeValues(Ops1, 2, dl);
}
}
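// A minimal standalone sketch (not LLVM code) of the align-down step this hunk
// adds after the WIN_ALLOCA call: for a power-of-two Align, ANDing the stack
// pointer with -Align rounds it down to the requested alignment before it is
// copied back into the stack register and returned.
#include <cstdint>
uint64_t alignStackPointerDown(uint64_t SP, uint64_t Align) {
  return SP & ~(Align - 1);   // ~(Align - 1) == -(uint64_t)Align, matching the AND constant
}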
@@ -10698,6 +10977,26 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
+// getTargetVShiftByConstNode - Handle vector element shifts where the shift
+// amount is a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
+ SDValue SrcOp, uint64_t ShiftAmt,
+ SelectionDAG &DAG) {
+
+ // Check for ShiftAmt >= element width
+ if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+ if (Opc == X86ISD::VSRAI)
+ ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+ else
+ return DAG.getConstant(0, VT);
+ }
+
+ assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+ && "Unknown target vector shift-by-constant node");
+
+ return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
+}
+
// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
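// A minimal standalone sketch (not LLVM code) of the per-element folding rule
// in the new getTargetVShiftByConstNode helper when the immediate is out of
// range for the element width: arithmetic right shifts saturate the amount at
// width-1 (preserving the sign fill), while logical shifts fold to zero.
#include <cstdint>
int32_t foldSrai32(int32_t X, uint64_t Amt) {
  if (Amt >= 32) Amt = 31;        // VSRAI: clamp to element width - 1
  return X >> Amt;                // arithmetic shift keeps the sign bits
}
uint32_t foldSrli32(uint32_t X, uint64_t Amt) {
  if (Amt >= 32) return 0;        // VSRLI / VSHLI: whole result becomes 0
  return X >> Amt;
}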
@@ -10705,18 +11004,10 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
SelectionDAG &DAG) {
assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
- if (isa<ConstantSDNode>(ShAmt)) {
- // Constant may be a TargetConstant. Use a regular constant.
- uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
- switch (Opc) {
- default: llvm_unreachable("Unknown target vector shift node");
- case X86ISD::VSHLI:
- case X86ISD::VSRLI:
- case X86ISD::VSRAI:
- return DAG.getNode(Opc, dl, VT, SrcOp,
- DAG.getConstant(ShiftAmt, MVT::i32));
- }
- }
+ // Catch shift-by-constant.
+ if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+ return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+ CShAmt->getZExtValue(), DAG);
// Change opcode to non-immediate version
switch (Opc) {
@@ -10919,24 +11210,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmaxu_b:
case Intrinsic::x86_avx2_pmaxu_w:
case Intrinsic::x86_avx2_pmaxu_d:
+ case Intrinsic::x86_avx512_pmaxu_d:
+ case Intrinsic::x86_avx512_pmaxu_q:
case Intrinsic::x86_sse2_pminu_b:
case Intrinsic::x86_sse41_pminuw:
case Intrinsic::x86_sse41_pminud:
case Intrinsic::x86_avx2_pminu_b:
case Intrinsic::x86_avx2_pminu_w:
case Intrinsic::x86_avx2_pminu_d:
+ case Intrinsic::x86_avx512_pminu_d:
+ case Intrinsic::x86_avx512_pminu_q:
case Intrinsic::x86_sse41_pmaxsb:
case Intrinsic::x86_sse2_pmaxs_w:
case Intrinsic::x86_sse41_pmaxsd:
case Intrinsic::x86_avx2_pmaxs_b:
case Intrinsic::x86_avx2_pmaxs_w:
case Intrinsic::x86_avx2_pmaxs_d:
+ case Intrinsic::x86_avx512_pmaxs_d:
+ case Intrinsic::x86_avx512_pmaxs_q:
case Intrinsic::x86_sse41_pminsb:
case Intrinsic::x86_sse2_pmins_w:
case Intrinsic::x86_sse41_pminsd:
case Intrinsic::x86_avx2_pmins_b:
case Intrinsic::x86_avx2_pmins_w:
- case Intrinsic::x86_avx2_pmins_d: {
+ case Intrinsic::x86_avx2_pmins_d:
+ case Intrinsic::x86_avx512_pmins_d:
+ case Intrinsic::x86_avx512_pmins_q: {
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -10946,6 +11245,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmaxu_b:
case Intrinsic::x86_avx2_pmaxu_w:
case Intrinsic::x86_avx2_pmaxu_d:
+ case Intrinsic::x86_avx512_pmaxu_d:
+ case Intrinsic::x86_avx512_pmaxu_q:
Opcode = X86ISD::UMAX;
break;
case Intrinsic::x86_sse2_pminu_b:
@@ -10954,6 +11255,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pminu_b:
case Intrinsic::x86_avx2_pminu_w:
case Intrinsic::x86_avx2_pminu_d:
+ case Intrinsic::x86_avx512_pminu_d:
+ case Intrinsic::x86_avx512_pminu_q:
Opcode = X86ISD::UMIN;
break;
case Intrinsic::x86_sse41_pmaxsb:
@@ -10962,6 +11265,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmaxs_b:
case Intrinsic::x86_avx2_pmaxs_w:
case Intrinsic::x86_avx2_pmaxs_d:
+ case Intrinsic::x86_avx512_pmaxs_d:
+ case Intrinsic::x86_avx512_pmaxs_q:
Opcode = X86ISD::SMAX;
break;
case Intrinsic::x86_sse41_pminsb:
@@ -10970,6 +11275,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_pmins_b:
case Intrinsic::x86_avx2_pmins_w:
case Intrinsic::x86_avx2_pmins_d:
+ case Intrinsic::x86_avx512_pmins_d:
+ case Intrinsic::x86_avx512_pmins_q:
Opcode = X86ISD::SMIN;
break;
}
@@ -10982,10 +11289,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_sse2_max_pd:
case Intrinsic::x86_avx_max_ps_256:
case Intrinsic::x86_avx_max_pd_256:
+ case Intrinsic::x86_avx512_max_ps_512:
+ case Intrinsic::x86_avx512_max_pd_512:
case Intrinsic::x86_sse_min_ps:
case Intrinsic::x86_sse2_min_pd:
case Intrinsic::x86_avx_min_ps_256:
- case Intrinsic::x86_avx_min_pd_256: {
+ case Intrinsic::x86_avx_min_pd_256:
+ case Intrinsic::x86_avx512_min_ps_512:
+ case Intrinsic::x86_avx512_min_pd_512: {
unsigned Opcode;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -10993,12 +11304,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_sse2_max_pd:
case Intrinsic::x86_avx_max_ps_256:
case Intrinsic::x86_avx_max_pd_256:
+ case Intrinsic::x86_avx512_max_ps_512:
+ case Intrinsic::x86_avx512_max_pd_512:
Opcode = X86ISD::FMAX;
break;
case Intrinsic::x86_sse_min_ps:
case Intrinsic::x86_sse2_min_pd:
case Intrinsic::x86_avx_min_ps_256:
case Intrinsic::x86_avx_min_pd_256:
+ case Intrinsic::x86_avx512_min_ps_512:
+ case Intrinsic::x86_avx512_min_pd_512:
Opcode = X86ISD::FMIN;
break;
}
@@ -11069,7 +11384,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
// Operands intentionally swapped. Mask is last operand to intrinsic,
- // but second operand for node/intruction.
+ // but second operand for node/instruction.
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
@@ -11144,6 +11459,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_kortestz:
+ case Intrinsic::x86_avx512_kortestc: {
+ unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+ SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+ SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+ SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+ SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+ return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+ }
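For reference, the COND_E/COND_B choice above mirrors the flags KORTESTW is assumed to set: ZF when the OR of the two masks is all zeros, CF when it is all ones. A minimal standalone scalar sketch of that assumed behaviour (helper names are illustrative, not from the patch):

#include <cstdint>

// Scalar model of the assumed KORTESTW flag semantics on 16-bit masks.
// kortestz reads ZF (hence COND_E); kortestc reads CF (hence COND_B).
static bool KortestZ16(uint16_t A, uint16_t B) { return (A | B) == 0x0000; }
static bool KortestC16(uint16_t A, uint16_t B) { return (A | B) == 0xFFFF; }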
// SSE/AVX shift intrinsics
case Intrinsic::x86_sse2_psll_w:
@@ -11338,7 +11663,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_fma_vfmaddsub_ps_256:
case Intrinsic::x86_fma_vfmaddsub_pd_256:
case Intrinsic::x86_fma_vfmsubadd_ps_256:
- case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ case Intrinsic::x86_fma_vfmadd_ps_512:
+ case Intrinsic::x86_fma_vfmadd_pd_512:
+ case Intrinsic::x86_fma_vfmsub_ps_512:
+ case Intrinsic::x86_fma_vfmsub_pd_512:
+ case Intrinsic::x86_fma_vfnmadd_ps_512:
+ case Intrinsic::x86_fma_vfnmadd_pd_512:
+ case Intrinsic::x86_fma_vfnmsub_ps_512:
+ case Intrinsic::x86_fma_vfnmsub_pd_512:
+ case Intrinsic::x86_fma_vfmaddsub_ps_512:
+ case Intrinsic::x86_fma_vfmaddsub_pd_512:
+ case Intrinsic::x86_fma_vfmsubadd_ps_512:
+ case Intrinsic::x86_fma_vfmsubadd_pd_512: {
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
@@ -11346,36 +11683,48 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::x86_fma_vfmadd_pd:
case Intrinsic::x86_fma_vfmadd_ps_256:
case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_vfmadd_ps_512:
+ case Intrinsic::x86_fma_vfmadd_pd_512:
Opc = X86ISD::FMADD;
break;
case Intrinsic::x86_fma_vfmsub_ps:
case Intrinsic::x86_fma_vfmsub_pd:
case Intrinsic::x86_fma_vfmsub_ps_256:
case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_vfmsub_ps_512:
+ case Intrinsic::x86_fma_vfmsub_pd_512:
Opc = X86ISD::FMSUB;
break;
case Intrinsic::x86_fma_vfnmadd_ps:
case Intrinsic::x86_fma_vfnmadd_pd:
case Intrinsic::x86_fma_vfnmadd_ps_256:
case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_vfnmadd_ps_512:
+ case Intrinsic::x86_fma_vfnmadd_pd_512:
Opc = X86ISD::FNMADD;
break;
case Intrinsic::x86_fma_vfnmsub_ps:
case Intrinsic::x86_fma_vfnmsub_pd:
case Intrinsic::x86_fma_vfnmsub_ps_256:
case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_vfnmsub_ps_512:
+ case Intrinsic::x86_fma_vfnmsub_pd_512:
Opc = X86ISD::FNMSUB;
break;
case Intrinsic::x86_fma_vfmaddsub_ps:
case Intrinsic::x86_fma_vfmaddsub_pd:
case Intrinsic::x86_fma_vfmaddsub_ps_256:
case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_vfmaddsub_ps_512:
+ case Intrinsic::x86_fma_vfmaddsub_pd_512:
Opc = X86ISD::FMADDSUB;
break;
case Intrinsic::x86_fma_vfmsubadd_ps:
case Intrinsic::x86_fma_vfmsubadd_pd:
case Intrinsic::x86_fma_vfmsubadd_ps_256:
case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ case Intrinsic::x86_fma_vfmsubadd_ps_512:
+ case Intrinsic::x86_fma_vfmsubadd_pd_512:
Opc = X86ISD::FMSUBADD;
break;
}
@@ -11386,7 +11735,87 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
}
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget * Subtarget) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget * Subtarget) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ if (Src.getOpcode() == ISD::UNDEF)
+ Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ return SDValue(Res, 1);
+}
+
+static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain) {
+ SDLoc dl(Op);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ assert(C && "Invalid scale type");
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+ SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ EVT MaskVT = MVT::getVectorVT(MVT::i1,
+ Index.getValueType().getVectorNumElements());
+ SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ return SDValue(Res, 1);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -11421,7 +11850,144 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
-
+  // int_gather(index, base, scale);
+ case Intrinsic::x86_avx512_gather_qpd_512:
+ case Intrinsic::x86_avx512_gather_qps_512:
+ case Intrinsic::x86_avx512_gather_dpd_512:
+ case Intrinsic::x86_avx512_gather_qpi_512:
+ case Intrinsic::x86_avx512_gather_qpq_512:
+ case Intrinsic::x86_avx512_gather_dpq_512:
+ case Intrinsic::x86_avx512_gather_dps_512:
+ case Intrinsic::x86_avx512_gather_dpi_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
+ case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
+ case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Index = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Scale = Op.getOperand(4);
+ return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
+ }
+  // int_gather_mask(v1, mask, index, base, scale);
+ case Intrinsic::x86_avx512_gather_qps_mask_512:
+ case Intrinsic::x86_avx512_gather_qpd_mask_512:
+ case Intrinsic::x86_avx512_gather_dpd_mask_512:
+ case Intrinsic::x86_avx512_gather_dps_mask_512:
+ case Intrinsic::x86_avx512_gather_qpi_mask_512:
+ case Intrinsic::x86_avx512_gather_qpq_mask_512:
+ case Intrinsic::x86_avx512_gather_dpi_mask_512:
+ case Intrinsic::x86_avx512_gather_dpq_mask_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx512_gather_qps_mask_512:
+ Opc = X86::VGATHERQPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpd_mask_512:
+ Opc = X86::VGATHERQPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpd_mask_512:
+ Opc = X86::VGATHERDPDZrm; break;
+ case Intrinsic::x86_avx512_gather_dps_mask_512:
+ Opc = X86::VGATHERDPSZrm; break;
+ case Intrinsic::x86_avx512_gather_qpi_mask_512:
+ Opc = X86::VPGATHERQDZrm; break;
+ case Intrinsic::x86_avx512_gather_qpq_mask_512:
+ Opc = X86::VPGATHERQQZrm; break;
+ case Intrinsic::x86_avx512_gather_dpi_mask_512:
+ Opc = X86::VPGATHERDDZrm; break;
+ case Intrinsic::x86_avx512_gather_dpq_mask_512:
+ Opc = X86::VPGATHERDQZrm; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Base = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+ Subtarget);
+ }
+  // int_scatter(base, index, v1, scale);
+ case Intrinsic::x86_avx512_scatter_qpd_512:
+ case Intrinsic::x86_avx512_scatter_qps_512:
+ case Intrinsic::x86_avx512_scatter_dpd_512:
+ case Intrinsic::x86_avx512_scatter_qpi_512:
+ case Intrinsic::x86_avx512_scatter_qpq_512:
+ case Intrinsic::x86_avx512_scatter_dpq_512:
+ case Intrinsic::x86_avx512_scatter_dps_512:
+ case Intrinsic::x86_avx512_scatter_dpi_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx512_scatter_qpd_512:
+ Opc = X86::VSCATTERQPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qps_512:
+ Opc = X86::VSCATTERQPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpd_512:
+ Opc = X86::VSCATTERDPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_dps_512:
+ Opc = X86::VSCATTERDPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpi_512:
+ Opc = X86::VPSCATTERQDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpq_512:
+ Opc = X86::VPSCATTERQQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpq_512:
+ Opc = X86::VPSCATTERDQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpi_512:
+ Opc = X86::VPSCATTERDDZmr; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Base = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Src = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
+ }
+  // int_scatter_mask(base, mask, index, v1, scale);
+ case Intrinsic::x86_avx512_scatter_qps_mask_512:
+ case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+ case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+ case Intrinsic::x86_avx512_scatter_dps_mask_512:
+ case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+ case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+ case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+ case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+ Opc = X86::VSCATTERQPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qps_mask_512:
+ Opc = X86::VSCATTERQPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+ Opc = X86::VSCATTERDPDZmr; break;
+ case Intrinsic::x86_avx512_scatter_dps_mask_512:
+ Opc = X86::VSCATTERDPSZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+ Opc = X86::VPSCATTERQDZmr; break;
+ case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+ Opc = X86::VPSCATTERQQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpq_mask_512:
+ Opc = X86::VPSCATTERDQZmr; break;
+ case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+ Opc = X86::VPSCATTERDDZmr; break;
+ }
+ SDValue Chain = Op.getOperand(0);
+ SDValue Base = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Src = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ }
// XTEST intrinsics.
case Intrinsic::x86_xtest: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -11913,8 +12479,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
- assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
- "Only know how to lower V2I64/V4I64 multiply");
+ assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+ "Only know how to lower V2I64/V4I64/V8I64 multiply");
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
@@ -11927,13 +12493,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// AhiBlo = psllqi(AhiBlo, 32);
// return AloBlo + AloBhi + AhiBlo;
- SDValue ShAmt = DAG.getConstant(32, MVT::i32);
-
- SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
- SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
+ SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+ SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
// Bit cast to 32-bit vectors for MULUDQ
- EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+ EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+ (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
@@ -11943,14 +12508,14 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
- AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
- AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
+ AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+ AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
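For reference, the per-element identity behind the PMULUDQ sequence above (a 64-bit product assembled from 32-bit partial products, with the Ahi*Bhi term dropped because it only affects bits >= 64) can be checked with a small standalone scalar sketch; the helper name is illustrative only:

#include <cassert>
#include <cstdint>

// 64x64->64 multiply built from 32-bit halves, mirroring AloBlo/AloBhi/AhiBlo.
static uint64_t Mul64Via32(uint64_t A, uint64_t B) {
  uint64_t Ahi = A >> 32, Bhi = B >> 32;                  // psrlqi(a/b, 32)
  uint64_t AloBlo = (uint64_t)(uint32_t)A * (uint32_t)B;  // pmuludq terms
  uint64_t AloBhi = (uint64_t)(uint32_t)A * (uint32_t)Bhi;
  uint64_t AhiBlo = (uint64_t)(uint32_t)Ahi * (uint32_t)B;
  return AloBlo + (AloBhi << 32) + (AhiBlo << 32);        // psllqi + adds
}

// Sanity check of the identity:
//   assert(Mul64Via32(0x1234567890ULL, 0xfedcba987ULL) ==
//          0x1234567890ULL * 0xfedcba987ULL);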
-SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT EltTy = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
@@ -11972,16 +12537,26 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
if ((SplatValue != 0) &&
(SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
- unsigned lg2 = SplatValue.countTrailingZeros();
+ unsigned Lg2 = SplatValue.countTrailingZeros();
// Splat the sign bit.
- SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
- SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
+ SmallVector<SDValue, 16> Sz(NumElts,
+ DAG.getConstant(EltTy.getSizeInBits() - 1,
+ EltTy));
+ SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
+ NumElts));
// Add (N0 < 0) ? abs2 - 1 : 0;
- SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
- SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
+ SmallVector<SDValue, 16> Amt(NumElts,
+ DAG.getConstant(EltTy.getSizeInBits() - Lg2,
+ EltTy));
+ SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
+ NumElts));
SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
- SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
- SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
+ SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
+ SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
+ DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
+ NumElts));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
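The shift sequence above is the usual round-toward-zero signed divide by a power of two: add 2^K - 1 to negative dividends, then arithmetic-shift right by K. A standalone scalar sketch of the same steps (illustrative only; assumes 32-bit elements, 1 <= K <= 31, and an arithmetic right shift for signed values):

#include <cstdint>

// Signed divide by 2^K using only shifts and an add, rounding toward zero.
static int32_t SDivPow2(int32_t X, unsigned K) {
  int32_t Sgn = X >> 31;                     // splat the sign bit: 0 or -1
  uint32_t Bias = (uint32_t)Sgn >> (32 - K); // 2^K - 1 if X < 0, else 0
  return (X + (int32_t)Bias) >> K;           // arithmetic shift right by K
}
// A divisor of -2^K additionally requires negating the result, as the code
// above does.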
@@ -12010,23 +12585,26 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
- (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+ (Subtarget->hasAVX512() &&
+ (VT == MVT::v8i64 || VT == MVT::v16i32))) {
if (Op.getOpcode() == ISD::SHL)
- return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+ DAG);
if (Op.getOpcode() == ISD::SRL)
- return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+ DAG);
if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
- return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+ DAG);
}
if (VT == MVT::v16i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+ MVT::v8i16, R, ShiftAmt,
+ DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
SmallVector<SDValue, 16> V(16,
@@ -12037,8 +12615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+ MVT::v8i16, R, ShiftAmt,
+ DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
SmallVector<SDValue, 16> V(16,
@@ -12069,8 +12648,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
if (Subtarget->hasInt256() && VT == MVT::v32i8) {
if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
- SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+ MVT::v16i16, R, ShiftAmt,
+ DAG);
SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
// Zero out the rightmost bits.
SmallVector<SDValue, 32> V(32,
@@ -12081,8 +12661,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
if (Op.getOpcode() == ISD::SRL) {
// Make a large shift.
- SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+ MVT::v16i16, R, ShiftAmt,
+ DAG);
SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
// Zero out the leftmost bits.
SmallVector<SDValue, 32> V(32,
@@ -12147,14 +12728,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
default:
llvm_unreachable("Unknown shift opcode!");
case ISD::SHL:
- return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+ DAG);
case ISD::SRL:
- return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+ DAG);
case ISD::SRA:
- return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
- DAG.getConstant(ShiftAmt, MVT::i32));
+ return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+ DAG);
}
}
@@ -12172,7 +12753,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
VT == MVT::v4i32 || VT == MVT::v8i16 ||
(Subtarget->hasInt256() &&
((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
- VT == MVT::v8i32 || VT == MVT::v16i16))) {
+ VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+ (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
SDValue BaseShAmt;
EVT EltVT = VT.getVectorElementType();
@@ -12240,6 +12822,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRA:
@@ -12249,6 +12833,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
case MVT::v8i16:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
}
case ISD::SRL:
@@ -12260,6 +12846,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
case MVT::v4i64:
case MVT::v8i32:
case MVT::v16i16:
+ case MVT::v16i32:
+ case MVT::v8i64:
return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
}
}
@@ -12268,7 +12856,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
if (!Subtarget->is64Bit() &&
- (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+ (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+ (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
@@ -12297,7 +12886,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+ SelectionDAG &DAG) {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -12316,6 +12906,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
if (V.getNode())
return V;
+ if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+ return Op;
// AVX2 has VPSLLV/VPSRAV/VPSRLV.
if (Subtarget->hasInt256()) {
if (Op.getOpcode() == ISD::SRL &&
@@ -12356,8 +12948,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// r = VSELECT(r, psllw(r & (char16)15, 4), a);
SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
- M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
- DAG.getConstant(4, MVT::i32), DAG);
+ M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
M = DAG.getNode(ISD::BITCAST, dl, VT, M);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
@@ -12368,8 +12959,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// r = VSELECT(r, psllw(r & (char16)63, 2), a);
M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
- M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
- DAG.getConstant(2, MVT::i32), DAG);
+ M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
M = DAG.getNode(ISD::BITCAST, dl, VT, M);
R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
@@ -12512,7 +13102,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
- SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
switch (VT.getSimpleVT().SimpleTy) {
default: return SDValue();
@@ -12546,24 +13135,34 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
// fall through
case MVT::v4i32:
case MVT::v8i16: {
- // (sext (vzext x)) -> (vsext x)
SDValue Op0 = Op.getOperand(0);
SDValue Op00 = Op0.getOperand(0);
SDValue Tmp1;
// Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
if (Op0.getOpcode() == ISD::BITCAST &&
- Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
- Tmp1 = LowerVectorIntExtend(Op00, DAG);
- if (Tmp1.getNode()) {
- SDValue Tmp1Op0 = Tmp1.getOperand(0);
- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
- "This optimization is invalid without a VZEXT.");
- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ // (sext (vzext x)) -> (vsext x)
+ Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
+ if (Tmp1.getNode()) {
+ EVT ExtraEltVT = ExtraVT.getVectorElementType();
+ // This folding is only valid when the in-reg type is a vector of i8,
+ // i16, or i32.
+ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+ ExtraEltVT == MVT::i32) {
+ SDValue Tmp1Op0 = Tmp1.getOperand(0);
+ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+ "This optimization is invalid without a VZEXT.");
+ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+ }
+ Op0 = Tmp1;
+ }
}
// If the above didn't work, then just use Shift-Left + Shift-Right.
- Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
- return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+ Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
+ DAG);
+ return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+ DAG);
}
}
}
@@ -12655,9 +13254,10 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
}
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
- EVT SrcVT = Op.getOperand(0).getValueType();
- EVT DstVT = Op.getValueType();
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+ MVT DstVT = Op.getSimpleValueType();
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
@@ -12742,7 +13342,8 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
-SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
// For MacOSX, we want to call an alternative entry point: __sincos_stret,
@@ -12753,8 +13354,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- ArgListTy Args;
- ArgListEntry Entry;
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
@@ -12767,7 +13368,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
// the small struct {f32, f32} is returned in (eax, edx). For f64,
// the results are returned via SRet in memory.
const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
- SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
Type *RetTy = isF64
? (Type*)StructType::get(ArgTy, ArgTy, NULL)
@@ -12778,7 +13380,7 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
                       CallingConv::C, /*isTailCall=*/false,
                       /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
Callee, Args, DAG, dl);
- std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
if (isF64)
// Returned in xmm0 and xmm1.
@@ -12822,9 +13424,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
- case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
- case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
- case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+ case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
@@ -12840,7 +13442,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
- case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
case ISD::FRAME_TO_ARGS_OFFSET:
@@ -12858,7 +13461,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
- case ISD::SHL: return LowerShift(Op, DAG);
+ case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
@@ -12866,7 +13469,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
- case ISD::BITCAST: return LowerBITCAST(Op, DAG);
+ case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
@@ -12874,7 +13477,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
case ISD::SDIV: return LowerSDIV(Op, DAG);
- case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
}
}
@@ -13128,6 +13731,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CMP: return "X86ISD::CMP";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::CMPM: return "X86ISD::CMPM";
+ case X86ISD::CMPMU: return "X86ISD::CMPMU";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
case X86ISD::FSETCCsd: return "X86ISD::FSETCCsd";
@@ -13187,6 +13792,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
case X86ISD::VSEXT: return "X86ISD::VSEXT";
+ case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
+ case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM";
+ case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
@@ -13200,6 +13808,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CMPP: return "X86ISD::CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
+ case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
+ case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
@@ -13214,9 +13824,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::BLSI: return "X86ISD::BLSI";
case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
case X86ISD::BLSR: return "X86ISD::BLSR";
+ case X86ISD::BZHI: return "X86ISD::BZHI";
+ case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
+ case X86ISD::TESTM: return "X86ISD::TESTM";
+ case X86ISD::KORTEST: return "X86ISD::KORTEST";
+ case X86ISD::KTEST: return "X86ISD::KTEST";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
@@ -13239,6 +13854,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMILP: return "X86ISD::VPERMILP";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
case X86ISD::VPERMV: return "X86ISD::VPERMV";
+ case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
@@ -13422,37 +14038,46 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ MVT SVT = VT.getSimpleVT();
+
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
return false;
// FIXME: pshufb, blends, shifts.
- return (VT.getVectorNumElements() == 2 ||
+ return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
- isMOVLMask(M, VT) ||
- isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
- isPSHUFDMask(M, VT) ||
- isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
- isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
- isPALIGNRMask(M, VT, Subtarget) ||
- isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
- isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
- isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
- isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
+ isMOVLMask(M, SVT) ||
+ isSHUFPMask(M, SVT) ||
+ isPSHUFDMask(M, SVT) ||
+ isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
+ isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
+ isPALIGNRMask(M, SVT, Subtarget) ||
+ isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
+ isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
+ isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+ isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
}
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
EVT VT) const {
- unsigned NumElts = VT.getVectorNumElements();
+ if (!VT.isSimple())
+ return false;
+
+ MVT SVT = VT.getSimpleVT();
+ unsigned NumElts = SVT.getVectorNumElements();
// FIXME: This collection of masks seems suspect.
if (NumElts == 2)
return true;
- if (NumElts == 4 && VT.is128BitVector()) {
- return (isMOVLMask(Mask, VT) ||
- isCommutedMOVLMask(Mask, VT, true) ||
- isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
- isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
+ if (NumElts == 4 && SVT.is128BitVector()) {
+ return (isMOVLMask(Mask, SVT) ||
+ isCommutedMOVLMask(Mask, SVT, true) ||
+ isSHUFPMask(Mask, SVT) ||
+ isSHUFPMask(Mask, SVT, /* Commuted */ true));
}
return false;
}
@@ -15194,6 +15819,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_V8F32:
case X86::CMOV_V4F64:
case X86::CMOV_V4I64:
+ case X86::CMOV_V16F32:
+ case X86::CMOV_V8F64:
+ case X86::CMOV_V8I64:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
@@ -15642,7 +16270,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
- return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+ return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
}
/// PerformTruncateCombine - Converts truncate operation to
@@ -15749,6 +16377,44 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
+ SDValue Vec = N->getOperand(0);
+ SDLoc dl(Vec);
+ MVT VecVT = Vec.getSimpleValueType();
+ SDValue Idx = N->getOperand(1);
+ MVT EltVT = N->getSimpleValueType(0);
+
+  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) &&
+         "Unexpected operands in ExtractBitFromMaskVector");
+
+ // variable index
+ if (!isa<ConstantSDNode>(Idx)) {
+ MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
+ SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext, Idx);
+ return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
+ unsigned MaxShift = VecVT.getSizeInBits() - 1;
+ Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
+ Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec,
+ DAG.getConstant(MaxShift - IdxVal, ScalarVT));
+ Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
+ DAG.getConstant(MaxShift, ScalarVT));
+
+ if (VecVT == MVT::v16i1) {
+ Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
+ }
+ return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
+}
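For a constant index, the combine above reduces the element extract to two scalar shifts on the bit-cast mask; the same identity in standalone scalar form (illustrative only, shown for a 16-bit mask):

#include <cstdint>

// Extract bit Idx of a 16-bit mask value: shift it up to the MSB, then back
// down, leaving the requested bit in the least significant position.
static uint16_t ExtractMaskBit(uint16_t Mask, unsigned Idx) {
  const unsigned MaxShift = 15;                       // size in bits - 1
  uint16_t Up = (uint16_t)(Mask << (MaxShift - Idx)); // bit Idx now in the MSB
  return (uint16_t)(Up >> MaxShift);                  // result is 0 or 1
}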
+
/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
@@ -15759,6 +16425,11 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return NewOp;
SDValue InputVector = N->getOperand(0);
+
+ if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
+ !DCI.isBeforeLegalize())
+ return ExtractBitFromMaskVector(N, DAG);
+
// Detect whether we are trying to convert from mmx to i32 and the bitcast
// from mmx to v2i32 has a single usage.
if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
@@ -15846,24 +16517,28 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
}
/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
- SDValue RHS, SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
+static std::pair<unsigned, bool>
+matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, const X86Subtarget *Subtarget) {
if (!VT.isVector())
- return 0;
+ return std::make_pair(0, false);
+ bool NeedSplit = false;
switch (VT.getSimpleVT().SimpleTy) {
- default: return 0;
+ default: return std::make_pair(0, false);
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
if (!Subtarget->hasAVX2())
- return 0;
+ NeedSplit = true;
+ if (!Subtarget->hasAVX())
+ return std::make_pair(0, false);
+ break;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
if (!Subtarget->hasSSE2())
- return 0;
+ return std::make_pair(0, false);
}
// SSE2 has only a small subset of the operations.
@@ -15874,6 +16549,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ unsigned Opc = 0;
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
DAG.isEqualTo(RHS, Cond.getOperand(1))) {
@@ -15881,16 +16557,16 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
default: break;
case ISD::SETULT:
case ISD::SETULE:
- return hasUnsigned ? X86ISD::UMIN : 0;
+ Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
case ISD::SETUGT:
case ISD::SETUGE:
- return hasUnsigned ? X86ISD::UMAX : 0;
+ Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
case ISD::SETLT:
case ISD::SETLE:
- return hasSigned ? X86ISD::SMIN : 0;
+ Opc = hasSigned ? X86ISD::SMIN : 0; break;
case ISD::SETGT:
case ISD::SETGE:
- return hasSigned ? X86ISD::SMAX : 0;
+ Opc = hasSigned ? X86ISD::SMAX : 0; break;
}
// Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -15899,20 +16575,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
default: break;
case ISD::SETULT:
case ISD::SETULE:
- return hasUnsigned ? X86ISD::UMAX : 0;
+ Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
case ISD::SETUGT:
case ISD::SETUGE:
- return hasUnsigned ? X86ISD::UMIN : 0;
+ Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
case ISD::SETLT:
case ISD::SETLE:
- return hasSigned ? X86ISD::SMAX : 0;
+ Opc = hasSigned ? X86ISD::SMAX : 0; break;
case ISD::SETGT:
case ISD::SETGE:
- return hasSigned ? X86ISD::SMIN : 0;
+ Opc = hasSigned ? X86ISD::SMIN : 0; break;
}
}
- return 0;
+ return std::make_pair(Opc, NeedSplit);
}
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
@@ -15926,13 +16602,14 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
EVT VT = LHS.getValueType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
- VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ VT != MVT::f80 && TLI.isTypeLegal(VT) &&
(Subtarget->hasSSE2() ||
(Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -16071,6 +16748,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
+ EVT CondVT = Cond.getValueType();
+ if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+ CondVT.getVectorElementType() == MVT::i1) {
+ // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+ // lowering on AVX-512. In this case we convert it to
+      // v16i8 (select v16i8, v16i8, v16i8) and use the AVX instruction.
+      // The same applies to all 128- and 256-bit vectors of i8 and i16.
+ EVT OpVT = LHS.getValueType();
+ if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+ (OpVT.getVectorElementType() == MVT::i8 ||
+ OpVT.getVectorElementType() == MVT::i16)) {
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+ DCI.AddToWorklist(Cond.getNode());
+ return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+ }
+ }
// If this is a select between two integer constants, try to do some
// optimizations.
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
@@ -16195,9 +16888,12 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // Early exit check
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
// Match VSELECTs into subs with unsigned saturation.
- if (!DCI.isBeforeLegalize() &&
- N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
(Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
@@ -16251,14 +16947,35 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
// Try to match a min/max vector operation.
- if (!DCI.isBeforeLegalize() &&
- N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
- if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
- return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
+    std::pair<unsigned, bool> Ret =
+        matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
+    unsigned Opc = Ret.first;
+    bool NeedSplit = Ret.second;
+
+ if (Opc && NeedSplit) {
+ unsigned NumElems = VT.getVectorNumElements();
+ // Extract the LHS vectors
+ SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
+ SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
+
+ // Extract the RHS vectors
+ SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
+ SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
+
+ // Create min/max for each subvector
+ LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
+ RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
+
+ // Merge the result
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
+ } else if (Opc)
+ return DAG.getNode(Opc, DL, VT, LHS, RHS);
+ }
// Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
- if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
- Cond.getOpcode() == ISD::SETCC) {
+ if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // Check if SETCC has already been promoted
+ TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
assert(Cond.getValueType().isVector() &&
"vector select expects a vector selector!");
@@ -16305,7 +17022,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
// to simplify previous instructions.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
@@ -16314,6 +17030,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
if (BitWidth == 1)
return SDValue();
+  // Check all uses of the condition operand to see whether it will be
+  // consumed by non-BLEND instructions, which may depend on all bits being
+  // set properly.
+ for (SDNode::use_iterator I = Cond->use_begin(),
+ E = Cond->use_end(); I != E; ++I)
+ if (I->getOpcode() != ISD::VSELECT)
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ return SDValue();
+
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
@@ -16990,33 +17715,80 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
- // Create BLSI, and BLSR instructions
+ // Create BLSI, BLSR, and BZHI instructions
// BLSI is X & (-X)
// BLSR is X & (X-1)
- if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+ // BZHI is X & ((1 << Y) - 1)
+ // BEXTR is ((X >> imm) & (2**size-1))
+ if (VT == MVT::i32 || VT == MVT::i64) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
- // Check LHS for neg
- if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
- isZero(N0.getOperand(0)))
- return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
- // Check RHS for neg
- if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
- isZero(N1.getOperand(0)))
- return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+ if (Subtarget->hasBMI()) {
+ // Check LHS for neg
+ if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+ isZero(N0.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+ // Check RHS for neg
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+ isZero(N1.getOperand(0)))
+ return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+ // Check LHS for X-1
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+ isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+ // Check RHS for X-1
+ if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+ isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+ }
+
+ if (Subtarget->hasBMI2()) {
+ // Check for (and (add (shl 1, Y), -1), X)
+ if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::SHL) {
+ SDValue N001 = N00.getOperand(1);
+ assert(N001.getValueType() == MVT::i8 && "unexpected type");
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
+ if (C && C->getZExtValue() == 1)
+ return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
+ }
+ }
- // Check LHS for X-1
- if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
- isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+ // Check for (and X, (add (shl 1, Y), -1))
+ if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getOpcode() == ISD::SHL) {
+ SDValue N101 = N10.getOperand(1);
+ assert(N101.getValueType() == MVT::i8 && "unexpected type");
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
+ if (C && C->getZExtValue() == 1)
+ return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
+ }
+ }
+ }
- // Check RHS for X-1
- if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
- isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+ // Check for BEXTR.
+ if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+ (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+ ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (MaskNode && ShiftNode) {
+ uint64_t Mask = MaskNode->getZExtValue();
+ uint64_t Shift = ShiftNode->getZExtValue();
+ if (isMask_64(Mask)) {
+ uint64_t MaskSize = CountPopulation_64(Mask);
+ if (Shift + MaskSize <= VT.getSizeInBits())
+ return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Shift | (MaskSize << 8), VT));
+ }
+ }
+ } // BEXTR
return SDValue();
}
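For reference, the matches above correspond to simple scalar identities; a standalone sketch of the assumed BLSI/BLSR/BZHI/BEXTR behaviour, including the Shift | (MaskSize << 8) control packing built for BEXTR (helper names are illustrative, not from the patch):

#include <cstdint>

// BLSI: isolate the lowest set bit, X & (-X).
static uint64_t Blsi64(uint64_t X) { return X & (0 - X); }
// BLSR: clear the lowest set bit, X & (X - 1).
static uint64_t Blsr64(uint64_t X) { return X & (X - 1); }

// BZHI: keep only the low Y bits of X; Y >= 64 keeps everything.
static uint64_t Bzhi64(uint64_t X, unsigned Y) {
  return Y >= 64 ? X : (X & ((1ULL << Y) - 1));
}

// BEXTR: extract Len bits of X starting at bit Start (Start < 64). The combine
// above packs its control operand as Start | (Len << 8).
static uint64_t Bextr64(uint64_t X, unsigned Start, unsigned Len) {
  if (Len == 0)
    return 0;
  uint64_t Mask = Len >= 64 ? ~0ULL : ((1ULL << Len) - 1);
  return (X >> Start) & Mask;
}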
@@ -17732,7 +18504,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
return false;
- MVT VT = LHS.getValueType().getSimpleVT();
+ MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
@@ -18205,7 +18977,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!XTLI->getSubtarget()->is64Bit() &&
- !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ VT == MVT::i64) {
SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
Ld->getChain(), Op0, DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
@@ -18531,6 +19303,22 @@ namespace {
const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
}
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+ if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+ if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+ std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+ if (AsmPieces.size() == 3)
+ return true;
+ else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+ return true;
+ }
+ }
+ return false;
+}
+
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
@@ -18572,12 +19360,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (AsmPieces.size() == 4 &&
- AsmPieces[0] == "~{cc}" &&
- AsmPieces[1] == "~{dirflag}" &&
- AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}")
- return IntrinsicLowering::LowerToByteSwap(CI);
+ if (clobbersFlagRegisters(AsmPieces))
+ return IntrinsicLowering::LowerToByteSwap(CI);
}
break;
case 3:
@@ -18590,11 +19374,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (AsmPieces.size() == 4 &&
- AsmPieces[0] == "~{cc}" &&
- AsmPieces[1] == "~{dirflag}" &&
- AsmPieces[2] == "~{flags}" &&
- AsmPieces[3] == "~{fpsr}")
+ if (clobbersFlagRegisters(AsmPieces))
return IntrinsicLowering::LowerToByteSwap(CI);
}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2703274..bc3dd60 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -53,7 +53,7 @@ namespace llvm {
/// to X86::XORPS or X86::XORPD.
FXOR,
- /// FAND - Bitwise logical ANDNOT of floating point values. This
+ /// FANDN - Bitwise logical ANDNOT of floating point values. This
/// corresponds to X86::ANDNPS or X86::ANDNPD.
FANDN,
@@ -254,6 +254,12 @@ namespace llvm {
// VSEXT - Vector integer signed-extend.
VSEXT,
+ // VTRUNC - Vector integer truncate.
+ VTRUNC,
+
+      // VTRUNCM - Vector integer truncate with mask.
+ VTRUNCM,
+
// VFPEXT - Vector FP extend.
VFPEXT,
@@ -274,6 +280,13 @@ namespace llvm {
// PCMP* - Vector integer comparisons.
PCMPEQ, PCMPGT,
+      // PCMP*M - Vector integer comparisons; the result is in a mask vector.
+ PCMPEQM, PCMPGTM,
+
+ /// CMPM, CMPMU - Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ CMPMU,
// ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
ADD, SUB, ADC, SBB, SMUL,
@@ -282,18 +295,23 @@ namespace llvm {
BLSI, // BLSI - Extract lowest set isolated bit
BLSMSK, // BLSMSK - Get mask up to lowest set bit
BLSR, // BLSR - Reset lowest set bit
+ BZHI, // BZHI - Zero high bits
+ BEXTR, // BEXTR - Bit field extract
UMUL, // LOW, HI, FLAGS = umul LHS, RHS
// MUL_IMM - X86 specific multiply by immediate.
MUL_IMM,
- // PTEST - Vector bitwise comparisons
+ // PTEST - Vector bitwise comparisons.
PTEST,
- // TESTP - Vector packed fp sign bitwise comparisons
+ // TESTP - Vector packed fp sign bitwise comparisons.
TESTP,
+      // TESTM - Vector "test" in AVX-512; the result is in a mask vector.
+ TESTM,
+
// OR/AND test for masks
KORTEST,
KTEST,
@@ -318,11 +336,13 @@ namespace llvm {
UNPCKH,
VPERMILP,
VPERMV,
+ VPERMV3,
VPERMI,
VPERM2X128,
VBROADCAST,
// masked broadcast
VBROADCASTM,
+ VINSERT,
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
@@ -755,6 +775,8 @@ namespace llvm {
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG) const;
+ virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const LLVM_OVERRIDE;
+
/// \brief Reset the operation actions based on target options.
virtual void resetOperationActions();
@@ -831,8 +853,6 @@ namespace llvm {
bool isSigned,
bool isReplace) const;
- SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, SDLoc dl,
- SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
@@ -846,18 +866,12 @@ namespace llvm {
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerZERO_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
@@ -881,19 +895,7 @@ namespace llvm {
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
-
- // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
- SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
- SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
- SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
@@ -925,6 +927,8 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const;
+ virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const;
+
/// Utility function to emit atomic-load-arith operations (and, or, xor,
/// nand, max, min, umax, umin). It takes the corresponding instruction to
/// expand, the associated machine basic block, and the associated X86
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 8abae14..cb19fbd 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -80,6 +80,21 @@ let Predicates = [HasAVX512] in {
def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))), (v16i16 VR256X:$src)>;
}
+//
+// AVX-512: the VPXOR instruction writes zeros to its upper part, so it is safe to use it to build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+ [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
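+// Any 512-bit all-zeros constant, regardless of element type, is selected
+// through this single rematerializable pseudo.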
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
@@ -376,6 +391,11 @@ def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
(VBROADCASTSDZrm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+ (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+ (VBROADCASTSDZrm addr:$src)>;
+
multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
RegisterClass SrcRC, RegisterClass KRC> {
def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
@@ -402,6 +422,13 @@ def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
(VPBROADCASTDrZrr GR32:$src)>;
def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
(VPBROADCASTQrZrr GR64:$src)>;
+def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
+ (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
+ (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
+ (VPBROADCASTQrZrr GR64:$src)>;
multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, PatFrag ld_frag,
@@ -414,10 +441,11 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
VR128X:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+ "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
[(set DstRC:$dst,
(OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
EVEX, EVEX_KZ;
+ let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst,
@@ -425,9 +453,10 @@ multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
x86memop:$src),
!strconcat(OpcodeStr,
- "\t{$src, ${dst}{${mask}}{z}|${dst}{${mask}}{z}, $src}"),
+ "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
[(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask,
(ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+ }
}
defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
@@ -437,10 +466,20 @@ defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VT1>;
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
+ (VPBROADCASTDZrr VR128X:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
+ (VPBROADCASTQZrr VR128X:$src)>;
+
def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
(VBROADCASTSSZrr VR128X:$src)>;
def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
(VBROADCASTSDZrr VR128X:$src)>;
+
+def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
+ (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
+ (VBROADCASTSDZrr VR128X:$src)>;
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
@@ -473,6 +512,306 @@ defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERM
+//
+// -- immediate form --
+multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT> {
+ def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ EVEX;
+ def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64,
+ i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64,
+ f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM - register form --
+multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
+
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
+
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
+ EVEX_4V;
+}
+
+defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem,
+ v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem,
+ v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem,
+ v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem,
+ v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM2I - 3 source operands form --
+multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ PatFrag mem_frag, X86MemOperand x86memop,
+ ValueType OpVT> {
+let Constraints = "$src1 = $dst" in {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>,
+ EVEX_4V;
+
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (X86VPermv3 RC:$src1, RC:$src2,
+ (mem_frag addr:$src3))))]>, EVEX_4V;
+ }
+}
+defm VPERMI2D : avx512_perm_3src<0x76, "vpermi2d", VR512, memopv16i32, i512mem,
+ v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q : avx512_perm_3src<0x76, "vpermi2q", VR512, memopv8i64, i512mem,
+ v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps", VR512, memopv16f32, i512mem,
+ v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd", VR512, memopv8f64, i512mem,
+ v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, Intrinsic Int,
+ RegisterClass KRC, RegisterClass RC,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ SDNode OpNode, ValueType vt> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2),
+ (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+ def rr_Int : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+ [(set RC:$dst, (Int KRC:$mask, (vt RC:$src2),
+ (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+
+ let mayLoad = 1 in {
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
+ []>,
+ EVEX_4V, EVEX_K;
+
+ def rm_Int : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
+ [(set RC:$dst, (Int KRC:$mask, (vt RC:$src1),
+ (mem_frag addr:$src2)))]>,
+ EVEX_4V, EVEX_K;
+ }
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps",
+ int_x86_avx512_mskblend_ps_512,
+ VK16WM, VR512, f512mem,
+ memopv16f32, vselect, v16f32>,
+ EVEX_CD8<32, CD8VF>, EVEX_V512;
+let ExeDomain = SSEPackedDouble in
+defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd",
+ int_x86_avx512_mskblend_pd_512,
+ VK8WM, VR512, f512mem,
+ memopv8f64, vselect, v8f64>,
+ VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd",
+ int_x86_avx512_mskblend_d_512,
+ VK16WM, VR512, f512mem,
+ memopv16i32, vselect, v16i32>,
+ EVEX_CD8<32, CD8VF>, EVEX_V512;
+
+defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq",
+ int_x86_avx512_mskblend_q_512,
+ VK8WM, VR512, f512mem,
+ memopv8i64, vselect, v8i64>,
+ VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
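+// A vselect with a v8i1 mask is lowered by widening: both ymm operands are
+// placed in zmm registers, blended with the 512-bit masked blend, and the low
+// ymm half of the result is extracted again.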
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+ (v8f32 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+ (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+ SDNode OpNode, ValueType vt> {
+ def rr : AVX512BI<opc, MRMSrcReg,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rm : AVX512BI<opc, MRMSrcMem,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
+ IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+}
+
+defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem,
+ memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512;
+defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem,
+ memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;
+
+defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem,
+ memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512;
+defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem,
+ memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
+
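+// v8i32 compares that produce a v8i1 mask are widened to the 512-bit compare;
+// the resulting mask register is then reinterpreted as VK8.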
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (VPCMPGTDZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (VPCMPEQDZrr
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
+ RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+ SDNode OpNode, ValueType vt, Operand CC, string asm,
+ string asm_alt> {
+ def rri : AVX512AIi8<opc, MRMSrcReg,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))],
+ IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rmi : AVX512AIi8<opc, MRMSrcMem,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
+ imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ // Accept explicit immediate argument form instead of comparison code.
+ let neverHasSideEffects = 1 in {
+ def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ }
+}
+
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
+ X86cmpm, v16i32, AVXCC,
+ "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
+ X86cmpmu, v16i32, AVXCC,
+ "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64,
+ X86cmpm, v8i64, AVXCC,
+ "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
+ X86cmpmu, v8i64, AVXCC,
+ "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
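+// With the AVXCC operand the predicate is folded into the mnemonic (for
+// example "vpcmpled" rather than "vpcmpd" with an explicit $cc immediate);
+// the *_alt variants above accept the explicit-immediate spelling.
+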
+// avx512_cmp_packed - AVX-512 packed compare instructions, modeled on the SSE 1 & 2 packed compares
+multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
+ X86MemOperand x86memop, Operand CC,
+ SDNode OpNode, ValueType vt, string asm,
+ string asm_alt, Domain d> {
+ def rri : AVX512PIi8<0xC2, MRMSrcReg,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+ def rmi : AVX512PIi8<0xC2, MRMSrcMem,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+ [(set KRC:$dst,
+ (OpNode (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+
+ // Accept explicit immediate argument form instead of comparison code.
+ let neverHasSideEffects = 1 in {
+ def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+ asm_alt, [], d>;
+ def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+ asm_alt, [], d>;
+ }
+}
+
+defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, AVXCC, X86cmpm, v16f32,
+ "vcmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, AVXCC, X86cmpm, v8f64,
+ "vcmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "vcmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble>, OpSize, EVEX_4V, VEX_W, EVEX_V512,
+ EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VCMPPSZrri
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VPCMPDZrri
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (VPCMPUDZrri
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+ (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+
// Mask register copy, including
// - copy between mask registers
// - load/store mask registers
@@ -713,3 +1052,2475 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
(v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ string asm, Domain d> {
+let neverHasSideEffects = 1 in
+ def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+ EVEX;
+let canFoldAsLoad = 1 in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))], d>, EVEX;
+let Constraints = "$src1 = $dst" in {
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+ EVEX, EVEX_K;
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, x86memop:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ [], d>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
+ "vmovaps", SSEPackedSingle>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
+ "vmovapd", SSEPackedDouble>,
+ OpSize, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32,
+ "vmovups", SSEPackedSingle>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64,
+ "vmovupd", SSEPackedDouble>,
+ OpSize, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovaps\t{$src, $dst|$dst, $src}",
+ [(alignedstore512 (v16f32 VR512:$src), addr:$dst)],
+ SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovapd\t{$src, $dst|$dst, $src}",
+ [(alignedstore512 (v8f64 VR512:$src), addr:$dst)],
+ SSEPackedDouble>, EVEX, EVEX_V512,
+ OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovups\t{$src, $dst|$dst, $src}",
+ [(store (v16f32 VR512:$src), addr:$dst)],
+ SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+ "vmovupd\t{$src, $dst|$dst, $src}",
+ [(store (v8f64 VR512:$src), addr:$dst)],
+ SSEPackedDouble>, EVEX, EVEX_V512,
+ OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+let neverHasSideEffects = 1 in {
+ def VMOVDQA32rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src),
+ "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512;
+ def VMOVDQA64rr : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+ (ins VR512:$src),
+ "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in {
+ def VMOVDQA32mr : AVX512BI<0x7F, MRMDestMem, (outs),
+ (ins i512mem:$dst, VR512:$src),
+ "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ def VMOVDQA64mr : AVX512BI<0x7F, MRMDestMem, (outs),
+ (ins i512mem:$dst, VR512:$src),
+ "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+let mayLoad = 1 in {
+def VMOVDQA32rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src),
+ "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVDQA64rm : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
+ (ins i512mem:$src),
+ "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+}
+
+// 512-bit aligned load/store
+def : Pat<(alignedloadv8i64 addr:$src), (VMOVDQA64rm addr:$src)>;
+def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>;
+
+def : Pat<(alignedstore512 (v8i64 VR512:$src), addr:$dst),
+ (VMOVDQA64mr addr:$dst, VR512:$src)>;
+def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQA32mr addr:$dst, VR512:$src)>;
+
+multiclass avx512_mov_int<bits<8> load_opc, bits<8> store_opc, string asm,
+ RegisterClass RC, RegisterClass KRC,
+ PatFrag ld_frag, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in
+ def rr : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
+let canFoldAsLoad = 1 in
+ def rm : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (ld_frag addr:$src))]>, EVEX;
+let mayStore = 1 in
+ def mr : AVX512XSI<store_opc, MRMDestMem, (outs),
+ (ins x86memop:$dst, VR512:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
+let Constraints = "$src1 = $dst" in {
+ def rrk : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>,
+ EVEX, EVEX_K;
+ def rmk : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, x86memop:$src2),
+ !strconcat(asm,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ []>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM,
+ memopv16i32, i512mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64 : avx512_mov_int<0x6F, 0x7F, "vmovdqu64", VR512, VK8WM,
+ memopv8i64, i512mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// 512-bit unaligned load/store
+def : Pat<(loadv8i64 addr:$src), (VMOVDQU64rm addr:$src)>;
+def : Pat<(loadv16i32 addr:$src), (VMOVDQU32rm addr:$src)>;
+
+def : Pat<(store (v8i64 VR512:$src), addr:$dst),
+ (VMOVDQU64mr addr:$dst, VR512:$src)>;
+def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+ (VMOVDQU32mr addr:$dst, VR512:$src)>;
+
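+// A full-register vselect under a mask is matched to the merging (masked)
+// register-to-register move, with the false operand supplying the pass-through
+// value.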
+let AddedComplexity = 20 in {
+def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1),
+ (v16f32 VR512:$src2))),
+ (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1),
+ (v8f64 VR512:$src2))),
+ (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1),
+ (v16i32 VR512:$src2))),
+ (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1),
+ (v8i64 VR512:$src2))),
+ (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+}
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+ EVEX, VEX_LIG;
+def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+}
+def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ EVEX_CD8<64, CD8VT1>;
+
+// Move Int Doubleword to Single Scalar
+//
+let isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
+
+def VMOVDI2SSZrm : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
+ (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+ EVEX, VEX_LIG;
+def VMOVPDI2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, VR128X:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128X:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ (iPTR 0)))],
+ IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W,
+ Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs),
+ (ins i64mem:$dst, VR128X:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+ addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, OpSize, VEX_LIG, VEX_W, TB, EVEX_CD8<64, CD8VT1>,
+ Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+// Move Scalar Single to Double Int
+//
+let isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst),
+ (ins FR32X:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32X:$src))],
+ IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
+def VMOVSS2DIZmr : AVX512SI<0x7E, MRMDestMem, (outs),
+ (ins i32mem:$dst, FR32X:$src),
+ "vmovd{z}\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Quadword Int to Packed Quadword Int
+//
+def VMOVQI2PQIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_move_scalar <string asm, RegisterClass RC,
+ SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_pat> {
+ def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
+ (scalar_to_vector RC:$src2))))],
+ IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
+ def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
+ EVEX, VEX_LIG;
+ def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+ EVEX, VEX_LIG;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMOVSSZ : avx512_move_scalar<"movss{z}", FR32X, X86Movss, v4f32, f32mem,
+ loadf32>, XS, EVEX_CD8<32, CD8VT1>;
+
+let ExeDomain = SSEPackedDouble in
+defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem,
+ loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+
+// For the disassembler
+let isCodeGenOnly = 1 in {
+ def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, FR32X:$src2),
+ "movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_MOV_S_RR>,
+ XS, EVEX_4V, VEX_LIG;
+ def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, FR64X:$src2),
+ "movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_MOV_S_RR>,
+ XD, EVEX_4V, VEX_LIG, VEX_W;
+}
+
+let Predicates = [HasAVX512] in {
+ let AddedComplexity = 15 in {
+ // Move scalar to XMM zero-extended, zeroing a VR128X then do a
+ // MOVS{S,D} to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+ (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+ (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+ (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+ (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+
+ // Move low f32 and clear high bits.
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4f32 (V_SET0)),
+ (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+ // MOVSDrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+ // Represent the same patterns above but in the form they appear for
+ // 256-bit types
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+ }
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+ FR32X:$src)), sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+ FR64X:$src)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+ // Move low f64 and clear high bits.
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (V_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+ // Extract and store.
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+ (VMOVSSZrr (v4i32 VR128X:$src1),
+ (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+ def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+ (VMOVSSZrr (v4f32 VR128X:$src1),
+ (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+ // 256-bit variants
+ def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+ // 256-bit variants
+ def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+ sub_xmm)>;
+
+ def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (v2i64 (X86vzmovl
+ (v2i64 VR128X:$src))))],
+ IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i128mem:$src),
+ "vmovq{z}\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst, (v2i64 (X86vzmovl
+ (loadv2i64 addr:$src))))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W,
+ EVEX_CD8<8, CD8VT8>;
+
+let Predicates = [HasAVX512] in {
+ // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+ (VMOV64toPQIZrr GR64:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIZrr GR32:$src)>;
+
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVDI2PDIZrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVZPQILo2PQIZrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+ (VMOVZPQILo2PQIZrr VR128X:$src)>;
+ }
+
+ // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+}
+
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, PatFrag scalar_mfrag,
+ X86MemOperand x86scalar_mop, string BrdcstStr,
+ OpndItins itins, bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
+ itins.rr>, EVEX_4V;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
+ itins.rm>, EVEX_4V;
+ def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1,
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+ itins.rm>, EVEX_4V, EVEX_B;
+}
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+ ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins,
+ bit IsCommutable = 0> {
+ let isCommutable = IsCommutable in
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, VEX_W;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, VEX_W;
+}
+
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>,
+ EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32,
+ VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8,
+ EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
+ VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>;
+
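+// VPMULUDQ only reads the even (low) doubleword of each quadword lane, so the
+// v16i32 operands of the X86pmuludq node map directly onto the v8i64 result.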
+def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
+ (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+
+defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+ T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+ T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+ T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+ T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+ T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
+ PatFrag mem_frag, RegisterClass RC,
+ X86MemOperand x86memop, string asm,
+ Domain d> {
+ def rr : AVX512PI<opc, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1, RC:$src2)))],
+ d>, EVEX_4V;
+ def rm : AVX512PI<opc, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ asm, [(set RC:$dst,
+ (vt (OpNode RC:$src1,
+ (bitconvert (mem_frag addr:$src2)))))],
+ d>, EVEX_4V;
+}
+
+defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64,
+ VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64,
+ VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64,
+ VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64,
+ VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop> {
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))],
+ IIC_SSE_UNPCK>, EVEX_4V;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
+ (bitconvert (memop_frag addr:$src2)))))],
+ IIC_SSE_UNPCK>, EVEX_4V;
+}
+defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32,
+ VR512, memopv16i32, i512mem>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64,
+ VR512, memopv8i64, i512mem>, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
+ VR512, memopv16i32, i512mem>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
+ VR512, memopv8i64, i512mem>, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT> {
+ def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ EVEX;
+ def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+ i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+ memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+ memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+ VEX_W, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+ (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+ (VPERMILPDZri VR512:$src1, imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32,
+ i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64,
+ i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss{z}"), OpNode, FR32X,
+ f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd{z}"), OpNode, FR64X,
+ f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG,
+ EVEX_CD8<64, CD8VT1>;
+}
+
+let isCommutable = 1 in {
+defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>;
+defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>;
+defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>;
+defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>;
+}
+let isCommutable = 0 in {
+defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>;
+defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
+}
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag,
+ X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+ string BrdcstStr,
+ Domain d, OpndItins itins, bit commutable> {
+ let isCommutable = commutable in
+ def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+ EVEX_4V, TB;
+ let mayLoad = 1 in {
+ def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+ itins.rm, d>, EVEX_4V, TB;
+ def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1,
+ (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+ itins.rm, d>, EVEX_4V, EVEX_B, TB;
+ }
+}
+
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 1>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 1>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+ SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 0>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+ SSE_ALU_ITINS_P.d, 0>,
+ EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag,
+ SDNode OpNode, ValueType vt> {
+ def rr : AVX5128I<opc, MRMSrcReg,
+ (outs KRC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
+ def rm : AVX5128I<opc, MRMSrcMem,
+ (outs KRC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set KRC:$dst, (OpNode (vt RC:$src1),
+ (bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
+}
+
+defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
+ memopv16i32, X86testm, v16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem,
+ memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+ string OpcodeStr, SDNode OpNode, RegisterClass RC,
+ ValueType vt, X86MemOperand x86memop, PatFrag mem_frag,
+ RegisterClass KRC> {
+ def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+ (ins RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+ def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+ def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+ (ins x86memop:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src1),
+ (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+ def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+ (ins KRC:$mask, x86memop:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt, ValueType SrcVT,
+ PatFrag bc_frag, RegisterClass KRC> {
+ // src2 is always 128-bit
+ def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, VR128X:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
+ SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+ def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, VR128X:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+ def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (vt (OpNode RC:$src1,
+ (bc_frag (memopv2i64 addr:$src2)))))],
+ SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+ def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+ [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
+ VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
+ VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+ EVEX_CD8<32, CD8VQ>;
+
+defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
+ VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
+ VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+ EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
+ VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
+ VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+ EVEX_CD8<32, CD8VQ>;
+
+defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
+ VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
+ VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+ EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
+ VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
+ VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+ EVEX_CD8<32, CD8VQ>;
+
+defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
+ VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+ EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
+ VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+ EVEX_CD8<64, CD8VQ>, VEX_W;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType vt,
+ X86MemOperand x86memop, PatFrag mem_frag> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
+ EVEX_4V;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
+ EVEX_4V;
+}
+
+defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32,
+ i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64,
+ i512mem, memopv8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32,
+ i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64,
+ i512mem, memopv8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
+ i512mem, memopv16i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
+ i512mem, memopv8i64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
+ X86MemOperand x86memop, PatFrag memop_frag> {
+def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst,
+ (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+ (VMOVDDUPZrm addr:$src)>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+ ValueType vt, RegisterClass RC, PatFrag mem_frag,
+ X86MemOperand x86memop> {
+ def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
+ let mayLoad = 1 in
+ def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
+}
+
+defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))),
+ (VMOVSHDUPZrm addr:$src)>;
+def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))),
+ (VMOVSLDUPZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+
+let Predicates = [HasAVX512] in {
+ // MOVLHPS patterns
+ def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+ // MOVHLPS patterns
+ def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+ (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+ string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+ def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>;
+
+ let mayLoad = 1 in
+ def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3))))]>;
+ def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86scalar_mop:$src3),
+ !strconcat(OpcodeStr, "\t{${src3}", BrdcstStr,
+ ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmadd, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+ defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmsub, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+ defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmaddsub, v16f32>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmsubadd, v16f32>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fnmadd, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+ defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fnmsub, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmadd, v8f64>, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+ defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+ string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+ let mayLoad = 1 in
+ def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src3, x86memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>;
+ def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src3, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"),
+ [(set RC:$dst, (OpNode RC:$src1,
+ (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+
+let ExeDomain = SSEPackedSingle in {
+ defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmadd, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+ defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmsub, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+ defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmaddsub, v16f32>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fmsubadd, v16f32>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fnmadd, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+ defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem,
+ memopv16f32, f32mem, loadf32, "{1to16}",
+ X86Fnmsub, v16f32>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+ defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmadd, v8f64>, EVEX_V512,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+ defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+ defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem,
+ memopv8f64, f64mem, loadf64, "{1to8}",
+ X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+}
+
+// Scalar FMA
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ RegisterClass RC, ValueType OpVT,
+ X86MemOperand x86memop, Operand memop,
+ PatFrag mem_frag> {
+ let isCommutable = 1 in
+ def r : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+ let mayLoad = 1 in
+ def m : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1,
+ (mem_frag addr:$src3))))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+
+defm VFMADDSSZ : avx512_fma3s_rm<0xA9, "vfmadd213ss{z}", X86Fmadd, FR32X,
+ f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMADDSDZ : avx512_fma3s_rm<0xA9, "vfmadd213sd{z}", X86Fmadd, FR64X,
+ f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFMSUBSSZ : avx512_fma3s_rm<0xAB, "vfmsub213ss{z}", X86Fmsub, FR32X,
+ f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMSUBSDZ : avx512_fma3s_rm<0xAB, "vfmsub213sd{z}", X86Fmsub, FR64X,
+ f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMADDSSZ : avx512_fma3s_rm<0xAD, "vfnmadd213ss{z}", X86Fnmadd, FR32X,
+ f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMADDSDZ : avx512_fma3s_rm<0xAD, "vfnmadd213sd{z}", X86Fnmadd, FR64X,
+ f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMSUBSSZ : avx512_fma3s_rm<0xAF, "vfnmsub213ss{z}", X86Fnmsub, FR32X,
+ f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd{z}", X86Fnmsub, FR64X,
+ f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from signed integer to float/double
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src),
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+ EVEX_4V;
+} // neverHasSideEffects = 1
+}
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}{z}">,
+ XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}{z}">,
+ XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}{z}">,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}{z}">,
+ XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+ (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+ (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+ (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+ (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}{z}">,
+ XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}{z}">,
+ XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}{z}">,
+ XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}{z}">,
+ XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+ (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+ (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+ (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+ (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+ (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+ (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+ string asm> {
+let neverHasSideEffects = 1 in {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
+} // neverHasSideEffects = 1
+}
+let Predicates = [HasAVX512] in {
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
+ ssmem, sse_load_f32, "cvtss2si{z}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64,
+ ssmem, sse_load_f32, "cvtss2si{z}">,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi,
+ ssmem, sse_load_f32, "cvtss2usi{z}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+ int_x86_avx512_cvtss2usi64, ssmem,
+ sse_load_f32, "cvtss2usi{z}">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
+ sdmem, sse_load_f64, "cvtsd2si{z}">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64,
+ sdmem, sse_load_f64, "cvtsd2si{z}">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi,
+ sdmem, sse_load_f64, "cvtsd2usi{z}">,
+ XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+ int_x86_avx512_cvtsd2usi64, sdmem,
+ sse_load_f64, "cvtsd2usi{z}">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+
+defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}{z}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}{z}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}{z}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}{z}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}{z}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}{z}",
+ SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+ int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}{z}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+ int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}{z}",
+ SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si{z}">,
+ XS, EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si{z}">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si{z}">, XD,
+ EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si{z}">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                       int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
+                                       "cvttss2usi{z}">, XS, EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+ int_x86_avx512_cvttss2usi64, ssmem,
+ sse_load_f32, "cvttss2usi{z}">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+ int_x86_avx512_cvttsd2usi,
+ sdmem, sse_load_f64, "cvttsd2usi{z}">, XD,
+ EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+ int_x86_avx512_cvttsd2usi64, sdmem,
+ sse_load_f64, "cvttsd2usi{z}">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+}
+
+multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+ SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+ string asm> {
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
+}
+
+defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem,
+ loadf32, "cvttss2si{z}">, XS,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem,
+ loadf32, "cvttss2usi{z}">, XS,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem,
+ loadf32, "cvttss2si{z}">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem,
+ loadf32, "cvttss2usi{z}">, XS, VEX_W,
+ EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem,
+ loadf64, "cvttsd2si{z}">, XD,
+ EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem,
+ loadf64, "cvttsd2usi{z}">, XD,
+ EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem,
+ loadf64, "cvttsd2si{z}">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem,
+ loadf64, "cvttsd2usi{z}">, XD, VEX_W,
+ EVEX_CD8<64, CD8VT1>;
+//===----------------------------------------------------------------------===//
+// AVX-512 Convert from float to double and back
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in {
+def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
+ (ins FR32X:$src1, FR32X:$src2),
+ "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
+ (ins FR32X:$src1, f32mem:$src2),
+ "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+ EVEX_CD8<32, CD8VT1>;
+
+// Convert scalar double to scalar single
+def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
+ (ins FR64X:$src1, FR64X:$src2),
+ "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
+ (ins FR64X:$src1, f64mem:$src2),
+ "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ []>, EVEX_4V, VEX_LIG, VEX_W,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>,
+ Requires<[HasAVX512]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+ (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
+ Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
+ Requires<[HasAVX512]>;
+
+multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC,
+ RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
+ Domain d> {
+let neverHasSideEffects = 1 in {
+ def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+ let mayLoad = 1 in
+ def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
+} // neverHasSideEffects = 1
+}
+
+defm VCVTPD2PSZ : avx512_vcvt_fp<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
+ memopv8f64, f512mem, v8f32, v8f64,
+ SSEPackedSingle>, EVEX_V512, VEX_W, OpSize,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
+ memopv4f64, f256mem, v8f64, v8f32,
+ SSEPackedDouble>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+ (VCVTPS2PDZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Vector convert from signed integer to float/double
+//===----------------------------------------------------------------------===//
+
+defm VCVTDQ2PSZ : avx512_vcvt_fp<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
+ memopv8i64, i512mem, v16f32, v16i32,
+ SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
+ memopv4i64, i256mem, v8f64, v8i32,
+ SSEPackedDouble>, EVEX_V512, XS,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
+ memopv16f32, f512mem, v16i32, v16f32,
+ SSEPackedSingle>, EVEX_V512, XS,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
+ memopv8f64, f512mem, v8i32, v8f64,
+ SSEPackedDouble>, EVEX_V512, OpSize, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
+ memopv16f32, f512mem, v16i32, v16f32,
+ SSEPackedSingle>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
+ memopv8f64, f512mem, v8i32, v8f64,
+ SSEPackedDouble>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
+ memopv4i64, f256mem, v8f64, v8i32,
+ SSEPackedDouble>, EVEX_V512, XS,
+ EVEX_CD8<32, CD8VH>;
+
+defm VCVTUDQ2PSZ : avx512_vcvt_fp<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
+ memopv16i32, f512mem, v16f32, v16i32,
+ SSEPackedSingle>, EVEX_V512, XD,
+ EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+ (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+ (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+
+def : Pat<(int_x86_avx512_cvtdq2_ps_512 VR512:$src),
+ (VCVTDQ2PSZrr VR512:$src)>;
+def : Pat<(int_x86_avx512_cvtdq2_ps_512 (bitconvert (memopv8i64 addr:$src))),
+ (VCVTDQ2PSZrm addr:$src)>;
+
+def VCVTPS2DQZrr : AVX512BI<0x5B, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ "vcvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR512:$dst,
+ (int_x86_avx512_cvt_ps2dq_512 VR512:$src))],
+ IIC_SSE_CVT_PS_RR>, EVEX, EVEX_V512;
+def VCVTPS2DQZrm : AVX512BI<0x5B, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ "vcvtps2dq\t{$src, $dst|$dst, $src}",
+ [(set VR512:$dst,
+ (int_x86_avx512_cvt_ps2dq_512 (memopv16f32 addr:$src)))],
+ IIC_SSE_CVT_PS_RM>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
+ (VCVTPD2PSZrm addr:$src)>;
+ def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+ (VCVTPS2PDZrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_f16c_ph2ps<RegisterClass destRC, RegisterClass srcRC,
+ X86MemOperand x86memop, Intrinsic Int> {
+ def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set destRC:$dst, (Int srcRC:$src))]>, EVEX;
+ let neverHasSideEffects = 1, mayLoad = 1 in
+ def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
+}
+
+multiclass avx512_f16c_ps2ph<RegisterClass destRC, RegisterClass srcRC,
+ X86MemOperand x86memop, Intrinsic Int> {
+ def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
+ (ins srcRC:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set destRC:$dst, (Int srcRC:$src1, imm:$src2))]>, EVEX;
+ let neverHasSideEffects = 1, mayStore = 1 in
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+}
+
+defm VCVTPH2PSZ : avx512_f16c_ph2ps<VR512, VR256X, f256mem,
+ int_x86_avx512_vcvtph2ps_512>, EVEX_V512,
+ EVEX_CD8<32, CD8VH>;
+defm VCVTPS2PHZ : avx512_f16c_ps2ph<VR256X, VR512, f256mem,
+ int_x86_avx512_vcvtps2ph_512>, EVEX_V512,
+ EVEX_CD8<32, CD8VH>;
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+ defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
+ "ucomiss{z}">, TB, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+ "ucomisd{z}">, TB, OpSize, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ let Pattern = []<dag> in {
+ defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
+ "comiss{z}">, TB, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
+ "comisd{z}">, TB, OpSize, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+ defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+ load, "ucomiss">, TB, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+ load, "ucomisd">, TB, OpSize, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+ defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+ load, "comiss">, TB, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+ load, "comisd">, TB, OpSize, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+/// avx512_fp_unop_p - AVX-512 unops in packed form.
+multiclass avx512_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+ def PSZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))]>,
+ EVEX, EVEX_V512;
+ def PSZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
+ EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ def PDZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))]>,
+ EVEX, EVEX_V512, VEX_W;
+ def PDZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+/// avx512_fp_unop_p_int - AVX-512 intrinsics unops in packed forms.
+multiclass avx512_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+ Intrinsic V16F32Int, Intrinsic V8F64Int> {
+ def PSZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (V16F32Int VR512:$src))]>,
+ EVEX, EVEX_V512;
+ def PSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst,
+ (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ def PDZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (V8F64Int VR512:$src))]>,
+ EVEX, EVEX_V512, VEX_W;
+ def PDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst,
+ (V8F64Int (memopv8f64 addr:$src)))]>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+/// avx512_fp_unop_s - AVX-512 unops in scalar form.
+multiclass avx512_fp_unop_s<bits<8> opc, string OpcodeStr> {
+ let hasSideEffects = 0 in {
+ def SSZr : AVX5128I<opc, MRMSrcReg, (outs FR32X:$dst),
+ (ins FR32X:$src1, FR32X:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V;
+ let mayLoad = 1 in {
+ def SSZm : AVX5128I<opc, MRMSrcMem, (outs FR32X:$dst),
+ (ins FR32X:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ def SSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ }
+ def SDZr : AVX5128I<opc, MRMSrcReg, (outs FR64X:$dst),
+ (ins FR64X:$src1, FR64X:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ EVEX_4V, VEX_W;
+ let mayLoad = 1 in {
+ def SDZm : AVX5128I<opc, MRMSrcMem, (outs FR64X:$dst),
+ (ins FR64X:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+ def SDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+}
+}
+
+defm VRCP14 : avx512_fp_unop_s<0x4D, "vrcp14">,
+ avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>,
+ avx512_fp_unop_p_int<0x4C, "vrcp14",
+ int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>;
+
+defm VRSQRT14 : avx512_fp_unop_s<0x4F, "vrsqrt14">,
+ avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>,
+ avx512_fp_unop_p_int<0x4E, "vrsqrt14",
+ int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>;
+
+def : Pat<(int_x86_avx512_rsqrt14_ss VR128X:$src),
+ (COPY_TO_REGCLASS (VRSQRT14SSZr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32)),
+ VR128X)>;
+def : Pat<(int_x86_avx512_rsqrt14_ss sse_load_f32:$src),
+ (VRSQRT14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+def : Pat<(int_x86_avx512_rcp14_ss VR128X:$src),
+ (COPY_TO_REGCLASS (VRCP14SSZr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32)),
+ VR128X)>;
+def : Pat<(int_x86_avx512_rcp14_ss sse_load_f32:$src),
+ (VRCP14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+let AddedComplexity = 20, Predicates = [HasERI] in {
+defm VRCP28 : avx512_fp_unop_s<0xCB, "vrcp28">,
+ avx512_fp_unop_p<0xCA, "vrcp28", X86frcp>,
+ avx512_fp_unop_p_int<0xCA, "vrcp28",
+ int_x86_avx512_rcp28_ps_512, int_x86_avx512_rcp28_pd_512>;
+
+defm VRSQRT28 : avx512_fp_unop_s<0xCD, "vrsqrt28">,
+ avx512_fp_unop_p<0xCC, "vrsqrt28", X86frsqrt>,
+ avx512_fp_unop_p_int<0xCC, "vrsqrt28",
+ int_x86_avx512_rsqrt28_ps_512, int_x86_avx512_rsqrt28_pd_512>;
+}
+
+let Predicates = [HasERI] in {
+ def : Pat<(int_x86_avx512_rsqrt28_ss VR128X:$src),
+ (COPY_TO_REGCLASS (VRSQRT28SSZr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32)),
+ VR128X)>;
+ def : Pat<(int_x86_avx512_rsqrt28_ss sse_load_f32:$src),
+ (VRSQRT28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+ def : Pat<(int_x86_avx512_rcp28_ss VR128X:$src),
+ (COPY_TO_REGCLASS (VRCP28SSZr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32)),
+ VR128X)>;
+ def : Pat<(int_x86_avx512_rcp28_ss sse_load_f32:$src),
+ (VRCP28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ Intrinsic V16F32Int, Intrinsic V8F64Int,
+ OpndItins itins_s, OpndItins itins_d> {
+ def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>,
+ EVEX, EVEX_V512;
+
+ let mayLoad = 1 in
+ def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst,
+ (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))],
+ itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+ def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>,
+ EVEX, EVEX_V512;
+
+ let mayLoad = 1 in
+ def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (OpNode
+ (v8f64 (bitconvert (memopv16f32 addr:$src)))))],
+ itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+ def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (V16F32Int VR512:$src))]>,
+ EVEX, EVEX_V512;
+ def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst,
+ (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+ def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+ !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (V8F64Int VR512:$src))]>,
+ EVEX, EVEX_V512, VEX_W;
+ def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+ !strconcat(OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>,
+ EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
+ Intrinsic F32Int, Intrinsic F64Int,
+ OpndItins itins_s, OpndItins itins_d> {
+ def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
+ (ins FR32X:$src1, FR32X:$src2),
+ !strconcat(OpcodeStr,
+ "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], itins_s.rr>, XS, EVEX_4V;
+ def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ !strconcat(OpcodeStr,
+ "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128X:$dst,
+ (F32Int VR128X:$src1, VR128X:$src2))],
+ itins_s.rr>, XS, EVEX_4V;
+ let mayLoad = 1 in {
+ def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
+ (ins FR32X:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr,
+ "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr,
+ "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128X:$dst,
+ (F32Int VR128X:$src1, sse_load_f32:$src2))],
+ itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ }
+ def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
+ (ins FR64X:$src1, FR64X:$src2),
+ !strconcat(OpcodeStr,
+ "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ XD, EVEX_4V, VEX_W;
+ def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ !strconcat(OpcodeStr,
+ "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128X:$dst,
+ (F64Int VR128X:$src1, VR128X:$src2))],
+ itins_s.rr>, XD, EVEX_4V, VEX_W;
+ let mayLoad = 1 in {
+ def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
+ (ins FR64X:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+ XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+ def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr,
+ "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128X:$dst,
+ (F64Int VR128X:$src1, sse_load_f64:$src2))]>,
+ XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+ }
+}
+
+
+defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt",
+ int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd,
+ SSE_SQRTSS, SSE_SQRTSD>,
+ avx512_sqrt_packed<0x51, "vsqrt", fsqrt,
+ int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512,
+ SSE_SQRTPS, SSE_SQRTPD>;
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(f32 (fsqrt FR32X:$src)),
+ (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ def : Pat<(f32 (fsqrt (load addr:$src))),
+ (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[OptForSize]>;
+ def : Pat<(f64 (fsqrt FR64X:$src)),
+ (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
+ def : Pat<(f64 (fsqrt (load addr:$src))),
+ (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[OptForSize]>;
+
+ def : Pat<(f32 (X86frsqrt FR32X:$src)),
+ (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ def : Pat<(f32 (X86frsqrt (load addr:$src))),
+ (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[OptForSize]>;
+
+ def : Pat<(f32 (X86frcp FR32X:$src)),
+ (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+ def : Pat<(f32 (X86frcp (load addr:$src))),
+ (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[OptForSize]>;
+
+ def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
+ (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128X:$src, FR32)),
+ VR128X)>;
+ def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+ (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+ def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
+ (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS VR128X:$src, FR64)),
+ VR128X)>;
+ def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+ (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
+
+
+multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ PatFrag mem_frag32, PatFrag mem_frag64,
+ Intrinsic V4F32Int, Intrinsic V2F64Int,
+ CD8VForm VForm> {
+let ExeDomain = SSEPackedSingle in {
+  // Vector intrinsic operation, reg
+ def PSr : AVX512AIi8<opcps, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>;
+
+ // Vector intrinsic operation, mem
+ def PSm : AVX512AIi8<opcps, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+ EVEX_CD8<32, VForm>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+ // Vector intrinsic operation, reg
+ def PDr : AVX512AIi8<opcpd, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>;
+
+ // Vector intrinsic operation, mem
+ def PDm : AVX512AIi8<opcpd, MRMSrcMem,
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+ EVEX_CD8<64, VForm>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+ string OpcodeStr,
+ Intrinsic F32Int,
+ Intrinsic F64Int> {
+let ExeDomain = GenericDomain in {
+ // Operation, reg.
+ let hasSideEffects = 0 in
+ def SSr : AVX512AIi8<opcss, MRMSrcReg,
+ (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>;
+
+ // Intrinsic operation, reg.
+ def SSr_Int : AVX512AIi8<opcss, MRMSrcReg,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>;
+
+ // Intrinsic operation, mem.
+ def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst),
+ (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128X:$dst, (F32Int VR128X:$src1,
+ sse_load_f32:$src2, imm:$src3))]>,
+ EVEX_CD8<32, CD8VT1>;
+
+ // Operation, reg.
+ let hasSideEffects = 0 in
+ def SDr : AVX512AIi8<opcsd, MRMSrcReg,
+ (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, VEX_W;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg,
+ (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+ VEX_W;
+
+ // Intrinsic operation, mem.
+ def SDm : AVX512AIi8<opcsd, MRMSrcMem,
+ (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128X:$dst,
+ (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ VEX_W, EVEX_CD8<64, CD8VT1>;
+} // ExeDomain = GenericDomain
+}
+
+let Predicates = [HasAVX512] in {
+ defm VRNDSCALE : avx512_fp_binop_rm<0x0A, 0x0B, "vrndscale",
+ int_x86_avx512_rndscale_ss,
+ int_x86_avx512_rndscale_sd>, EVEX_4V;
+
+ defm VRNDSCALEZ : avx512_fp_unop_rm<0x08, 0x09, "vrndscale", f256mem, VR512,
+ memopv16f32, memopv8f64,
+ int_x86_avx512_rndscale_ps_512,
+ int_x86_avx512_rndscale_pd_512, CD8VF>,
+ EVEX, EVEX_V512;
+}
+
+def : Pat<(ffloor FR32X:$src),
+ (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>;
+def : Pat<(f64 (ffloor FR64X:$src)),
+ (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>;
+def : Pat<(f32 (fnearbyint FR32X:$src)),
+ (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>;
+def : Pat<(f64 (fnearbyint FR64X:$src)),
+ (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>;
+def : Pat<(f32 (fceil FR32X:$src)),
+ (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>;
+def : Pat<(f64 (fceil FR64X:$src)),
+ (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>;
+def : Pat<(f32 (frint FR32X:$src)),
+ (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>;
+def : Pat<(f64 (frint FR64X:$src)),
+ (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>;
+def : Pat<(f32 (ftrunc FR32X:$src)),
+ (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>;
+def : Pat<(f64 (ftrunc FR64X:$src)),
+ (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
+
+def : Pat<(v16f32 (ffloor VR512:$src)),
+ (VRNDSCALEZPSr VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+ (VRNDSCALEZPSr VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+ (VRNDSCALEZPSr VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+ (VRNDSCALEZPSr VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+ (VRNDSCALEZPSr VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+ (VRNDSCALEZPDr VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+ (VRNDSCALEZPDr VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+ (VRNDSCALEZPDr VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+ (VRNDSCALEZPDr VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+ (VRNDSCALEZPDr VR512:$src, (i32 0x3))>;
+
+//===----------------------------------------------------------------------===//
+// Integer truncate and extend operations
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
+ RegisterClass dstRC, RegisterClass srcRC,
+ RegisterClass KRC, X86MemOperand x86memop> {
+ def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+ (ins srcRC:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+
+ def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+ (ins KRC:$mask, srcRC:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ []>, EVEX, EVEX_KZ;
+
+ def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>, EVEX;
+}
+defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM,
+ i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM,
+ i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM,
+ i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM,
+ i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM,
+ i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM,
+ i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM,
+ i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM,
+ i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM,
+ i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM,
+ i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM,
+ i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM,
+ i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM,
+ i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM,
+ i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM,
+ i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+
+def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>;
+def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>;
+def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>;
+def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>;
+def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>;
+
+def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+ (VPMOVDBkrr VK16WM:$mask, VR512:$src)>;
+def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+ (VPMOVDWkrr VK16WM:$mask, VR512:$src)>;
+def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))),
+ (VPMOVQWkrr VK8WM:$mask, VR512:$src)>;
+def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))),
+ (VPMOVQDkrr VK8WM:$mask, VR512:$src)>;
+
+
+multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC,
+ RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT, ValueType InVT> {
+
+ def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+ (ins SrcRC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins x86memop:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst,
+ (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
+ EVEX;
+}
+
+defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext,
+ memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+ EVEX_CD8<8, CD8VQ>;
+defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext,
+ memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+ EVEX_CD8<8, CD8VO>;
+defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext,
+ memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+ EVEX_CD8<16, CD8VH>;
+defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext,
+ memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+ EVEX_CD8<16, CD8VQ>;
+defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext,
+ memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VH>;
+
+defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext,
+ memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+ EVEX_CD8<8, CD8VQ>;
+defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext,
+ memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+ EVEX_CD8<8, CD8VO>;
+defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext,
+ memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+ EVEX_CD8<16, CD8VH>;
+defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext,
+ memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+ EVEX_CD8<16, CD8VQ>;
+defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext,
+ memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+ EVEX_CD8<32, CD8VH>;
+
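For readers new to TableGen, each defm above stamps the multiclass body into concrete records, with the trailing EVEX_V512 / EVEX_CD8 flags appended. A rough sketch of what the first zero-extend instantiation produces (illustrative only, not part of the patch):

// Sketch: approximate expansion of "defm VPMOVZXBDZ" through avx512_extend.
def VPMOVZXBDZrr : AVX5128I<0x31, MRMSrcReg, (outs VR512:$dst), (ins VR128X:$src),
                   "vpmovzxbd\t{$src, $dst|$dst, $src}",
                   [(set VR512:$dst, (v16i32 (X86vzext (v16i8 VR128X:$src))))]>,
                   EVEX, EVEX_V512, EVEX_CD8<8, CD8VQ>;
// The rm form is generated the same way, with memopv2i64 / i128mem feeding the
// bitconverted load in place of the register source.
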
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ RegisterClass RC, X86MemOperand memop> {
+let mayLoad = 1,
+ Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
+ (ins RC:$src1, KRC:$mask, memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ []>, EVEX, EVEX_K;
+}
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512, vz64mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X, vz64mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
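The gather records inherit their side conditions from the let block in avx512_gather: the destination is tied to $src1 and the mask operand is both consumed and written back as $mask_wb. Roughly what one instantiation expands to, shown here as a sketch for orientation only:

// Sketch: approximate expansion of "defm VGATHERDPSZ : avx512_gather<0x92, ...>".
let mayLoad = 1,
    Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
def VGATHERDPSZrm : AVX5128I<0x92, MRMSrcMem,
                    (outs VR512:$dst, VK16WM:$mask_wb),
                    (ins VR512:$src1, VK16WM:$mask, vz32mem:$src2),
                    "vgatherdps\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}",
                    []>, EVEX, EVEX_K, EVEX_V512, EVEX_CD8<32, CD8VT1>;
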
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+ RegisterClass RC, X86MemOperand memop> {
+let mayStore = 1, Constraints = "$mask = $mask_wb" in
+ def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
+ (ins memop:$dst, KRC:$mask, RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ []>, EVEX, EVEX_K;
+}
+
+defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+ ValueType vt, string OpcodeStr, PatFrag mem_frag,
+ Domain d> {
+ def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+ EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+ EVEX_4V, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+ SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+ SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v16i32 (X86Shufp VR512:$src1,
+ (memopv16i32 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v8i64 (X86Shufp VR512:$src1,
+ (memopv8i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop> {
+ def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, EVEX_4V;
+ let mayLoad = 1 in
+ def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
+multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ EVEX;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst),
+ (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+ EVEX;
+}
+
+defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W,
+ EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_conflict<bits<8> opc, string OpcodeStr,
+ RegisterClass RC, RegisterClass KRC, PatFrag memop_frag,
+ X86MemOperand x86memop, PatFrag scalar_mfrag,
+ X86MemOperand x86scalar_mop, string BrdcstStr,
+ Intrinsic Int, Intrinsic maskInt, Intrinsic maskzInt> {
+ def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
+ [(set RC:$dst, (Int RC:$src))]>, EVEX;
+ def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
+ [(set RC:$dst, (Int (memop_frag addr:$src)))]>, EVEX;
+ def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86scalar_mop:$src),
+ !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+ ", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
+ []>, EVEX, EVEX_B;
+ def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins KRC:$mask, RC:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ [(set RC:$dst, (maskzInt KRC:$mask, RC:$src))]>, EVEX, EVEX_KZ;
+ def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, x86memop:$src),
+ !strconcat(OpcodeStr,
+ "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+ [(set RC:$dst, (maskzInt KRC:$mask, (memop_frag addr:$src)))]>,
+ EVEX, EVEX_KZ;
+ def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins KRC:$mask, x86scalar_mop:$src),
+ !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+ ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
+ BrdcstStr, "}"),
+ []>, EVEX, EVEX_KZ, EVEX_B;
+
+ let Constraints = "$src1 = $dst" in {
+ def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, RC:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, RC:$src2))]>, EVEX, EVEX_K;
+ def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, x86memop:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+ [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, (memop_frag addr:$src2)))]>, EVEX, EVEX_K;
+ def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
+ !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+ ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
+ []>, EVEX, EVEX_K, EVEX_B;
+ }
+}
+
+let Predicates = [HasCDI] in {
+defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM,
+ memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+ int_x86_avx512_conflict_d_512,
+ int_x86_avx512_conflict_d_mask_512,
+ int_x86_avx512_conflict_d_maskz_512>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM,
+ memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+ int_x86_avx512_conflict_q_512,
+ int_x86_avx512_conflict_q_mask_512,
+ int_x86_avx512_conflict_q_maskz_512>,
+ EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 9ce02ba..7fc9c44 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -497,6 +497,21 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
Requires<[In64BitMode]>;
} // isConvertibleToThreeAddress = 1, CodeSize = 2
+let isCodeGenOnly = 1, CodeSize = 2 in {
+def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+ "inc{w}\t$dst", [], IIC_UNARY_REG>,
+ OpSize, Requires<[In32BitMode]>;
+def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+ "inc{l}\t$dst", [], IIC_UNARY_REG>,
+ Requires<[In32BitMode]>;
+def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+ "dec{w}\t$dst", [], IIC_UNARY_REG>,
+ OpSize, Requires<[In32BitMode]>;
+def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+ "dec{l}\t$dst", [], IIC_UNARY_REG>,
+ Requires<[In32BitMode]>;
+} // isCodeGenOnly = 1, CodeSize = 2
+
} // Constraints = "$src1 = $dst", SchedRW
let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
@@ -578,7 +593,6 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
} // CodeSize = 2, SchedRW
} // Defs = [EFLAGS]
-
/// X86TypeInfo - This is a bunch of information that describes relevant X86
/// information about value types. For example, it can tell you what
/// register class and preferred load to use.
@@ -726,20 +740,25 @@ class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
- EFLAGS))], IIC_BIN_NONMEM>;
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
-class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+ InstrItinClass itin = IIC_BIN_NONMEM>
: ITy<opcode, MRMSrcReg, typeinfo,
(outs typeinfo.RegClass:$dst),
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
- mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+ mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
Sched<[WriteALU]> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
let hasSideEffects = 0;
}
+// BinOpRR_RFF_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
: ITy<opcode, MRMSrcReg, typeinfo, (outs),
@@ -753,10 +772,11 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
// BinOpRM - Instructions like "add reg, reg, [mem]".
class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, list<dag> pattern>
+ dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
: ITy<opcode, MRMSrcMem, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
Sched<[WriteALULd, ReadAfterLd]>;
// BinOpRM_R - Instructions like "add reg, reg, [mem]".
@@ -786,14 +806,15 @@ class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
- EFLAGS))]>;
+ EFLAGS))], IIC_BIN_CARRY_MEM>;
// BinOpRI - Instructions like "add reg, reg, imm".
class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, list<dag> pattern>
+ Format f, dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
: ITy<opcode, f, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
Sched<[WriteALU]> {
let ImmT = typeinfo.ImmEncoding;
}
@@ -824,14 +845,15 @@ class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
- EFLAGS))]>;
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
// BinOpRI8 - Instructions like "add reg, reg, imm8".
class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Format f, dag outlist, list<dag> pattern>
+ Format f, dag outlist, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_NONMEM>
: ITy<opcode, f, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
- mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+ mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
Sched<[WriteALU]> {
let ImmT = Imm8; // Always 8-bit immediate.
}
@@ -863,14 +885,14 @@ class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
[(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
- EFLAGS))]>;
+ EFLAGS))], IIC_BIN_CARRY_NONMEM>;
// BinOpMR - Instructions like "add [mem], reg".
class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- list<dag> pattern>
+ list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
: ITy<opcode, MRMDestMem, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
Sched<[WriteALULd, WriteRMW]>;
// BinOpMR_RMW - Instructions like "add [mem], reg".
@@ -886,7 +908,7 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpMR<opcode, mnemonic, typeinfo,
[(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
// BinOpMR_F - Instructions like "cmp [mem], reg".
class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -896,10 +918,11 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// BinOpMI - Instructions like "add [mem], imm".
class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern, bits<8> opcode = 0x80>
+ Format f, list<dag> pattern, bits<8> opcode = 0x80,
+ InstrItinClass itin = IIC_BIN_MEM>
: ITy<opcode, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
Sched<[WriteALULd, WriteRMW]> {
let ImmT = typeinfo.ImmEncoding;
}
@@ -917,7 +940,7 @@ class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
: BinOpMI<mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>;
// BinOpMI_F - Instructions like "cmp [mem], imm".
class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
@@ -929,10 +952,11 @@ class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
// BinOpMI8 - Instructions like "add [mem], imm8".
class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
- Format f, list<dag> pattern>
+ Format f, list<dag> pattern,
+ InstrItinClass itin = IIC_BIN_MEM>
: ITy<0x82, f, typeinfo,
(outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
- mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+ mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
Sched<[WriteALULd, WriteRMW]> {
let ImmT = Imm8; // Always 8-bit immediate.
}
@@ -951,7 +975,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
: BinOpMI8<mnemonic, typeinfo, f,
[(store (opnode (load addr:$dst),
typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
- (implicit EFLAGS)]>;
+ (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
// BinOpMI8_F - Instructions like "cmp [mem], imm8".
class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
@@ -962,10 +986,11 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg, string operands>
+ Register areg, string operands,
+ InstrItinClass itin = IIC_BIN_NONMEM>
: ITy<opcode, RawFrm, typeinfo,
(outs), (ins typeinfo.ImmOperand:$src),
- mnemonic, operands, []>, Sched<[WriteALU]> {
+ mnemonic, operands, [], itin>, Sched<[WriteALU]> {
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
let Defs = [areg, EFLAGS];
@@ -976,7 +1001,8 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// and use EFLAGS.
class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
Register areg, string operands>
- : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> {
+ : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+ IIC_BIN_CARRY_NONMEM> {
let Uses = [areg, EFLAGS];
}
@@ -1070,10 +1096,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
} // isCommutable
- def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 8969946..7d10b67 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -884,6 +884,24 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
[(set VR256:$dst,
(v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
EFLAGS)))]>;
+ def CMOV_V8I64 : I<0, Pseudo,
+ (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+ "#CMOV_V8I64 PSEUDO!",
+ [(set VR512:$dst,
+ (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V8F64 : I<0, Pseudo,
+ (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+ "#CMOV_V8F64 PSEUDO!",
+ [(set VR512:$dst,
+ (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V16F32 : I<0, Pseudo,
+ (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+ "#CMOV_V16F32 PSEUDO!",
+ [(set VR512:$dst,
+ (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+ EFLAGS)))]>;
}
@@ -917,8 +935,6 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
(MOV32mi addr:$dst, tblockaddress:$src)>;
-
-
// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
// code model mode, should use 'movabs'. FIXME: This is really a hack, the
// 'movabs' predicate should handle this sort of thing.
@@ -966,14 +982,12 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
(MOV64mi32 addr:$dst, tblockaddress:$src)>,
Requires<[NearData, IsStatic]>;
-
-
// Calls
// tls has some funny stuff here...
// This corresponds to movabs $foo@tpoff, %rax
def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
- (MOV64ri tglobaltlsaddr :$dst)>;
+ (MOV64ri32 tglobaltlsaddr :$dst)>;
// This corresponds to add $foo@tpoff, %rax
def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
(ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 0e69651..e4ccc06 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -49,10 +49,12 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
"jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+ let hasSideEffects = 0 in
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
"jmp\t$dst", [], IIC_JMP_REL>;
// FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguous
// with JMP_1.
+ let hasSideEffects = 0 in
def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
"jmpq\t$dst", [], IIC_JMP_REL>;
}
@@ -60,6 +62,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
// Conditional Branches.
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+ let hasSideEffects = 0 in
def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
IIC_Jcc>;
def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
@@ -85,7 +88,7 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
// jcx/jecx/jrcx instructions.
-let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
// These are the 32-bit versions of this instruction for the asmparser. In
// 32-bit mode, the address size prefix is jcxz and the unprefixed version is
// jecxz.
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 28954c6..4090550 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -14,26 +14,26 @@
let neverHasSideEffects = 1 in {
let Defs = [AX], Uses = [AL] in
def CBW : I<0x98, RawFrm, (outs), (ins),
- "{cbtw|cbw}", []>, OpSize; // AX = signext(AL)
+ "{cbtw|cbw}", [], IIC_CBW>, OpSize; // AX = signext(AL)
let Defs = [EAX], Uses = [AX] in
def CWDE : I<0x98, RawFrm, (outs), (ins),
- "{cwtl|cwde}", []>; // EAX = signext(AX)
+ "{cwtl|cwde}", [], IIC_CBW>; // EAX = signext(AX)
let Defs = [AX,DX], Uses = [AX] in
def CWD : I<0x99, RawFrm, (outs), (ins),
- "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+ "{cwtd|cwd}", [], IIC_CBW>, OpSize; // DX:AX = signext(AX)
let Defs = [EAX,EDX], Uses = [EAX] in
def CDQ : I<0x99, RawFrm, (outs), (ins),
- "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+ "{cltd|cdq}", [], IIC_CBW>; // EDX:EAX = signext(EAX)
let Defs = [RAX], Uses = [EAX] in
def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", []>; // RAX = signext(EAX)
+ "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX)
let Defs = [RAX,RDX], Uses = [RAX] in
def CQO : RI<0x99, RawFrm, (outs), (ins),
- "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+ "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
}
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 7759a8a..69cd5a5 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -74,43 +74,43 @@ let neverHasSideEffects = 1 in {
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
- memopv8f32, X86Fmadd, v4f32, v8f32>;
- defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
- memopv8f32, X86Fmsub, v4f32, v8f32>;
+ defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
+ loadv8f32, X86Fmadd, v4f32, v8f32>;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
+ loadv8f32, X86Fmsub, v4f32, v8f32>;
defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
- memopv4f32, memopv8f32, X86Fmaddsub,
+ loadv4f32, loadv8f32, X86Fmaddsub,
v4f32, v8f32>;
defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
- memopv4f32, memopv8f32, X86Fmsubadd,
+ loadv4f32, loadv8f32, X86Fmsubadd,
v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
- memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
- defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
- memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+ defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64,
+ loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+ defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64,
+ loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
- memopv2f64, memopv4f64, X86Fmaddsub,
+ loadv2f64, loadv4f64, X86Fmaddsub,
v2f64, v4f64>, VEX_W;
defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
- memopv2f64, memopv4f64, X86Fmsubadd,
+ loadv2f64, loadv4f64, X86Fmsubadd,
v2f64, v4f64>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
- defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32,
- memopv8f32, X86Fnmadd, v4f32, v8f32>;
- defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32,
- memopv8f32, X86Fnmsub, v4f32, v8f32>;
+ defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", loadv4f32,
+ loadv8f32, X86Fnmadd, v4f32, v8f32>;
+ defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", loadv4f32,
+ loadv8f32, X86Fnmsub, v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
- defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
- memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+ defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64,
+ loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
- memopv2f64, memopv4f64, X86Fnmsub, v2f64,
+ loadv2f64, loadv4f64, X86Fnmsub, v2f64,
v4f64>, VEX_W;
}
@@ -206,25 +206,26 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2,
- (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+ (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
// For disassembler
let isCodeGenOnly = 1, hasSideEffects = 0 in
def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_LIG;
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
@@ -235,19 +236,19 @@ multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4;
def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
- mem_cpat:$src3))]>, VEX_W, MemOp4;
+ mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4;
def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
}
multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -338,31 +339,31 @@ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
let ExeDomain = SSEPackedSingle in {
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
- memopv4f32, memopv8f32>;
+ loadv4f32, loadv8f32>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
- memopv4f32, memopv8f32>;
+ loadv4f32, loadv8f32>;
defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
- memopv4f32, memopv8f32>;
+ loadv4f32, loadv8f32>;
defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
- memopv4f32, memopv8f32>;
+ loadv4f32, loadv8f32>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
- memopv4f32, memopv8f32>;
+ loadv4f32, loadv8f32>;
defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
- memopv4f32, memopv8f32>;
+ loadv4f32, loadv8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
- memopv2f64, memopv4f64>;
+ loadv2f64, loadv4f64>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
- memopv2f64, memopv4f64>;
+ loadv2f64, loadv4f64>;
defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
- memopv2f64, memopv4f64>;
+ loadv2f64, loadv4f64>;
defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
- memopv2f64, memopv4f64>;
+ loadv2f64, loadv4f64>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
- memopv2f64, memopv4f64>;
+ loadv2f64, loadv4f64>;
defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
- memopv2f64, memopv4f64>;
+ loadv2f64, loadv4f64>;
}
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 64018b3..0fd9011 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -139,6 +139,7 @@ class T8XS { bits<5> Prefix = 18; }
class TAXD { bits<5> Prefix = 19; }
class XOP8 { bits<5> Prefix = 20; }
class XOP9 { bits<5> Prefix = 21; }
+class XOPA { bits<5> Prefix = 22; }
class VEX { bit hasVEXPrefix = 1; }
class VEX_W { bit hasVEX_WPrefix = 1; }
class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
@@ -339,10 +340,11 @@ def __xd : XD;
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin> {
- let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+ let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+ !if(hasVEXPrefix /* VEX */, [UseAVX],
!if(!eq(Prefix, __xs.Prefix), [UseSSE1],
!if(!eq(Prefix, __xd.Prefix), [UseSSE2],
- !if(hasOpSizePrefix, [UseSSE2], [UseSSE1]))));
+ !if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -352,8 +354,9 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin> {
- let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+ let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+ !if(hasVEXPrefix /* VEX */, [UseAVX],
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -363,8 +366,9 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin, Domain d>
: I<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+ let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+ !if(hasVEXPrefix /* VEX */, [HasAVX],
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -381,11 +385,12 @@ class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> patter
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin, Domain d>
: Ii8<o, F, outs, ins, asm, pattern, itin, d> {
- let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+ let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+ !if(hasVEXPrefix /* VEX */, [HasAVX],
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
// AVX instructions have a 'v' prefix in the mnemonic
- let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
+ let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
}
// SSE1 Instruction Templates:
@@ -460,7 +465,7 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
- Requires<[HasAVX]>;
+ Requires<[UseAVX]>;
class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -472,7 +477,7 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
- OpSize, Requires<[HasAVX]>;
+ OpSize, Requires<[UseAVX]>;
class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB,
@@ -641,7 +646,7 @@ class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[HasAVX512]>;
class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>,
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB,
Requires<[HasAVX512]>;
class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -653,10 +658,10 @@ class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
Requires<[HasAVX512]>;
class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
- : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
+ : I<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8,
@@ -667,7 +672,7 @@ class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
// AES8I
// These use the same encoding as the SSE4.2 T8 and TA encodings.
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
+ list<dag>pattern, InstrItinClass itin = IIC_AES>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
Requires<[HasAES]>;
@@ -767,6 +772,7 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
//
// MMXI - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
// MMX2I - MMX / SSE2 instructions with TB and OpSize prefixes.
// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
@@ -776,6 +782,9 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In32BitMode]>;
class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
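The nested !if chains added to SI, SIi8, PI and PIi8 simply pick an availability predicate from the encoding prefix. Paraphrasing the PI hunk above (no new behaviour, just the resolution order spelled out):

// EVEX-encoded          -> Predicates = [HasAVX512]
// VEX-encoded           -> Predicates = [HasAVX]   (UseAVX for SI/SIi8)
// OpSize (0x66) prefix  -> Predicates = [UseSSE2]
// otherwise             -> Predicates = [UseSSE1]
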
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 0b51521..1fed424 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -105,6 +105,13 @@ def X86vsext : SDNode<"X86ISD::VSEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>]>>;
+def X86vtrunc : SDNode<"X86ISD::VTRUNC",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>]>>;
+def X86vtruncm : SDNode<"X86ISD::VTRUNCM",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisVec<2>, SDTCisInt<2>]>>;
def X86vfpext : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisFP<1>]>>;
@@ -118,6 +125,15 @@ def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+def X86IntCmpMask : SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+def X86pcmpeqm : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
+
def X86vshl : SDNode<"X86ISD::VSHL",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVec<2>]>>;
@@ -140,6 +156,9 @@ def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+ SDTCisVec<1>,
+ SDTCisSameAs<2, 1>]>>;
def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
@@ -151,6 +170,8 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>;
+def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>]>;
@@ -194,11 +215,14 @@ def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>;
def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>;
+def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>;
def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
@@ -236,13 +260,13 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
def ssmem : Operand<v4f32> {
let PrintMethod = "printf32mem";
let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
- let ParserMatchClass = X86MemAsmOperand;
+ let ParserMatchClass = X86Mem32AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
def sdmem : Operand<v2f64> {
let PrintMethod = "printf64mem";
let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
- let ParserMatchClass = X86MemAsmOperand;
+ let ParserMatchClass = X86Mem64AsmOperand;
let OperandType = "OPERAND_MEMORY";
}
@@ -262,9 +286,16 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
-// 128-/256-bit extload pattern fragments
+// 512-bit load pattern fragments
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -278,6 +309,12 @@ def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
return cast<StoreSDNode>(N)->getAlignment() >= 32;
}]>;
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
// Like 'load', but always requires 128-bit vector alignment.
def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 16;
@@ -293,6 +330,11 @@ def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 32;
}]>;
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
def alignedloadfsf32 : PatFrag<(ops node:$ptr),
(f32 (alignedload node:$ptr))>;
def alignedloadfsf64 : PatFrag<(ops node:$ptr),
@@ -316,6 +358,16 @@ def alignedloadv4f64 : PatFrag<(ops node:$ptr),
def alignedloadv4i64 : PatFrag<(ops node:$ptr),
(v4i64 (alignedload256 node:$ptr))>;
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+ (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+ (v16i32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64 : PatFrag<(ops node:$ptr),
+ (v8f64 (alignedload512 node:$ptr))>;
+def alignedloadv8i64 : PatFrag<(ops node:$ptr),
+ (v8i64 (alignedload512 node:$ptr))>;
+
// Like 'load', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
// be naturally aligned on some targets but not on others. If the subtarget
@@ -327,6 +379,16 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
|| cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
+def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasVectorUAMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return Subtarget->hasVectorUAMem()
+ || cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
@@ -342,6 +404,12 @@ def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
+// 512-bit memop pattern fragments
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
+def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
+def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>;
+
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
@@ -391,6 +459,10 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+// 512-bit bitconvert pattern fragments
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+
def vzmovl_v2i64 : PatFrag<(ops node:$src),
(bitconvert (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
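The new 512-bit fragments compose with selection patterns in the same way as the existing 128-/256-bit ones. A hypothetical pattern using them might look like the following (for illustration only; the real AVX-512 load patterns live in X86InstrAVX512.td):

// Sketch: an aligned v16i32 load selects the 512-bit aligned integer move.
def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>;
// A merely 4-byte-aligned load would instead match memopv16i32 and pick the
// unaligned form, VMOVDQU32rm.
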
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0443a93..2461773 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -35,7 +36,7 @@
#include "llvm/Target/TargetOptions.h"
#include <limits>
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"
using namespace llvm;
@@ -81,6 +82,7 @@ enum {
TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
};
@@ -90,6 +92,9 @@ struct X86OpTblEntry {
uint16_t Flags;
};
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
: X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::ADJCALLSTACKDOWN64
@@ -298,8 +303,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
{ X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
{ X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
- { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
- { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
{ X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
{ X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
@@ -356,8 +359,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
// AVX 128-bit versions of foldable instructions
{ X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
- { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
- { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -374,7 +375,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
- { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
+ // AVX-512 foldable instructions
+ { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }
};
for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
@@ -400,8 +403,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
{ X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
- { X86::FsMOVAPDrr, X86::MOVSDrm, TB_NO_REVERSE },
- { X86::FsMOVAPSrr, X86::MOVSSrm, TB_NO_REVERSE },
{ X86::IMUL16rri, X86::IMUL16rmi, 0 },
{ X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
{ X86::IMUL32rri, X86::IMUL32rmi, 0 },
@@ -444,7 +445,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
{ X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
{ X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
@@ -493,8 +493,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
{ X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
{ X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
- { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE },
- { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
@@ -507,7 +505,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
- { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 },
{ X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
{ X86::VPABSBrr128, X86::VPABSBrm128, 0 },
@@ -552,11 +549,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- // BMI/BMI2/LZCNT/POPCNT foldable instructions
+ // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
{ X86::BEXTR32rr, X86::BEXTR32rm, 0 },
{ X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
+ { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
+ { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
+ { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
+ { X86::BLCI32rr, X86::BLCI32rm, 0 },
+ { X86::BLCI64rr, X86::BLCI64rm, 0 },
+ { X86::BLCIC32rr, X86::BLCIC32rm, 0 },
+ { X86::BLCIC64rr, X86::BLCIC64rm, 0 },
+ { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
+ { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
+ { X86::BLCS32rr, X86::BLCS32rm, 0 },
+ { X86::BLCS64rr, X86::BLCS64rm, 0 },
+ { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
+ { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
{ X86::BLSI32rr, X86::BLSI32rm, 0 },
{ X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSIC32rr, X86::BLSIC32rm, 0 },
+ { X86::BLSIC64rr, X86::BLSIC64rm, 0 },
{ X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
{ X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
{ X86::BLSR32rr, X86::BLSR32rm, 0 },
@@ -577,9 +590,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::SHRX64rr, X86::SHRX64rm, 0 },
{ X86::SHLX32rr, X86::SHLX32rm, 0 },
{ X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
+ { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
{ X86::TZCNT16rr, X86::TZCNT16rm, 0 },
{ X86::TZCNT32rr, X86::TZCNT32rm, 0 },
{ X86::TZCNT64rr, X86::TZCNT64rm, 0 },
+ { X86::TZMSK32rr, X86::TZMSK32rm, 0 },
+ { X86::TZMSK64rr, X86::TZMSK64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
+ { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
+ { X86::VMOVDQA32rr, X86::VMOVDQA32rm, TB_ALIGN_64 },
+ { X86::VMOVDQA64rr, X86::VMOVDQA64rm, TB_ALIGN_64 },
+ { X86::VMOVDQU32rr, X86::VMOVDQU32rm, 0 },
+ { X86::VMOVDQU64rr, X86::VMOVDQU64rm, 0 },
+
+ // AES foldable instructions
+ { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
+ { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
+ { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 },
+ { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -1177,6 +1208,52 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PDEP64rr, X86::PDEP64rm, 0 },
{ X86::PEXT32rr, X86::PEXT32rm, 0 },
{ X86::PEXT64rr, X86::PEXT64rm, 0 },
+
+ // AVX-512 foldable instructions
+ { X86::VPADDDZrr, X86::VPADDDZrm, 0 },
+ { X86::VPADDQZrr, X86::VPADDQZrm, 0 },
+ { X86::VADDPSZrr, X86::VADDPSZrm, 0 },
+ { X86::VADDPDZrr, X86::VADDPDZrm, 0 },
+ { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
+ { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
+ { X86::VMULPSZrr, X86::VMULPSZrm, 0 },
+ { X86::VMULPDZrr, X86::VMULPDZrm, 0 },
+ { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
+ { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
+ { X86::VMINPSZrr, X86::VMINPSZrm, 0 },
+ { X86::VMINPDZrr, X86::VMINPDZrm, 0 },
+ { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
+ { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
+ { X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
+ { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
+ { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
+ { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
+ { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
+ { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
+ { X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
+ { X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
+
+ // AES foldable instructions
+ { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
+ { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
+ { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
+ { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
+ { X86::VAESDECLASTrr, X86::VAESDECLASTrm, TB_ALIGN_16 },
+ { X86::VAESDECrr, X86::VAESDECrm, TB_ALIGN_16 },
+ { X86::VAESENCLASTrr, X86::VAESENCLASTrm, TB_ALIGN_16 },
+ { X86::VAESENCrr, X86::VAESENCrm, TB_ALIGN_16 },
+
+ // SHA foldable instructions
+ { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
+ { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
+ { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
+ { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
+ { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
+ { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
+ { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1338,6 +1415,11 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
{ X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
+ // AVX-512 VPERMI instructions with 3 source operands.
+ { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
+ { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
+ { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
+ { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1454,6 +1536,8 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
+ case X86::VMOVDQA32rm:
+ case X86::VMOVDQA64rm:
return true;
}
}
@@ -2890,23 +2974,29 @@ static bool isHReg(unsigned Reg) {
// Try and copy between VR128/VR64 and GR64 registers.
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
- bool HasAVX) {
+ const X86Subtarget& Subtarget) {
+
+
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
// SrcReg(GR64) -> DestReg(VR128)
// SrcReg(GR64) -> DestReg(VR64)
+ bool HasAVX = Subtarget.hasAVX();
+ bool HasAVX512 = Subtarget.hasAVX512();
if (X86::GR64RegClass.contains(DestReg)) {
- if (X86::VR128RegClass.contains(SrcReg))
+ if (X86::VR128XRegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
- return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
+ return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
+ X86::MOVPQIto64rr);
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
return X86::MOVSDto64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
- if (X86::VR128RegClass.contains(DestReg))
- return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
+ if (X86::VR128XRegClass.contains(DestReg))
+ return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
+ X86::MOV64toPQIrr);
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
return X86::MOV64toSDrr;
@@ -2915,14 +3005,30 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// SrcReg(FR32) -> DestReg(GR32)
// SrcReg(GR32) -> DestReg(FR32)
- if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+ if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
// Copy from a FR32 register to a GR32 register.
- return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+ return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
- if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
// Copy from a GR32 register to a FR32 register.
- return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+ return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
+ return 0;
+}
+static
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+ if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
+ X86::VR256XRegClass.contains(DestReg, SrcReg) ||
+ X86::VR512RegClass.contains(DestReg, SrcReg)) {
+ DestReg = get512BitSuperRegister(DestReg);
+ SrcReg = get512BitSuperRegister(SrcReg);
+ return X86::VMOVAPSZrr;
+ }
+ if ((X86::VK8RegClass.contains(DestReg) ||
+ X86::VK16RegClass.contains(DestReg)) &&
+ (X86::VK8RegClass.contains(SrcReg) ||
+ X86::VK16RegClass.contains(SrcReg)))
+ return X86::KMOVWkk;
return 0;
}
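A hedged note on the helper above (not part of the patch): VR128X/VR256X include the new xmm16-xmm31/ymm16-ymm31 registers, which the VEX-encoded VMOVAPSrr/VMOVAPSYrr copies cannot encode, so (presumably until EVEX-encoded 128/256-bit moves are defined) the copy is widened to the containing ZMM pair via get512BitSuperRegister and emitted as one EVEX vmovaps, while mask-register copies go through kmovw. For example:

  %XMM17 = COPY %XMM1   ->  vmovaps %zmm1, %zmm17
  %K2    = COPY %K1     ->  kmovw   %k1, %k2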
@@ -2932,7 +3038,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- unsigned Opc;
+ bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+ unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
else if (X86::GR32RegClass.contains(DestReg, SrcReg))
@@ -2950,14 +3057,17 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
"8-bit H register can not be copied outside GR8_NOREX");
} else
Opc = X86::MOV8rr;
- } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+ }
+ else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+ Opc = X86::MMX_MOVQ64rr;
+ else if (HasAVX512)
+ Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+ else if (X86::VR128RegClass.contains(DestReg, SrcReg))
Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSYrr;
- else if (X86::VR64RegClass.contains(DestReg, SrcReg))
- Opc = X86::MMX_MOVQ64rr;
- else
- Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+ if (!Opc)
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>());
if (Opc) {
BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -3005,6 +3115,18 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
bool isStackAligned,
const TargetMachine &TM,
bool load) {
+ if (TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+ if (X86::VK8RegClass.hasSubClassEq(RC) ||
+ X86::VK16RegClass.hasSubClassEq(RC))
+ return load ? X86::KMOVWkm : X86::KMOVWmk;
+ if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+ if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
+ if (X86::VR512RegClass.hasSubClassEq(RC))
+ return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+ }
+
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
switch (RC->getSize()) {
default:
@@ -3046,7 +3168,8 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
case 16: {
- assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+ assert((X86::VR128RegClass.hasSubClassEq(RC) ||
+ X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
// If stack is realigned we can use aligned stores.
if (isStackAligned)
return load ?
@@ -3058,12 +3181,19 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
(HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
}
case 32:
- assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+ assert((X86::VR256RegClass.hasSubClassEq(RC) ||
+ X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
if (isStackAligned)
return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
else
return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+ case 64:
+ assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+ if (isStackAligned)
+ return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+ else
+ return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
}
@@ -3090,7 +3220,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
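The change above (repeated in the three hunks that follow for storeRegToAddr, loadRegFromStackSlot and loadRegFromAddr) generalizes the old "RC->getSize() == 32 ? 32 : 16" rule so 64-byte ZMM register classes get 64-byte spill slots. A minimal sketch of the new rule with typical register-class sizes; the helper name is illustrative and not part of the patch:

#include <algorithm>
#include <cstdint>

// Spill-slot alignment = max(register-class size in bytes, 16).
static uint32_t spillSlotAlignment(uint32_t RCSize) {
  return std::max<uint32_t>(RCSize, 16);
}
// spillSlotAlignment(8)  == 16   // GR64: unchanged
// spillSlotAlignment(16) == 16   // XMM:  unchanged
// spillSlotAlignment(32) == 32   // YMM:  same result as the old rule
// spillSlotAlignment(64) == 64   // ZMM:  new; the old rule gave only 16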
@@ -3106,7 +3236,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3126,7 +3256,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3140,7 +3270,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3722,6 +3852,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case X86::AVX_SET0:
assert(HasAVX && "AVX not supported");
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+ case X86::AVX512_512_SET0:
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
@@ -3729,6 +3861,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
+ case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
+ case X86::KSET1B:
+ case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
}
return false;
}
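For reference (derived from the cases above, not additional patch content): Expand2AddrUndef rewrites each of these pseudos as the corresponding two-address idiom with undef inputs, so the constant is materialized without reading stale register contents:

  AVX512_512_SET0   ->  vpxord %zmmN, %zmmN, %zmmN    (512-bit zero idiom)
  KSET0W            ->  kxorw  %kN, %kN, %kN          (mask of all zeros)
  KSET1B / KSET1W   ->  kxnorw %kN, %kN, %kN          (mask of all ones)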
@@ -3942,18 +4077,6 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::RSQRTSSr_Int:
case X86::SQRTSSr:
case X86::SQRTSSr_Int:
- // AVX encoded versions
- case X86::VCVTSD2SSrr:
- case X86::Int_VCVTSD2SSrr:
- case X86::VCVTSS2SDrr:
- case X86::Int_VCVTSS2SDrr:
- case X86::VRCPSSr:
- case X86::VROUNDSDr:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSr_Int:
- case X86::VRSQRTSSr:
- case X86::VSQRTSSr:
return true;
}
@@ -3985,10 +4108,77 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
return 16;
}
+// Return true for any instruction that copies the high bits of the first source

+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr:
+ case X86::Int_VCVTSI2SSrr:
+ case X86::VCVTSI2SS64rr:
+ case X86::Int_VCVTSI2SS64rr:
+ case X86::VCVTSI2SDrr:
+ case X86::Int_VCVTSI2SDrr:
+ case X86::VCVTSI2SD64rr:
+ case X86::Int_VCVTSI2SD64rr:
+ case X86::VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::VRCPSSr:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSr_Int:
+ case X86::VRSQRTSSr:
+ case X86::VSQRTSSr:
+
+ // AVX-512
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSS2SDZrr:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms, which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ // Use the same magic number as getPartialRegUpdateClearance.
+ return 16;
+ }
+ return 0;
+}
+
void X86InstrInfo::
breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
unsigned Reg = MI->getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI->killsRegister(Reg, TRI))
+ return;
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
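How the new getUndefRegClearance hook and the kill-register early return below fit together is easiest to see from the caller's side. The sketch that follows is a simplified, hypothetical consumer in the spirit of the execution-dependency-fix pass; the names and control flow are illustrative only and not part of the patch:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;

// Illustrative consumer of the two hooks (simplified; not the real pass).
static void maybeBreakUndefRead(const TargetInstrInfo &TII,
                                const TargetRegisterInfo &TRI,
                                MachineInstr *MI,
                                unsigned InstrsSinceLastWrite) {
  unsigned OpNum = 0;
  // Ask the target how much clearance it wants before this undef read.
  unsigned Pref = TII.getUndefRegClearance(MI, OpNum, &TRI);
  if (Pref && InstrsSinceLastWrite < Pref)
    // Too close to the last write of the register: break the false dependence
    // (on x86 this inserts an xorps/pxor). If MI already kills the register,
    // the new early return in breakPartialRegDependency makes this a no-op.
    TII.breakPartialRegDependency(MI, OpNum, &TRI);
}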
@@ -4008,10 +4198,75 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
MI->addRegisterKilled(Reg, TRI, true);
}
-MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
- MachineInstr *MI,
- const SmallVectorImpl<unsigned> &Ops,
- int FrameIndex) const {
+static MachineInstr* foldPatchpoint(MachineFunction &MF,
+ MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex,
+ const TargetInstrInfo &TII) {
+ unsigned StartIdx = 0;
+ switch (MI->getOpcode()) {
+ case TargetOpcode::STACKMAP:
+ StartIdx = 2; // Skip ID, nShadowBytes.
+ break;
+ case TargetOpcode::PATCHPOINT: {
+ // For PatchPoint, the call args are not foldable.
+ PatchPointOpers opers(MI);
+ StartIdx = opers.getVarIdx();
+ break;
+ }
+ default:
+ llvm_unreachable("unexpected stackmap opcode");
+ }
+
+  // Return 0 (no fold) if any operand requested for folding is not foldable
+  // (i.e. not part of the stackmap's live values).
+ for (SmallVectorImpl<unsigned>::const_iterator I = Ops.begin(), E = Ops.end();
+ I != E; ++I) {
+ if (*I < StartIdx)
+ return 0;
+ }
+
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(MI->getOpcode()), MI->getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+  // No need to fold the return value, the metadata, or the function arguments.
+ for (unsigned i = 0; i < StartIdx; ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ for (unsigned i = StartIdx; i < MI->getNumOperands(); ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
+ assert(MO.getReg() && "patchpoint can only fold a vreg operand");
+ // Compute the spill slot size and offset.
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(MO.getReg());
+ unsigned SpillSize;
+ unsigned SpillOffset;
+ bool Valid = TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize,
+ SpillOffset, &MF.getTarget());
+ if (!Valid)
+ report_fatal_error("cannot spill patchpoint subregister operand");
+
+ MIB.addOperand(MachineOperand::CreateImm(StackMaps::IndirectMemRefOp));
+ MIB.addOperand(MachineOperand::CreateImm(SpillSize));
+ MIB.addOperand(MachineOperand::CreateFI(FrameIndex));
+ addOffset(MIB, SpillOffset);
+ }
+ else
+ MIB.addOperand(MO);
+ }
+ return NewMI;
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+ const SmallVectorImpl<unsigned> &Ops,
+ int FrameIndex) const {
+ // Special case stack map and patch point intrinsics.
+ if (MI->getOpcode() == TargetOpcode::STACKMAP
+ || MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+ return foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
+ }
// Check switch flag
if (NoFusing) return NULL;
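For reference (this follows directly from foldPatchpoint above; it is not additional patch content): a folded live-value register operand of a STACKMAP/PATCHPOINT is replaced by the indirect-memory four-tuple that the stackmap emitter serializes:

  before folding:  ..., %vreg, ...
  after  folding:  ..., StackMaps::IndirectMemRefOp, <spill size>,
                        <frame index>, <offset>, ...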
@@ -4025,6 +4280,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
const MachineFrameInfo *MFI = MF.getFrameInfo();
unsigned Size = MFI->getObjectSize(FrameIndex);
unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+ // If the function stack isn't realigned we don't want to fold instructions
+ // that need increased alignment.
+ if (!RI.needsStackRealignment(MF))
+ Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
@@ -4054,6 +4313,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) const {
+ // If loading from a FrameIndex, fold directly from the FrameIndex.
+ unsigned NumOps = LoadMI->getDesc().getNumOperands();
+ int FrameIndex;
+ if (isLoadFromStackSlot(LoadMI, FrameIndex))
+ return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+
// Check switch flag
if (NoFusing) return NULL;
@@ -4179,7 +4444,6 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NULL;
// Folding a normal load. Just copy the load's address operands.
- unsigned NumOps = LoadMI->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
MOs.push_back(LoadMI->getOperand(i));
break;
@@ -5001,6 +5265,37 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VSQRTSSm:
case X86::VSQRTSSm_Int:
case X86::VSQRTSSr:
+ case X86::VSQRTPDZrm:
+ case X86::VSQRTPDZrr:
+ case X86::VSQRTPSZrm:
+ case X86::VSQRTPSZrr:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZm:
+ case X86::VDIVSDZrm:
+ case X86::VDIVSDZrr:
+ case X86::VDIVSSZrm:
+ case X86::VDIVSSZrr:
+
+ case X86::VGATHERQPSZrm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPSZrm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQQZrm:
+ case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDQZrm:
+ case X86::VSCATTERQPDZmr:
+ case X86::VSCATTERQPSZmr:
+ case X86::VSCATTERDPDZmr:
+ case X86::VSCATTERDPSZmr:
+ case X86::VPSCATTERQDZmr:
+ case X86::VPSCATTERQQZmr:
+ case X86::VPSCATTERDDZmr:
+ case X86::VPSCATTERDQZmr:
return true;
}
}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index a0d1ba7..600e392 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -152,6 +152,8 @@ class X86InstrInfo : public X86GenInstrInfo {
MemOp2RegOpTableType &M2RTable,
unsigned RegOp, unsigned MemOp, unsigned Flags);
+ virtual void anchor();
+
public:
explicit X86InstrInfo(X86TargetMachine &tm);
@@ -369,6 +371,8 @@ public:
unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const;
+ unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const;
void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const;
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 0960a2a..6e5d543 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -248,11 +248,12 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
-def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>;
def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>;
+def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntShiftOp>;
+def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
@@ -278,53 +279,52 @@ def ptr_rc_nosp : PointerLikeRegClass<1>;
// *mem - Operand definitions for the funky X86 addressing mode operands.
//
-def X86MemAsmOperand : AsmOperandClass {
- let Name = "Mem"; let PredicateMethod = "isMem";
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
}
-def X86Mem8AsmOperand : AsmOperandClass {
- let Name = "Mem8"; let PredicateMethod = "isMem8";
+def X86Mem8AsmOperand : AsmOperandClass {
+ let Name = "Mem8"; let RenderMethod = "addMemOperands";
}
-def X86Mem16AsmOperand : AsmOperandClass {
- let Name = "Mem16"; let PredicateMethod = "isMem16";
+def X86Mem16AsmOperand : AsmOperandClass {
+ let Name = "Mem16"; let RenderMethod = "addMemOperands";
}
-def X86Mem32AsmOperand : AsmOperandClass {
- let Name = "Mem32"; let PredicateMethod = "isMem32";
+def X86Mem32AsmOperand : AsmOperandClass {
+ let Name = "Mem32"; let RenderMethod = "addMemOperands";
}
-def X86Mem64AsmOperand : AsmOperandClass {
- let Name = "Mem64"; let PredicateMethod = "isMem64";
+def X86Mem64AsmOperand : AsmOperandClass {
+ let Name = "Mem64"; let RenderMethod = "addMemOperands";
}
-def X86Mem80AsmOperand : AsmOperandClass {
- let Name = "Mem80"; let PredicateMethod = "isMem80";
+def X86Mem80AsmOperand : AsmOperandClass {
+ let Name = "Mem80"; let RenderMethod = "addMemOperands";
}
-def X86Mem128AsmOperand : AsmOperandClass {
- let Name = "Mem128"; let PredicateMethod = "isMem128";
+def X86Mem128AsmOperand : AsmOperandClass {
+ let Name = "Mem128"; let RenderMethod = "addMemOperands";
}
-def X86Mem256AsmOperand : AsmOperandClass {
- let Name = "Mem256"; let PredicateMethod = "isMem256";
+def X86Mem256AsmOperand : AsmOperandClass {
+ let Name = "Mem256"; let RenderMethod = "addMemOperands";
+}
+def X86Mem512AsmOperand : AsmOperandClass {
+ let Name = "Mem512"; let RenderMethod = "addMemOperands";
}
// Gather mem operands
def X86MemVX32Operand : AsmOperandClass {
- let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+ let Name = "MemVX32"; let RenderMethod = "addMemOperands";
}
def X86MemVY32Operand : AsmOperandClass {
- let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+ let Name = "MemVY32"; let RenderMethod = "addMemOperands";
+}
+def X86MemVZ32Operand : AsmOperandClass {
+ let Name = "MemVZ32"; let RenderMethod = "addMemOperands";
}
def X86MemVX64Operand : AsmOperandClass {
- let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+ let Name = "MemVX64"; let RenderMethod = "addMemOperands";
}
def X86MemVY64Operand : AsmOperandClass {
- let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+ let Name = "MemVY64"; let RenderMethod = "addMemOperands";
}
-
def X86MemVZ64Operand : AsmOperandClass {
- let Name = "MemVZ64"; let PredicateMethod = "isMemVZ64";
-}
-def X86MemVZ32Operand : AsmOperandClass {
- let Name = "MemVZ32"; let PredicateMethod = "isMemVZ32";
-}
-def X86Mem512AsmOperand : AsmOperandClass {
- let Name = "Mem512"; let PredicateMethod = "isMem512";
+ let Name = "MemVZ64"; let RenderMethod = "addMemOperands";
}
def X86AbsMemAsmOperand : AsmOperandClass {
@@ -343,29 +343,29 @@ def opaque48mem : X86MemOperand<"printopaquemem">;
def opaque80mem : X86MemOperand<"printopaquemem">;
def opaque512mem : X86MemOperand<"printopaquemem">;
-def i8mem : X86MemOperand<"printi8mem"> {
+def i8mem : X86MemOperand<"printi8mem"> {
let ParserMatchClass = X86Mem8AsmOperand; }
-def i16mem : X86MemOperand<"printi16mem"> {
+def i16mem : X86MemOperand<"printi16mem"> {
let ParserMatchClass = X86Mem16AsmOperand; }
-def i32mem : X86MemOperand<"printi32mem"> {
+def i32mem : X86MemOperand<"printi32mem"> {
let ParserMatchClass = X86Mem32AsmOperand; }
-def i64mem : X86MemOperand<"printi64mem"> {
+def i64mem : X86MemOperand<"printi64mem"> {
let ParserMatchClass = X86Mem64AsmOperand; }
-def i128mem : X86MemOperand<"printi128mem"> {
+def i128mem : X86MemOperand<"printi128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
-def i256mem : X86MemOperand<"printi256mem"> {
+def i256mem : X86MemOperand<"printi256mem"> {
let ParserMatchClass = X86Mem256AsmOperand; }
-def i512mem : X86MemOperand<"printi512mem"> {
+def i512mem : X86MemOperand<"printi512mem"> {
let ParserMatchClass = X86Mem512AsmOperand; }
-def f32mem : X86MemOperand<"printf32mem"> {
+def f32mem : X86MemOperand<"printf32mem"> {
let ParserMatchClass = X86Mem32AsmOperand; }
-def f64mem : X86MemOperand<"printf64mem"> {
+def f64mem : X86MemOperand<"printf64mem"> {
let ParserMatchClass = X86Mem64AsmOperand; }
-def f80mem : X86MemOperand<"printf80mem"> {
+def f80mem : X86MemOperand<"printf80mem"> {
let ParserMatchClass = X86Mem80AsmOperand; }
-def f128mem : X86MemOperand<"printf128mem"> {
+def f128mem : X86MemOperand<"printf128mem"> {
let ParserMatchClass = X86Mem128AsmOperand; }
-def f256mem : X86MemOperand<"printf256mem">{
+def f256mem : X86MemOperand<"printf256mem">{
let ParserMatchClass = X86Mem256AsmOperand; }
def f512mem : X86MemOperand<"printf512mem">{
let ParserMatchClass = X86Mem512AsmOperand; }
@@ -439,17 +439,49 @@ let OperandType = "OPERAND_PCREL",
def i32imm_pcrel : Operand<i32>;
def i16imm_pcrel : Operand<i16>;
-def offset8 : Operand<i64>;
-def offset16 : Operand<i64>;
-def offset32 : Operand<i64>;
-def offset64 : Operand<i64>;
-
// Branch targets have OtherVT type and print as pc-relative values.
def brtarget : Operand<OtherVT>;
def brtarget8 : Operand<OtherVT>;
}
+def X86MemOffs8AsmOperand : AsmOperandClass {
+ let Name = "MemOffs8";
+ let RenderMethod = "addMemOffsOperands";
+ let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86MemOffs16AsmOperand : AsmOperandClass {
+ let Name = "MemOffs16";
+ let RenderMethod = "addMemOffsOperands";
+ let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86MemOffs32AsmOperand : AsmOperandClass {
+ let Name = "MemOffs32";
+ let RenderMethod = "addMemOffsOperands";
+ let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86MemOffs64AsmOperand : AsmOperandClass {
+ let Name = "MemOffs64";
+ let RenderMethod = "addMemOffsOperands";
+ let SuperClasses = [X86Mem64AsmOperand];
+}
+
+let OperandType = "OPERAND_MEMORY" in {
+def offset8 : Operand<i64> {
+ let ParserMatchClass = X86MemOffs8AsmOperand;
+ let PrintMethod = "printMemOffs8"; }
+def offset16 : Operand<i64> {
+ let ParserMatchClass = X86MemOffs16AsmOperand;
+ let PrintMethod = "printMemOffs16"; }
+def offset32 : Operand<i64> {
+ let ParserMatchClass = X86MemOffs32AsmOperand;
+ let PrintMethod = "printMemOffs32"; }
+def offset64 : Operand<i64> {
+ let ParserMatchClass = X86MemOffs64AsmOperand;
+ let PrintMethod = "printMemOffs64"; }
+}
+
+
def SSECC : Operand<i8> {
let PrintMethod = "printSSECC";
let OperandType = "OPERAND_IMMEDIATE";
@@ -470,6 +502,14 @@ class ImmZExtAsmOperandClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+ let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+ let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+
// Sign-extended immediate classes. We don't need to define the full lattice
// here because there is no instruction with an ambiguity between ImmSExti64i32
// and ImmSExti32i8.
@@ -617,13 +657,13 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
+def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
def HasCDI : Predicate<"Subtarget->hasCDI()">;
def HasPFI : Predicate<"Subtarget->hasPFI()">;
-def HasEMI : Predicate<"Subtarget->hasERI()">;
+def HasERI : Predicate<"Subtarget->hasERI()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
@@ -632,6 +672,7 @@ def HasFMA : Predicate<"Subtarget->hasFMA()">;
def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
+def HasTBM : Predicate<"Subtarget->hasTBM()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
@@ -643,9 +684,10 @@ def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasHLE : Predicate<"Subtarget->hasHLE()">;
def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
+def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
-def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -944,53 +986,56 @@ let Defs = [EFLAGS] in {
def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
- IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, TB, OpSize, Sched<[WriteShift]>;
def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsf{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
- IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, TB, OpSize, Sched<[WriteShiftLd]>;
def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+ [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+ IIC_BIT_SCAN_REG>, TB,
Sched<[WriteShift]>;
def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsf{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
- IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
- IIC_BSF>, TB, Sched<[WriteShift]>;
+ IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>;
def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsf{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
- IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+ [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+ IIC_BIT_SCAN_REG>,
TB, OpSize, Sched<[WriteShift]>;
def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"bsr{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
- IIC_BSR>, TB,
+ IIC_BIT_SCAN_MEM>, TB,
OpSize, Sched<[WriteShiftLd]>;
def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+ [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+ IIC_BIT_SCAN_REG>, TB,
Sched<[WriteShift]>;
def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"bsr{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
- IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+ [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BIT_SCAN_REG>, TB,
Sched<[WriteShift]>;
def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"bsr{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
- IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+ IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
} // Defs = [EFLAGS]
let SchedRW = [WriteMicrocoded] in {
@@ -1072,9 +1117,12 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
[(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
} // SchedRW
+let hasSideEffects = 0 in {
+
/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
/// 32-bit offset from the PC. These are only valid in x86-32 mode.
let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
"mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
@@ -1084,6 +1132,8 @@ def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
"mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
+}
+let mayStore = 1 in {
def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
"mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
@@ -1094,34 +1144,40 @@ def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
"mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
Requires<[In32BitMode]>;
}
+}
// These forms all have full 64-bit absolute addresses in their instructions
// and use the movabs mnemonic to indicate this specific form.
-def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset64:$src),
+let mayLoad = 1 in {
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset8:$src),
"movabs{b}\t{$src, %al|al, $src}", []>,
Requires<[In64BitMode]>;
-def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset16:$src),
"movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize,
Requires<[In64BitMode]>;
-def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset64:$src),
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset32:$src),
"movabs{l}\t{$src, %eax|eax, $src}", []>,
Requires<[In64BitMode]>;
def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src),
"movabs{q}\t{$src, %rax|rax, $src}", []>,
Requires<[In64BitMode]>;
+}
-def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset64:$dst), (ins),
+let mayStore = 1 in {
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset8:$dst), (ins),
"movabs{b}\t{%al, $dst|$dst, al}", []>,
Requires<[In64BitMode]>;
-def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset16:$dst), (ins),
"movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize,
Requires<[In64BitMode]>;
-def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset64:$dst), (ins),
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset32:$dst), (ins),
"movabs{l}\t{%eax, $dst|$dst, eax}", []>,
Requires<[In64BitMode]>;
def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins),
"movabs{q}\t{%rax, $dst|$dst, rax}", []>,
Requires<[In64BitMode]>;
+}
+} // hasSideEffects = 0
let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
@@ -1173,7 +1229,7 @@ def MOV8rr_NOREX : I<0x88, MRMDestReg,
(outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>,
Sched<[WriteMove]>;
-let mayStore = 1 in
+let mayStore = 1, neverHasSideEffects = 1 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", [],
@@ -1814,6 +1870,30 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
int_x86_bmi_bzhi_64, loadi64>, VEX_W;
}
+def : Pat<(X86bzhi GR32:$src1, GR8:$src2),
+ (BZHI32rr GR32:$src1,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2),
+ (BZHI32rm addr:$src1,
+ (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi GR64:$src1, GR8:$src2),
+ (BZHI64rr GR64:$src1,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2),
+ (BZHI64rm addr:$src1,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+let Predicates = [HasBMI] in {
+ def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+ (BEXTR32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+ (BEXTR32rm addr:$src1, GR32:$src2)>;
+ def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+ (BEXTR64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+ (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
@@ -1838,6 +1918,134 @@ let Predicates = [HasBMI2] in {
}
//===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ Intrinsic Int, Operand immtype,
+ SDPatternOperator immoperator> {
+ def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+ XOP, XOPA, VEX;
+ def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, immtype:$cntl),
+ !strconcat(OpcodeStr,
+ "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+ [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+ XOP, XOPA, VEX;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+ int_x86_tbm_bextri_u32, i32imm, imm>;
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+ int_x86_tbm_bextri_u64, i64i32imm,
+ i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+ RegisterClass RC, string OpcodeStr,
+ X86MemOperand x86memop, PatFrag ld_frag> {
+let hasSideEffects = 0 in {
+ def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, XOP, XOP9, VEX_4V;
+ let mayLoad = 1 in
+ def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+ []>, XOP, XOP9, VEX_4V;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+ Format FormReg, Format FormMem> {
+ defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+ loadi32>;
+ defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+ loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
+defm BLCI : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
+defm BLCIC : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
+defm BLCS : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
+defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+ def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+ (BEXTRI32ri GR32:$src1, imm:$src2)>;
+ def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+ (BEXTRI32mi addr:$src1, imm:$src2)>;
+ def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+ (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+ def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+ (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+ // FIXME: patterns for the load versions are not implemented
+ def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+ (BLCI64rr GR64:$src)>;
+
+ // Extra patterns because opt can optimize the above patterns to this.
+ def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+ (BLCI32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+ (BLCI64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+ (BLCIC32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+ (BLCIC64rr GR64:$src)>;
+
+ def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+ (BLCMSK32rr GR32:$src)>;
+ def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+ (BLCMSK64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+ (BLCS32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+ (BLCS64rr GR64:$src)>;
+
+ def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+ (BLSFILL32rr GR32:$src)>;
+ def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+ (BLSFILL64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+ (BLSIC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+ (BLSIC64rr GR64:$src)>;
+
+ def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+ (T1MSKC32rr GR32:$src)>;
+ def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+ (T1MSKC64rr GR64:$src)>;
+
+ def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+ (TZMSK32rr GR32:$src)>;
+ def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+ (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
// Subsystems.
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index cb12956..ba58143 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -204,7 +204,7 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
//===----------------------------------------------------------------------===//
def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms",
- [(int_x86_mmx_emms)]>;
+ [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
//===----------------------------------------------------------------------===//
// MMX Scalar Instructions
@@ -236,10 +236,10 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
(MMX_X86movd2w (x86mmx VR64:$src)))],
IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
-let neverHasSideEffects = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
"movd\t{$src, $dst|$dst, $src}",
- [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+ [(set VR64:$dst, (bitconvert GR64:$src))],
+ IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
// These are 64 bit moves, but since the OS X assembler doesn't
// recognize a register-register movq, we write them as
@@ -250,10 +250,6 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
"movd\t{$src, $dst|$dst, $src}",
[(set GR64:$dst,
(bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
-def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
let neverHasSideEffects = 1 in
def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
@@ -289,7 +285,7 @@ def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(i64 (bitconvert (x86mmx VR64:$src))))))],
IIC_MMX_MOVQ_RR>;
-let neverHasSideEffects = 1 in
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[], IIC_MMX_MOVQ_RR>;
@@ -297,6 +293,7 @@ def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[], IIC_MMX_MOVQ_RR>;
+}
} // SchedRW
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
@@ -304,21 +301,15 @@ def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
[(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
-let AddedComplexity = 15 in
-// movd to MMX register zero-extends
-def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
-let AddedComplexity = 20 in
-def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
- (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (x86mmx (X86vzmovl (x86mmx
- (scalar_to_vector (loadi32 addr:$src))))))],
- IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+let Predicates = [HasMMX] in {
+ let AddedComplexity = 15 in
+ // movd to MMX register zero-extends
+ def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+ (MMX_MOVD64rr GR32:$src)>;
+ let AddedComplexity = 20 in
+ def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+ (MMX_MOVD64rm addr:$src)>;
+}
// Arithmetic Instructions
defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
@@ -555,18 +546,18 @@ let Constraints = "$src1 = $dst" in {
// Extract / Insert
def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
- (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
- "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
- (iPTR imm:$src2)))],
- IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+ (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2),
+ "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+ (iPTR imm:$src2)))],
+ IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
let Constraints = "$src1 = $dst" in {
def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
(outs VR64:$dst),
- (ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
+ (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3),
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
- GR32:$src2, (iPTR imm:$src3)))],
+ GR32orGR64:$src2, (iPTR imm:$src3)))],
IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
@@ -580,9 +571,10 @@ let Constraints = "$src1 = $dst" in {
}
// Mask creation
-def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR64:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst,
+ [(set GR32orGR64:$dst,
(int_x86_mmx_pmovmskb VR64:$src))]>;
@@ -599,10 +591,10 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
// Misc.
let SchedRW = [WriteShuffle] in {
let Uses = [EDI] in
-def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
- "maskmovq\t{$mask, $src|$src, $mask}",
- [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
- IIC_MMX_MASKMOV>;
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+ "maskmovq\t{$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+ IIC_MMX_MASKMOV>;
let Uses = [RDI] in
def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
"maskmovq\t{$mask, $src|$src, $mask}",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a86006a..a5debc0 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -151,6 +151,34 @@ def SSE_MOVU_ITINS : OpndItins<
IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;
+def SSE_DPPD_ITINS : OpndItins<
+ IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins<
+  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+ IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+ IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+ IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+def SSE_MPSADBW_ITINS : OpndItins<
+ IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+def SSE_PMULLD_ITINS : OpndItins<
+ IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//
@@ -455,10 +483,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
-// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
-// is used instead. Register-to-register movss/movsd is not modeled as an
-// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
-// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+// register copies because it's a partial register update; register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy and, as just mentioned, we
+// don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
@@ -526,7 +554,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
}
// Patterns
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVS{S,D} to the lower bits.
@@ -1074,23 +1102,6 @@ let Predicates = [UseSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
-// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
-// bits are disregarded. FIXME: Set encoding to pseudo!
-let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
-def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
-def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
-def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- "movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
-def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
- "movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
-}
-
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
@@ -1103,15 +1114,15 @@ let isCodeGenOnly = 1 in {
"movapd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (alignedloadfsf64 addr:$src))],
IIC_SSE_MOVA_P_RM>, VEX;
+ def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
+ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>;
}
-def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
- IIC_SSE_MOVA_P_RM>;
-def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
- IIC_SSE_MOVA_P_RM>;
}
//===----------------------------------------------------------------------===//
@@ -1327,7 +1338,7 @@ let Predicates = [UseSSE2] in {
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseAVX] in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1358,7 +1369,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1440,7 +1451,7 @@ let neverHasSideEffects = 1 in {
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
Sched<[WriteCvtI2F]>;
@@ -1452,6 +1463,7 @@ let neverHasSideEffects = 1 in {
} // neverHasSideEffects = 1
}
+let Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>,
@@ -1485,7 +1497,7 @@ def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
-
+}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
@@ -1499,12 +1511,12 @@ defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
XD, VEX_4V, VEX_W, VEX_LIG;
-def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+let Predicates = [UseAVX] in {
+ def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
-def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+ def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
-let Predicates = [HasAVX] in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1606,19 +1618,21 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
-
+}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+let Predicates = [UseAVX] in {
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
SSE_CVT_Scalar, 0>, XS, VEX_4V;
@@ -1633,7 +1647,7 @@ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
SSE_CVT_Scalar, 0>, XD,
VEX_4V, VEX_W;
-
+}
let Constraints = "$src1 = $dst" in {
defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32,
@@ -1652,6 +1666,7 @@ let Constraints = "$src1 = $dst" in {
/// SSE 1 Only
// Aliases for intrinsics
+let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
SSE_CVT_SS2SI_32>, XS, VEX;
@@ -1666,6 +1681,7 @@ defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
"cvttsd2si", SSE_CVT_SD2SI>,
XD, VEX, VEX_W;
+}
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
SSE_CVT_SS2SI_32>, XS;
@@ -1679,13 +1695,14 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
"cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+let Predicates = [UseAVX] in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
-
+}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_32>, XS;
@@ -1707,6 +1724,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
SSEPackedSingle, SSE_CVT_PS>,
TB, Requires<[UseSSE2]>;
+let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1723,6 +1741,7 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
(VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
(CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
@@ -1744,7 +1763,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
/// SSE 2 Only
// Convert scalar double to scalar single
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR64:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
@@ -1760,7 +1779,7 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
}
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
- Requires<[HasAVX]>;
+ Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
@@ -1778,27 +1797,27 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
Sched<[WriteCvtF2F]>;
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
- "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
@@ -1807,7 +1826,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR32:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1824,16 +1843,16 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
}
def : Pat<(f64 (fextend FR32:$src)),
- (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
def : Pat<(extloadf32 addr:$src),
(VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
+ Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
(VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
- Requires<[HasAVX, OptForSpeed]>;
+ Requires<[UseAVX, OptForSpeed]>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -1861,14 +1880,14 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
@@ -1895,7 +1914,7 @@ def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+ (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1905,7 +1924,7 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
+ (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1934,7 +1953,7 @@ def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX,
+ (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
Sched<[WriteCvtF2ILd]>;
// YMM only
@@ -1946,7 +1965,7 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
+ (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -1972,7 +1991,7 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttps2dq
- (memopv4f32 addr:$src)))],
+ (loadv4f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1982,7 +2001,7 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
- (memopv8f32 addr:$src)))],
+ (loadv8f32 addr:$src)))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
Sched<[WriteCvtF2ILd]>;
@@ -1999,27 +2018,27 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
let Predicates = [HasAVX] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
(VCVTDQ2PSrm addr:$src)>;
def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
(VCVTDQ2PSrr VR128:$src)>;
- def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+ def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
(VCVTDQ2PSrm addr:$src)>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(VCVTTPS2DQrr VR128:$src)>;
- def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+ def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
(VCVTTPS2DQrm addr:$src)>;
def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
(VCVTDQ2PSYrr VR256:$src)>;
- def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
+ def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
(VCVTDQ2PSYrm addr:$src)>;
def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
(VCVTTPS2DQYrr VR256:$src)>;
- def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
+ def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
(VCVTTPS2DQYrm addr:$src)>;
}
@@ -2056,7 +2075,7 @@ def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dqx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memopv2f64 addr:$src)))],
+ (loadv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
// YMM only
@@ -2068,7 +2087,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
+ (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -2076,7 +2095,7 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
(VCVTTPD2DQYrr VR256:$src)>;
- def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+ def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
(VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX]
@@ -2110,7 +2129,7 @@ def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
+ (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
@@ -2140,7 +2159,7 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtdq2_pd_256
- (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+ (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2162,7 +2181,7 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
let Predicates = [HasAVX] in {
def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
(VCVTDQ2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+ def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
(VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]
@@ -2181,7 +2200,7 @@ def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2psx\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+ (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
// YMM only
@@ -2193,7 +2212,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
+ (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
@@ -2215,13 +2234,13 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
let Predicates = [HasAVX] in {
def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
(VCVTDQ2PSYrr VR256:$src)>;
- def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+ def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
(VCVTDQ2PSYrm addr:$src)>;
// Match fround and fextend for 128/256-bit conversions
def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
(VCVTPD2PSrr VR128:$src)>;
- def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+ def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
(VCVTPD2PSXrm addr:$src)>;
def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
(VCVTPD2PSYrr VR256:$src)>;
@@ -2299,7 +2318,7 @@ let Constraints = "$src1 = $dst" in {
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSE_ALU_F32S>, // same latency as 32 bit compare
+ SSE_ALU_F64S>,
XD;
}
@@ -2334,7 +2353,7 @@ let Constraints = "$src1 = $dst" in {
SSE_ALU_F32S>, XS;
defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S>, // same latency as f32
+ SSE_ALU_F64S>,
XD;
}
@@ -2403,26 +2422,27 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
Operand CC, Intrinsic Int, string asm,
- string asm_alt, Domain d> {
+ string asm_alt, Domain d,
+ OpndItins itins = SSE_ALU_F32P> {
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
[(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
- IIC_SSE_CMPP_RR, d>,
+ itins.rr, d>,
Sched<[WriteFAdd]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
- IIC_SSE_CMPP_RM, d>,
+ itins.rm, d>,
Sched<[WriteFAddLd, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let neverHasSideEffects = 1 in {
def rri_alt : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>;
+ asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
def rmi_alt : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
- asm_alt, [], IIC_SSE_CMPP_RM, d>,
+ asm_alt, [], itins.rm, d>,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
}
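// The OpndItins parameter (default SSE_ALU_F32P) lets each instantiation pick
// single- vs double-precision compare itineraries for the rr/rm forms.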
@@ -2447,11 +2467,11 @@ let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedSingle>, TB;
+ SSEPackedSingle, SSE_ALU_F32P>, TB;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedDouble>, TB, OpSize;
+ SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize;
}
let Predicates = [HasAVX] in {
@@ -2511,16 +2531,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+ loadv4f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+ loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
- "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
- memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
- "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
- memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+ "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2535,13 +2555,13 @@ let Constraints = "$src1 = $dst" in {
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86Shufp VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
(VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
def : Pat<(v2i64 (X86Shufp VR128:$src1,
- (memopv2i64 addr:$src2), (i8 imm:$imm))),
+ (loadv2i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
@@ -2550,13 +2570,13 @@ let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86Shufp VR256:$src1,
- (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
(VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86Shufp VR256:$src1,
- (memopv4i64 addr:$src2), (i8 imm:$imm))),
+ (loadv4i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}
@@ -2600,29 +2620,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
Sched<[WriteShuffleLd, ReadAfterLd]>;
}
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedSingle>, TB, VEX_4V;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedDouble>, TB, OpSize, VEX_4V;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedSingle>, TB, VEX_4V;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedDouble>, TB, OpSize, VEX_4V;
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedSingle>, TB, VEX_4V, VEX_L;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedSingle>, TB, VEX_4V, VEX_L;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
@@ -2642,20 +2662,20 @@ let Constraints = "$src1 = $dst" in {
} // Constraints = "$src1 = $dst"
let Predicates = [HasAVX1Only] in {
- def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
(VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
(VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
(VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
(VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
(VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
(VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
(VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
@@ -2686,13 +2706,10 @@ let Predicates = [UseSSE2] in {
/// sse12_extr_sign_mask - sse 1 & 2 packed FP sign mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
Domain d> {
- def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
- Sched<[WriteVecLogic]>;
- def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
- IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>;
+ def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+ Sched<[WriteVecLogic]>;
}
let Predicates = [HasAVX] in {
@@ -2709,29 +2726,15 @@ let Predicates = [HasAVX] in {
OpSize, VEX, VEX_L;
def : Pat<(i32 (X86fgetsign FR32:$src)),
- (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
- (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ (SUBREG_TO_REG (i64 0),
+ (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
- (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
- (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- // Assembler Only
- def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>;
- def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedDouble>, TB,
- OpSize, VEX, Sched<[WriteVecLogic]>;
- def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>;
- def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedDouble>, TB,
- OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>;
+ (SUBREG_TO_REG (i64 0),
+ (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2740,16 +2743,18 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
SSEPackedDouble>, TB, OpSize;
def : Pat<(i32 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
- (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+ (SUBREG_TO_REG (i64 0),
+ (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
- (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+ (SUBREG_TO_REG (i64 0),
+ (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
Requires<[UseSSE2]>;
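// The i64 X86fgetsign patterns wrap the 32-bit MOVMSK result in SUBREG_TO_REG
// because MOVMSKPS/MOVMSKPD write a 32-bit GPR, and a 32-bit write implicitly
// zeroes the upper half of the 64-bit register.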
//===---------------------------------------------------------------------===//
@@ -2788,7 +2793,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
OpndItins itins, bit IsCommutable = 0> {
let Predicates = [HasAVX] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
- VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2796,7 +2801,7 @@ let Constraints = "$src1 = $dst" in
let Predicates = [HasAVX2] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
- OpVT256, VR256, memopv4i64, i256mem, itins,
+ OpVT256, VR256, loadv4i64, i256mem, itins,
IsCommutable, 0>, VEX_4V, VEX_L;
}
@@ -2836,16 +2841,18 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
}
// Alias bitwise logical operations using SSE logical ops on packed FP values.
-defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
-defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
-defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-
-let isCommutable = 0 in
- defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+let isCodeGenOnly = 1 in {
+ defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+ defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
SSE_BIT_ITINS_P>;
+ defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
+
+ let isCommutable = 0 in
+ defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+ SSE_BIT_ITINS_P>;
+}
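// The FsAND/FsOR/FsXOR/FsANDN forms apply the packed logic opcodes to FR32/FR64
// scalars; isCodeGenOnly keeps them out of the assembler and disassembler
// tables so they do not collide with the real ANDPS/ORPS/XORPS/ANDNPS entries.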
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
@@ -2855,14 +2862,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "ps"), f256mem,
[(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+ (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>,
+ (loadv4i64 addr:$src2)))], 0>,
TB, OpSize, VEX_4V, VEX_L;
// In AVX no need to add a pattern for 128-bit logical rr ps, because they
@@ -2872,14 +2879,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem, [],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+ (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
(bc_v2i64 (v2f64 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (memopv2i64 addr:$src2)))], 0>,
+ (loadv2i64 addr:$src2)))], 0>,
TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
@@ -2921,120 +2928,93 @@ let isCommutable = 0 in
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
-multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins,
- bit Is2Addr = 1> {
- defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
- OpNode, FR32, f32mem,
- itins.s, Is2Addr>, XS;
- defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
- OpNode, FR64, f64mem,
- itins.d, Is2Addr>, XD;
-}
-
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, SizeItins itins> {
-let Predicates = [HasAVX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- VR128, v4f32, f128mem, memopv4f32,
+ VR128, v4f32, f128mem, loadv4f32,
SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- VR128, v2f64, f128mem, memopv2f64,
+ VR128, v2f64, f128mem, loadv2f64,
SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, VR256, v8f32, f256mem, memopv8f32,
+ OpNode, VR256, v8f32, f256mem, loadv8f32,
SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, VR256, v4f64, f256mem, memopv4f64,
+ OpNode, VR256, v4f64, f256mem, loadv4f64,
SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
-}
-
-let Constraints = "$src1 = $dst" in {
- defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
- v4f32, f128mem, memopv4f32, SSEPackedSingle,
- itins.s, 1>, TB;
- defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
- v2f64, f128mem, memopv2f64, SSEPackedDouble,
- itins.d, 1>, TB, OpSize;
-}
-}
-multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
- SizeItins itins,
- bit Is2Addr = 1> {
- defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
- itins.s, Is2Addr>, XS;
- defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
- itins.d, Is2Addr>, XD;
+ let Constraints = "$src1 = $dst" in {
+ defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+ v4f32, f128mem, memopv4f32, SSEPackedSingle,
+ itins.s>, TB;
+ defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+ v2f64, f128mem, memopv2f64, SSEPackedDouble,
+ itins.d>, TB, OpSize;
+ }
}
-// Binary Arithmetic instructions
-defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>;
-defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>;
-let isCommutable = 0 in {
- defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>;
- defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>;
- defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>;
- defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>;
-}
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+ defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;
-let isCodeGenOnly = 1 in {
- defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
- defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+ OpNode, FR32, f32mem, itins.s>, XS;
+ defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+ OpNode, FR64, f64mem, itins.d>, XD;
+ }
}
-defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
- basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
-defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
- basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+ SizeItins itins> {
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ itins.s, 0>, XS, VEX_4V, VEX_LIG;
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ itins.d, 0>, XD, VEX_4V, VEX_LIG;
-let isCommutable = 0 in {
- defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
- defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
- basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
- defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
- basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
- defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
- basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
+ let Constraints = "$src1 = $dst" in {
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+ itins.s>, XS;
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+ !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+ itins.d>, XD;
+ }
}
-let Constraints = "$src1 = $dst" in {
- defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
- defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
-
- let isCommutable = 0 in {
- defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
- defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
- defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
- defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
- }
+// Binary Arithmetic instructions
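// Each defm below expands to both the VEX-encoded three-operand AVX forms
// (via the V#NAME prefix) and the legacy two-operand SSE forms with the
// tied $src1 = $dst constraint.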
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+ basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+let isCommutable = 0 in {
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}
let isCodeGenOnly = 1 in {
- defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
- defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
- VEX_4V, VEX_LIG;
- let Constraints = "$src1 = $dst" in {
- defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
- defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
- }
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+ basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
/// Unop Arithmetic
@@ -3180,7 +3160,7 @@ let Predicates = [HasAVX] in {
def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
@@ -3190,7 +3170,7 @@ let Predicates = [HasAVX] in {
def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+ [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}
@@ -3217,7 +3197,7 @@ let Predicates = [HasAVX] in {
def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+ [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
@@ -3228,7 +3208,7 @@ let Predicates = [HasAVX] in {
(ins f256mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
+ [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}
@@ -3298,7 +3278,7 @@ let Predicates = [HasAVX] in {
def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
@@ -3308,7 +3288,7 @@ let Predicates = [HasAVX] in {
def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+ [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}
@@ -3341,30 +3321,31 @@ defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
int_x86_avx_rcp_ps_256, SSE_RCPP>;
-def : Pat<(f32 (fsqrt FR32:$src)),
- (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (fsqrt (load addr:$src))),
- (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
-def : Pat<(f64 (fsqrt FR64:$src)),
- (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
-def : Pat<(f64 (fsqrt (load addr:$src))),
- (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
-
-def : Pat<(f32 (X86frsqrt FR32:$src)),
- (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (X86frsqrt (load addr:$src))),
- (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
-
-def : Pat<(f32 (X86frcp FR32:$src)),
- (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (X86frcp (load addr:$src))),
- (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
-
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
+ def : Pat<(f32 (fsqrt FR32:$src)),
+ (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+ def : Pat<(f32 (fsqrt (load addr:$src))),
+ (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+ def : Pat<(f64 (fsqrt FR64:$src)),
+ (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+ def : Pat<(f64 (fsqrt (load addr:$src))),
+ (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+ def : Pat<(f32 (X86frsqrt FR32:$src)),
+ (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+ def : Pat<(f32 (X86frsqrt (load addr:$src))),
+ (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+ def : Pat<(f32 (X86frcp FR32:$src)),
+ (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+ def : Pat<(f32 (X86frcp (load addr:$src))),
+ (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+}
+let Predicates = [UseAVX] in {
def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
(COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
(COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3378,7 +3359,9 @@ let Predicates = [HasAVX] in {
VR128)>;
def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
(VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
+let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
(COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
(COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3662,7 +3645,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
XS, Requires<[UseSSE2]>;
}
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}",
[/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
@@ -3725,7 +3708,7 @@ multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
bit IsCommutable = 0> {
let Predicates = [HasAVX] in
defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
- VR128, memopv2i64, i128mem, itins,
+ VR128, loadv2i64, i128mem, itins,
IsCommutable, 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
@@ -3734,7 +3717,7 @@ let Constraints = "$src1 = $dst" in
let Predicates = [HasAVX2] in
defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
- VR256, memopv4i64, i256mem, itins,
+ VR256, loadv4i64, i256mem, itins,
IsCommutable, 0>, VEX_4V, VEX_L;
}
@@ -3761,11 +3744,11 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
(bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
Sched<[WriteVecShiftLd, ReadAfterLd]>;
def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
- (ins RC:$src1, i32i8imm:$src2),
+ (ins RC:$src1, i8imm:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>,
+ [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
Sched<[WriteVecShift]>;
}
@@ -3849,15 +3832,15 @@ defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
- int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>;
+ int_x86_avx2_psad_bw, SSE_PMADD, 1>;
let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
- memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+ loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V;
let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
- VR256, memopv4i64, i256mem,
+ VR256, loadv4i64, i256mem,
SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
@@ -3993,12 +3976,14 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
(outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
"pslldq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
+ (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
+ IIC_SSE_INTSHDQ_P_RI>;
def PSRLDQri : PDIi8<0x73, MRM3r,
(outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
"psrldq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
+ (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
+ IIC_SSE_INTSHDQ_P_RI>;
// PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
@@ -4082,14 +4067,14 @@ let Predicates = [HasAVX] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, i8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
- (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX,
+ (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
Sched<[WriteShuffleLd]>;
}
@@ -4100,14 +4085,14 @@ let Predicates = [HasAVX2] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, i8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
- (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L,
+ (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
Sched<[WriteShuffleLd]>;
}
@@ -4118,14 +4103,14 @@ let Predicates = [UseSSE2] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF>, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
def mi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
- (i8 imm:$src2))))], IIC_SSE_PSHUF>,
+ (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
Sched<[WriteShuffleLd]>;
}
}
@@ -4136,7 +4121,7 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
(VPSHUFDmi addr:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
(VPSHUFDri VR128:$src1, imm:$imm)>;
@@ -4259,13 +4244,13 @@ let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
def rri : Ii8<0xC4, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1,
- GR32:$src2, i32i8imm:$src3),
+ GR32orGR64:$src2, i32i8imm:$src3),
!if(Is2Addr,
"pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>,
- Sched<[WriteShuffle]>;
+ (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+ IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
def rmi : Ii8<0xC4, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1,
i16mem:$src2, i32i8imm:$src3),
@@ -4281,29 +4266,24 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
// Extract
let Predicates = [HasAVX] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
- (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>, TB, OpSize, VEX,
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))]>, TB, OpSize, VEX,
Sched<[WriteShuffle]>;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
- (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+ (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))], IIC_SSE_PEXTRW>,
+ [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+ imm:$src2))], IIC_SSE_PEXTRW>,
Sched<[WriteShuffleLd, ReadAfterLd]>;
// Insert
-let Predicates = [HasAVX] in {
- defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
- def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
- "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>;
-}
+let Predicates = [HasAVX] in
+defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
-let Constraints = "$src1 = $dst" in
- defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, TB, OpSize;
} // ExeDomain = SSEPackedInt
@@ -4313,24 +4293,23 @@ let Constraints = "$src1 = $dst" in
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
-def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
IIC_SSE_MOVMSK>, VEX;
-def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;
let Predicates = [HasAVX2] in {
-def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+ (ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
-def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+ [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
+ VEX, VEX_L;
}
-def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+ [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
IIC_SSE_MOVMSK>;
} // ExeDomain = SSEPackedInt
@@ -4341,25 +4320,25 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
IIC_SSE_MASKMOV>, VEX;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
IIC_SSE_MASKMOV>, VEX;
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
IIC_SSE_MASKMOV>;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
@@ -4386,12 +4365,13 @@ def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
IIC_SSE_MOVDQ>,
VEX, Sched<[WriteLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
@@ -4410,6 +4390,7 @@ def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
@@ -4418,25 +4399,27 @@ def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
-def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-
-def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>,
- VEX, Sched<[WriteLoad]>;
-def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-
-def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in {
+ def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+ def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>,
+ VEX, Sched<[WriteLoad]>;
+ def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+ def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+}
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
@@ -4463,12 +4446,24 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+ (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
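// These X86Vinsert patterns select an insertion of a GPR into element 0 of a
// zero or undef 256-bit vector to a plain VMOVD/VMOVQ into an XMM register;
// SUBREG_TO_REG places the result in the low lane, and the VEX-encoded move
// already zeroes the upper bits.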
+
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>,
@@ -4484,121 +4479,112 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
-let Predicates = [HasAVX] in
-def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
- VEX, Sched<[WriteLoad]>;
-def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+let isCodeGenOnly = 1 in {
+ let Predicates = [UseAVX] in
+ def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ VEX, Sched<[WriteLoad]>;
+ def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+ def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+ def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+ IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+ def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-
-def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
- IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
-def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
-def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (bitconvert FR32:$src))],
- IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+let isCodeGenOnly = 1 in {
+ def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+ def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+ def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+ def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
//===---------------------------------------------------------------------===//
// Patterns and instructions to describe movd/movq to XMM register zero-extends
//
-let SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
-def VMOVZDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector GR32:$src)))))],
- IIC_SSE_MOVDQ>, VEX;
def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+ "movq\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector GR64:$src)))))],
IIC_SSE_MOVDQ>,
VEX, VEX_W;
-}
-let AddedComplexity = 15 in {
-def MOVZDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector GR32:$src)))))],
- IIC_SSE_MOVDQ>;
def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
[(set VR128:$dst, (v2i64 (X86vzmovl
(v2i64 (scalar_to_vector GR64:$src)))))],
IIC_SSE_MOVDQ>;
}
-} // SchedRW
+} // isCodeGenOnly, SchedRW
-let AddedComplexity = 20, SchedRW = [WriteLoad] in {
-def VMOVZDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
- (loadi32 addr:$src))))))],
- IIC_SSE_MOVDQ>, VEX;
-def MOVZDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
- "movd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
- (loadi32 addr:$src))))))],
- IIC_SSE_MOVDQ>;
-} // AddedComplexity, SchedRW
+let Predicates = [UseAVX] in {
+ let AddedComplexity = 15 in
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (VMOVDI2PDIrr GR32:$src)>;
-let Predicates = [HasAVX] in {
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (VMOVZDI2PDIrm addr:$src)>;
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (VMOVZDI2PDIrm addr:$src)>;
+ (VMOVDI2PDIrm addr:$src)>;
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}
-let Predicates = [UseSSE2], AddedComplexity = 20 in {
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
- (MOVZDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
- (MOVZDI2PDIrm addr:$src)>;
+let Predicates = [UseSSE2] in {
+ let AddedComplexity = 15 in
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+ (MOVDI2PDIrr GR32:$src)>;
+
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (MOVDI2PDIrm addr:$src)>;
+ }
}
// These are the correct encodings of the instructions so that we know how to
@@ -4607,15 +4593,12 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in {
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
(MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
- (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
(MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
- (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
- (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
- (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+ (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
@@ -4630,7 +4613,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
- VEX, Requires<[HasAVX]>;
+ VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4667,16 +4650,15 @@ def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
-let AddedComplexity = 20 in
+let isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
+ XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
-let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4684,10 +4666,9 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+}
-let Predicates = [HasAVX], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVZQI2PQIrm addr:$src)>;
+let Predicates = [UseAVX], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
(VMOVZQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
@@ -4695,8 +4676,6 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
}
let Predicates = [UseSSE2], AddedComplexity = 20 in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (MOVZQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
(MOVZQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
@@ -4719,7 +4698,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
- XS, VEX, Requires<[HasAVX]>;
+ XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4728,14 +4707,14 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
XS, Requires<[UseSSE2]>;
} // SchedRW
-let SchedRW = [WriteVecLogicLd] in {
+let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))],
IIC_SSE_MOVDQ>,
- XS, VEX, Requires<[HasAVX]>;
+ XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -4744,49 +4723,19 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
IIC_SSE_MOVDQ>,
XS, Requires<[UseSSE2]>;
}
-} // SchedRW
+} // isCodeGenOnly, SchedRW
let AddedComplexity = 20 in {
- let Predicates = [HasAVX] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVZPQILo2PQIrm addr:$src)>;
+ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (MOVZPQILo2PQIrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(MOVZPQILo2PQIrr VR128:$src)>;
}
}
-// Instructions to match in the assembler
-let SchedRW = [WriteMove] in {
-def VMOVQs64rr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "movq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVDQ>, VEX, VEX_W;
-def VMOVQd64rr : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVDQ>, VEX, VEX_W;
-// Recognize "movd" with GR64 destination, but encode as a "movq"
-def VMOVQd64rr_alt : VS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "movd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVDQ>, VEX, VEX_W;
-} // SchedRW
-
-// Instructions for the disassembler
-// xr = XMM register
-// xm = mem64
-
-let SchedRW = [WriteMove] in {
-let Predicates = [HasAVX] in
-def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
-def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
-} // SchedRW
-
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
@@ -4805,13 +4754,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
let Predicates = [HasAVX] in {
defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v4f32, VR128, memopv4f32, f128mem>, VEX;
+ v4f32, VR128, loadv4f32, f128mem>, VEX;
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v4f32, VR128, memopv4f32, f128mem>, VEX;
+ v4f32, VR128, loadv4f32, f128mem>, VEX;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
memopv4f32, f128mem>;
@@ -4821,19 +4770,19 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
(VMOVSHDUPrr VR128:$src)>;
- def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVSHDUPrm addr:$src)>;
def : Pat<(v4i32 (X86Movsldup VR128:$src)),
(VMOVSLDUPrr VR128:$src)>;
- def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
(VMOVSLDUPrm addr:$src)>;
def : Pat<(v8i32 (X86Movshdup VR256:$src)),
(VMOVSHDUPYrr VR256:$src)>;
- def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+ def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
(VMOVSHDUPYrm addr:$src)>;
def : Pat<(v8i32 (X86Movsldup VR256:$src)),
(VMOVSLDUPYrr VR256:$src)>;
- def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+ def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
(VMOVSLDUPYrm addr:$src)>;
}
@@ -4887,20 +4836,20 @@ let Predicates = [HasAVX] in {
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
let Predicates = [HasAVX] in {
- def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ def : Pat<(X86Movddup (loadv2f64 addr:$src)),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (bc_v2f64
(v2i64 (scalar_to_vector (loadi64 addr:$src))))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
// 256-bit version
- def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+ def : Pat<(X86Movddup (loadv4f64 addr:$src)),
(VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+ def : Pat<(X86Movddup (loadv4i64 addr:$src)),
(VMOVDDUPYrm addr:$src)>;
def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
(VMOVDDUPYrm addr:$src)>;
@@ -5102,12 +5051,12 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
VR128:$src))>;
-def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>;
-def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>;
+def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
VR256:$src))>;
-def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>;
-def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
let Predicates = [HasAVX] in {
defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb",
@@ -5263,34 +5212,34 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(IntId256 VR256:$src1,
- (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+ (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PHADDSUBW, 0>, VEX_4V;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PHADDSUBW, 0>, VEX_4V;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PSIGN, 0>, VEX_4V;
defm VPSIGNW : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PSIGN, 0>, VEX_4V;
defm VPSIGND : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PSIGN, 0>, VEX_4V;
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
- memopv2i64, i128mem,
+ loadv2i64, i128mem,
SSE_PSHUFB, 0>, VEX_4V;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
@@ -5310,28 +5259,28 @@ defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
- memopv4i64, i256mem,
+ loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
@@ -5389,7 +5338,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
+ [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>;
let mayLoad = 1 in
def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
@@ -5397,7 +5346,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
@@ -5489,16 +5438,17 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//
-multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ OpndItins itins = DEFAULT_ITINS> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+ [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
- OpSize;
+ (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
+ itins.rm>, OpSize;
}
multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
@@ -5509,22 +5459,23 @@ multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize;
+ [(set VR256:$dst, (IntId (load addr:$src)))]>,
+ OpSize;
}
let Predicates = [HasAVX] in {
-defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
- VEX;
-defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
- VEX;
-defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
- VEX;
-defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
- VEX;
-defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
- VEX;
-defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
- VEX;
+defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
+ int_x86_sse41_pmovsxbw>, VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
+ int_x86_sse41_pmovsxwd>, VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
+ int_x86_sse41_pmovsxdq>, VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
+ int_x86_sse41_pmovzxbw>, VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
+ int_x86_sse41_pmovzxwd>, VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
+ int_x86_sse41_pmovzxdq>, VEX;
}
let Predicates = [HasAVX2] in {
@@ -5542,12 +5493,12 @@ defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
int_x86_avx2_pmovzxdq>, VEX, VEX_L;
}
-defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
-defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
-defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
-defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
-defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
-defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, SSE_INTALU_ITINS_P>;
+defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, SSE_INTALU_ITINS_P>;
+defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, SSE_INTALU_ITINS_P>;
+defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, SSE_INTALU_ITINS_P>;
+defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, SSE_INTALU_ITINS_P>;
+defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, SSE_INTALU_ITINS_P>;
let Predicates = [HasAVX] in {
// Common patterns involving scalar load.
@@ -5645,32 +5596,39 @@ let Predicates = [HasAVX2] in {
(VPMOVZXDQYrr VR128:$src)>;
def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
(VPMOVZXWDYrr VR128:$src)>;
+ def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))),
+ (VPMOVZXBWYrr VR128:$src)>;
}
def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+ def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
}
let Predicates = [HasAVX] in {
def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+ def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
}
let Predicates = [UseSSE41] in {
def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+ def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
}
-multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ OpndItins itins = DEFAULT_ITINS> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+ [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+ (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
+ itins.rm>,
OpSize;
}
@@ -5709,10 +5667,14 @@ defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
int_x86_avx2_pmovzxwq>, VEX, VEX_L;
}
-defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
-defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
-defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
-defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
+ SSE_INTALU_ITINS_P>;
+defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
+ SSE_INTALU_ITINS_P>;
+defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
+ SSE_INTALU_ITINS_P>;
+defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
+ SSE_INTALU_ITINS_P>;
let Predicates = [HasAVX] in {
// Common patterns involving scalar load
@@ -5740,7 +5702,8 @@ let Predicates = [UseSSE41] in {
(PMOVZXWQrm addr:$src)>;
}
-multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ OpndItins itins = DEFAULT_ITINS> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
@@ -5779,8 +5742,10 @@ defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
int_x86_avx2_pmovzxbq>, VEX, VEX_L;
}
-defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
-defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
+ SSE_INTALU_ITINS_P>;
+defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
+ SSE_INTALU_ITINS_P>;
let Predicates = [HasAVX2] in {
def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
@@ -6032,35 +5997,39 @@ let Predicates = [UseSSE41] in {
/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
- def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
(ins VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+ imm:$src2))]>,
OpSize;
let neverHasSideEffects = 1, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in
defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
- def VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
- (ins VR128:$src1, i32i8imm:$src2),
- "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
-}
defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+ let isCodeGenOnly = 1, hasSideEffects = 0 in
+ def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, OpSize;
+
let neverHasSideEffects = 1, mayStore = 1 in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
@@ -6122,31 +6091,28 @@ defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
-multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
- def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+ OpndItins itins = DEFAULT_ITINS> {
+ def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
(ins VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set GR32:$dst,
- (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+ [(set GR32orGR64:$dst,
+ (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+ itins.rr>,
OpSize;
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
- addr:$dst)]>, OpSize;
+ addr:$dst)], itins.rm>, OpSize;
}
let ExeDomain = SSEPackedSingle in {
- let Predicates = [UseAVX] in {
+ let Predicates = [UseAVX] in
defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
- def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
- (ins VR128:$src1, i32i8imm:$src2),
- "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, OpSize, VEX;
- }
- defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
+ defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
@@ -6167,13 +6133,13 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+ (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+ (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6246,7 +6212,8 @@ let Constraints = "$src1 = $dst" in
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
-multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
!if(Is2Addr,
@@ -6254,7 +6221,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+ (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
OpSize;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
@@ -6265,14 +6232,14 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
[(set VR128:$dst,
(X86insrtps VR128:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))]>, OpSize;
+ imm:$src3))], itins.rm>, OpSize;
}
let ExeDomain = SSEPackedSingle in {
- let Predicates = [HasAVX] in
+ let Predicates = [UseAVX] in
defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
- defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+ defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
//===----------------------------------------------------------------------===//
@@ -6290,7 +6257,8 @@ let ExeDomain = SSEPackedSingle in {
(outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
+ [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>,
OpSize;
// Vector intrinsic operation, mem
@@ -6299,7 +6267,8 @@ let ExeDomain = SSEPackedSingle in {
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+ (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+ IIC_SSE_ROUNDPS_MEM>,
OpSize;
} // ExeDomain = SSEPackedSingle
@@ -6309,7 +6278,8 @@ let ExeDomain = SSEPackedDouble in {
(outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
+ [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>,
OpSize;
// Vector intrinsic operation, mem
@@ -6318,7 +6288,8 @@ let ExeDomain = SSEPackedDouble in {
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+ (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+ IIC_SSE_ROUNDPS_REG>,
OpSize;
} // ExeDomain = SSEPackedDouble
}
@@ -6402,11 +6373,11 @@ let ExeDomain = GenericDomain in {
let Predicates = [HasAVX] in {
// Intrinsic form
defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
- memopv4f32, memopv2f64,
+ loadv4f32, loadv2f64,
int_x86_sse41_round_ps,
int_x86_sse41_round_pd>, VEX;
defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
- memopv8f32, memopv4f64,
+ loadv8f32, loadv4f64,
int_x86_avx_round_ps_256,
int_x86_avx_round_pd_256>, VEX, VEX_L;
defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
@@ -6544,7 +6515,7 @@ def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
OpSize, VEX;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+ [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
OpSize, VEX;
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
@@ -6553,7 +6524,7 @@ def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
OpSize, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
+ [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
OpSize, VEX, VEX_L;
}
@@ -6582,13 +6553,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
-defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
VEX_L;
}
let ExeDomain = SSEPackedDouble in {
-defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
VEX_L;
}
}
@@ -6600,30 +6571,33 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>,
OpSize, XS;
def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"popcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctpop (loadi16 addr:$src))),
- (implicit EFLAGS)]>, OpSize, XS;
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS;
def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>,
XS;
def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"popcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctpop (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS;
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+ IIC_SSE_POPCNT_RR>,
XS;
def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"popcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctpop (loadi64 addr:$src))),
- (implicit EFLAGS)]>, XS;
+ (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
}
@@ -6651,14 +6625,16 @@ defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, bit Is2Addr = 1> {
+ Intrinsic IntId128, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
let isCommutable = 1 in
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
+ itins.rr>, OpSize;
def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
@@ -6666,7 +6642,8 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
(IntId128 VR128:$src1,
- (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+ (bitconvert (memopv2i64 addr:$src2))))],
+ itins.rm>, OpSize;
}
/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
@@ -6682,14 +6659,15 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(IntId256 VR256:$src1,
- (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+ (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
}
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1> {
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
let isCommutable = 1 in
def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
@@ -6712,21 +6690,21 @@ let Predicates = [HasAVX] in {
defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
0>, VEX_4V;
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
0>, VEX_4V;
}
@@ -6736,21 +6714,21 @@ let Predicates = [HasAVX2] in {
defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
int_x86_avx2_packusdw>, VEX_4V, VEX_L;
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
}
@@ -6759,22 +6737,23 @@ let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in
defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
- memopv2i64, i128mem>;
- defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>;
+ memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+ defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq,
+ 1, SSE_INTMUL_ITINS_P>;
}
let Predicates = [HasAVX] in {
@@ -6792,15 +6771,16 @@ let Predicates = [HasAVX2] in {
let Constraints = "$src1 = $dst" in {
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1> {
+ X86MemOperand x86memop, bit Is2Addr = 1,
+ OpndItins itins = DEFAULT_ITINS> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u32u8imm:$src3),
@@ -6809,7 +6789,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
OpSize;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
@@ -6820,7 +6800,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set RC:$dst,
(IntId RC:$src1,
- (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
OpSize;
}
@@ -6828,40 +6808,40 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
- VR128, memopv4f32, f128mem, 0>, VEX_4V;
+ VR128, loadv4f32, f128mem, 0>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
- int_x86_avx_blend_ps_256, VR256, memopv8f32,
+ int_x86_avx_blend_ps_256, VR256, loadv8f32,
f256mem, 0>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
- VR128, memopv2f64, f128mem, 0>, VEX_4V;
+ VR128, loadv2f64, f128mem, 0>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
- int_x86_avx_blend_pd_256,VR256, memopv4f64,
+ int_x86_avx_blend_pd_256,VR256, loadv4f64,
f256mem, 0>, VEX_4V, VEX_L;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
- VR128, memopv2i64, i128mem, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, 0>, VEX_4V;
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
- VR128, memopv2i64, i128mem, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, 0>, VEX_4V;
}
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
- VR128, memopv4f32, f128mem, 0>, VEX_4V;
+ VR128, loadv4f32, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
- VR128, memopv2f64, f128mem, 0>, VEX_4V;
+ VR128, loadv2f64, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
- VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
+ VR256, loadv8f32, i256mem, 0>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
- VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
- VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
}
}
@@ -6869,21 +6849,27 @@ let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
let ExeDomain = SSEPackedSingle in
defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
- VR128, memopv4f32, f128mem>;
+ VR128, memopv4f32, f128mem,
+ 1, SSE_INTALU_ITINS_P>;
let ExeDomain = SSEPackedDouble in
defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
- VR128, memopv2f64, f128mem>;
+ VR128, memopv2f64, f128mem,
+ 1, SSE_INTALU_ITINS_P>;
defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ 1, SSE_INTALU_ITINS_P>;
defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
- VR128, memopv2i64, i128mem>;
+ VR128, memopv2i64, i128mem,
+ 1, SSE_INTMUL_ITINS_P>;
}
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
- VR128, memopv4f32, f128mem>;
+ VR128, memopv4f32, f128mem, 1,
+ SSE_DPPS_ITINS>;
let ExeDomain = SSEPackedDouble in
defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
- VR128, memopv2f64, f128mem>;
+ VR128, memopv2f64, f128mem, 1,
+ SSE_DPPD_ITINS>;
}
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
@@ -6910,23 +6896,23 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
- memopv2f64, int_x86_sse41_blendvpd>;
+ loadv2f64, int_x86_sse41_blendvpd>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+ loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
- memopv4f32, int_x86_sse41_blendvps>;
+ loadv4f32, int_x86_sse41_blendvps>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+ loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
- memopv2i64, int_x86_sse41_pblendvb>;
+ loadv2i64, int_x86_sse41_pblendvb>;
}
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
+ loadv4i64, int_x86_avx2_pblendvb>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -6979,7 +6965,7 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
(v32i8 VR256:$src2))),
- (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+ (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
(imm:$mask))),
(VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
@@ -6988,13 +6974,14 @@ let Predicates = [HasAVX2] in {
/// SS41I_ternary_int - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- X86MemOperand x86memop, Intrinsic IntId> {
+ X86MemOperand x86memop, Intrinsic IntId,
+ OpndItins itins = DEFAULT_ITINS> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $dst|$dst, $src2}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
- OpSize;
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+ itins.rr>, OpSize;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src2),
@@ -7002,7 +6989,8 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
"\t{$src2, $dst|$dst, $src2}"),
[(set VR128:$dst,
(IntId VR128:$src1,
- (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize;
+ (bitconvert (mem_frag addr:$src2)), XMM0))],
+ itins.rm>, OpSize;
}
}
@@ -7099,11 +7087,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
- memopv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, 0>, VEX_4V;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -7263,70 +7251,100 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
// crc intrinsic instruction
// This set of instructions is only rm; the only difference is the size
// of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+ RegisterClass RCIn, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+ X86MemOperand x86memop, SDPatternOperator Int> :
+ SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+ !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+ [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+ IIC_CRC32_MEM>;
+
let Constraints = "$src1 = $dst" in {
- def CRC32r32m8 : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$src1, i8mem:$src2),
- "crc32{b}\t{$src2, $src1|$src1, $src2}",
- [(set GR32:$dst,
- (int_x86_sse42_crc32_32_8 GR32:$src1,
- (load addr:$src2)))]>;
- def CRC32r32r8 : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
- (ins GR32:$src1, GR8:$src2),
- "crc32{b}\t{$src2, $src1|$src1, $src2}",
- [(set GR32:$dst,
- (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
- def CRC32r32m16 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$src1, i16mem:$src2),
- "crc32{w}\t{$src2, $src1|$src1, $src2}",
- [(set GR32:$dst,
- (int_x86_sse42_crc32_32_16 GR32:$src1,
- (load addr:$src2)))]>,
- OpSize;
- def CRC32r32r16 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
- (ins GR32:$src1, GR16:$src2),
- "crc32{w}\t{$src2, $src1|$src1, $src2}",
- [(set GR32:$dst,
- (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
- OpSize;
- def CRC32r32m32 : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$src1, i32mem:$src2),
- "crc32{l}\t{$src2, $src1|$src1, $src2}",
- [(set GR32:$dst,
- (int_x86_sse42_crc32_32_32 GR32:$src1,
- (load addr:$src2)))]>;
- def CRC32r32r32 : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
- (ins GR32:$src1, GR32:$src2),
- "crc32{l}\t{$src2, $src1|$src1, $src2}",
- [(set GR32:$dst,
- (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
- def CRC32r64m8 : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$src1, i8mem:$src2),
- "crc32{b}\t{$src2, $src1|$src1, $src2}",
- [(set GR64:$dst,
- (int_x86_sse42_crc32_64_8 GR64:$src1,
- (load addr:$src2)))]>,
- REX_W;
- def CRC32r64r8 : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
- (ins GR64:$src1, GR8:$src2),
- "crc32{b}\t{$src2, $src1|$src1, $src2}",
- [(set GR64:$dst,
- (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
- REX_W;
- def CRC32r64m64 : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$src1, i64mem:$src2),
- "crc32{q}\t{$src2, $src1|$src1, $src2}",
- [(set GR64:$dst,
- (int_x86_sse42_crc32_64_64 GR64:$src1,
- (load addr:$src2)))]>,
- REX_W;
- def CRC32r64r64 : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
- (ins GR64:$src1, GR64:$src2),
- "crc32{q}\t{$src2, $src1|$src1, $src2}",
- [(set GR64:$dst,
- (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
- REX_W;
+ def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+ int_x86_sse42_crc32_32_8>;
+ def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+ int_x86_sse42_crc32_32_16>, OpSize;
+ def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+ int_x86_sse42_crc32_32_16>, OpSize;
+ def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+ int_x86_sse42_crc32_32_32>;
+ def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+ int_x86_sse42_crc32_32_32>;
+ def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+ int_x86_sse42_crc32_64_64>, REX_W;
+ let hasSideEffects = 0 in {
+ let mayLoad = 1 in
+ def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+ null_frag>, REX_W;
+ def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+ null_frag>, REX_W;
+ }
+}
+
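For context, a minimal C sketch (not part of this patch) of how the crc32 definitions above are normally reached from source: the SSE4.2 intrinsics in <nmmintrin.h> lower to the int_x86_sse42_crc32_* intrinsics that SS42I_crc32r/SS42I_crc32m match. The helper names below are hypothetical, and the instruction selections noted in the comments are the expected ones rather than guaranteed.

#include <stddef.h>
#include <stdint.h>
#include <nmmintrin.h>   /* SSE4.2 CRC32 intrinsics; build with -msse4.2 */

/* Hypothetical helper: accumulate a CRC32-C over a byte buffer.
   Each _mm_crc32_u8 call is expected to select CRC32r32r8 (or the
   CRC32r32m8 form when the byte operand is folded from memory). */
static uint32_t crc32c_bytes(uint32_t crc, const uint8_t *p, size_t n) {
  for (size_t i = 0; i < n; ++i)
    crc = _mm_crc32_u8(crc, p[i]);
  return crc;
}

/* 64-bit form, expected to select CRC32r64r64 (the REX_W encoding above). */
static uint64_t crc32c_u64(uint64_t crc, uint64_t v) {
  return _mm_crc32_u64(crc, v);
}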
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+ bit UsesXMM0 = 0> {
+ def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+ def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ [!if(UsesXMM0,
+ (set VR128:$dst, (IntId VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+ (set VR128:$dst, (IntId VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+ def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+ (i8 imm:$src3)))]>, TA;
+ def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_sha1rnds4 VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (i8 imm:$src3)))]>, TA;
+
+ defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+ defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+ defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+ let Uses=[XMM0] in
+ defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+ defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+ defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}
+// Aliases with explicit %xmm0
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+ (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+
//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//
@@ -7383,7 +7401,7 @@ let Predicates = [HasAVX, HasAES] in {
def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
- [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+ [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
OpSize, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7410,7 +7428,7 @@ let Predicates = [HasAVX, HasAES] in {
(ins i128mem:$src1, i8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+ (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
OpSize, VEX;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -7441,7 +7459,7 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
- (memopv2i64 addr:$src2), imm:$src3))]>;
+ (loadv2i64 addr:$src2), imm:$src3))]>;
// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
@@ -7449,13 +7467,15 @@ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RR>;
def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
"pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
- (memopv2i64 addr:$src2), imm:$src3))]>;
+ (memopv2i64 addr:$src2), imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RM>;
} // Constraints = "$src1 = $dst"
@@ -7595,11 +7615,11 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7623,22 +7643,22 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
- (bc_v4i32 (memopv2i64 addr:$src2)),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
- (bc_v16i8 (memopv2i64 addr:$src2)),
+ (bc_v16i8 (loadv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
- (bc_v8i16 (memopv2i64 addr:$src2)),
+ (bc_v8i16 (loadv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7670,12 +7690,12 @@ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v4f64 VR256:$src1),
(EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTF128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
}
@@ -7785,15 +7805,15 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
- memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+ loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+ loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
- memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+ loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+ loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -7801,15 +7821,15 @@ def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
(VPERMILPSYri VR256:$src1, imm:$imm)>;
def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
(VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
+def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
(i8 imm:$imm))),
(VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
(VPERMILPDYmi addr:$src1, imm:$imm)>;
def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
(VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
(VPERMILPDmi addr:$src1, imm:$imm)>;
}
@@ -7825,7 +7845,7 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
(i8 imm:$src3)))]>, VEX_4V, VEX_L;
}
@@ -7833,7 +7853,7 @@ let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
- (memopv4f64 addr:$src2), (i8 imm:$imm))),
+ (loadv4f64 addr:$src2), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
@@ -7848,16 +7868,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
- (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
- (memopv4i64 addr:$src2), (i8 imm:$imm))),
+ (loadv4i64 addr:$src2), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
- (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
- (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
@@ -7901,7 +7921,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
TA, OpSize, VEX;
}
-let Predicates = [HasAVX, HasF16C] in {
+let Predicates = [HasF16C] in {
defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
@@ -7935,9 +7955,9 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
let isCommutable = 0 in {
defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
- VR128, memopv2i64, i128mem>;
+ VR128, loadv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
- VR256, memopv4i64, i256mem>, VEX_L;
+ VR256, loadv4i64, i256mem>, VEX_L;
}
def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
@@ -8115,9 +8135,9 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
VEX_4V, VEX_L;
}
-defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
ValueType OpVT> {
@@ -8137,9 +8157,9 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
(i8 imm:$src2))))]>, VEX, VEX_L;
}
-defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
let ExeDomain = SSEPackedDouble in
-defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
@@ -8152,7 +8172,7 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
(i8 imm:$src3)))]>, VEX_4V, VEX_L;
let Predicates = [HasAVX2] in {
@@ -8163,13 +8183,13 @@ def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
(i8 imm:$imm))),
(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
- (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
(i8 imm:$imm))),
(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
@@ -8208,22 +8228,22 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
-def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
- (bc_v4i32 (memopv2i64 addr:$src2)),
+ (bc_v4i32 (loadv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
- (bc_v16i8 (memopv2i64 addr:$src2)),
+ (bc_v16i8 (loadv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
- (bc_v8i16 (memopv2i64 addr:$src2)),
+ (bc_v8i16 (loadv2i64 addr:$src2)),
(iPTR imm)),
(VINSERTI128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -8262,20 +8282,20 @@ def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
(v32i8 VR256:$src1),
(EXTRACT_get_vextract128_imm VR128:$ext)))>;
-def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
- (iPTR imm))), addr:$dst),
+def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
(VEXTRACTI128mr addr:$dst, VR256:$src1,
(EXTRACT_get_vextract128_imm VR128:$ext))>;
}
@@ -8333,7 +8353,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1,
- (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>,
+ (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
VEX_4V;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
@@ -8346,7 +8366,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
- (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
+ (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
VEX_4V, VEX_L;
}
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 2aa08fa..2b6ee5c 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -20,23 +20,21 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
}
-let isAsmParserOnly = 1 in {
- defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
- defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
- defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
- defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
- defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
- defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
- defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
- defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
- defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
- defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
- defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
- defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
- defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
- defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
- defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
-}
+defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
// Scalar load 2 addr operand instructions
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
@@ -49,12 +47,10 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
[(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
}
-let isAsmParserOnly = 1 in {
- defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
- ssmem, sse_load_f32>;
- defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
- sdmem, sse_load_f64>;
-}
+defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+ ssmem, sse_load_f32>;
+defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+ sdmem, sse_load_f64>;
multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
@@ -66,10 +62,8 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
[(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
}
-let isAsmParserOnly = 1 in {
- defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
- defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
-}
+defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
@@ -81,12 +75,8 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
[(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
}
-let isAsmParserOnly = 1 in {
- defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
- memopv8f32>;
- defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
- memopv4f64>;
-}
+defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>;
+defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>;
multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
@@ -107,20 +97,18 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
VEX_4VOp3;
}
-let isAsmParserOnly = 1 in {
- defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
- defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
- defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
- defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
- defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
- defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
- defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
- defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
- defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
- defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
- defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
- defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
-}
+defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
@@ -134,12 +122,10 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
(Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
}
-let isAsmParserOnly = 1 in {
- defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
- defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
- defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
- defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
-}
+defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
// Instruction where second source can be memory, but third must be register
multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -158,20 +144,18 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
VR128:$src3))]>, VEX_4V, VEX_I8IMM;
}
-let isAsmParserOnly = 1 in {
- defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
- defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
- defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
- defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
- defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
- defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
- defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
- defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
- defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
- defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
- defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
- defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
-}
+defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
// Instruction where second source can be memory, third must be imm8
multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -190,16 +174,14 @@ multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
imm:$src3))]>, VEX_4V;
}
-let isAsmParserOnly = 1 in {
- defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
- defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
- defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
- defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
- defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
- defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
- defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
- defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
-}
+defm VPCOMB : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+defm VPCOMW : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+defm VPCOMD : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+defm VPCOMQ : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
// Instruction where either second or third source can be memory
multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -227,10 +209,8 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
VEX_4V, VEX_I8IMM;
}
-let isAsmParserOnly = 1 in {
- defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
- defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
-}
+defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
@@ -257,9 +237,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
VEX_4V, VEX_I8IMM, VEX_L;
}
-let isAsmParserOnly = 1 in {
- defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
-}
+defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index fc86e1e..e99f2d9 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -127,7 +127,7 @@ extern "C" {
"movaps %xmm6, 96(%rsp)\n"
"movaps %xmm7, 112(%rsp)\n"
// JIT callee
-#ifdef _WIN64
+#if defined(_WIN64) || defined(__CYGWIN__)
"subq $32, %rsp\n"
"movq %rbp, %rcx\n" // Pass prev frame and return address
"movq 8(%rbp), %rdx\n"
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index c7c00b5..6649c82 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -17,6 +17,7 @@
#include "X86COFFMachineModuleInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -34,14 +35,12 @@ namespace {
/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
class X86MCInstLower {
MCContext &Ctx;
- Mangler *Mang;
const MachineFunction &MF;
const TargetMachine &TM;
const MCAsmInfo &MAI;
X86AsmPrinter &AsmPrinter;
public:
- X86MCInstLower(Mangler *mang, const MachineFunction &MF,
- X86AsmPrinter &asmprinter);
+ X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
@@ -50,13 +49,16 @@ public:
private:
MachineModuleInfoMachO &getMachOMMI() const;
+ Mangler *getMang() const {
+ return AsmPrinter.Mang;
+ }
};
} // end anonymous namespace
-X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
X86AsmPrinter &asmprinter)
-: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
+: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()),
MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
@@ -81,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
isImplicitlyPrivate = true;
- Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+ getMang()->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
} else if (MO.isSymbol()) {
Name += MAI.getGlobalPrefix();
Name += MO.getSymbolName();
@@ -110,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym =
MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(MO.getGlobal()),
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
return Sym;
@@ -124,7 +126,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym =
MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(MO.getGlobal()),
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
return Sym;
@@ -140,7 +142,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
if (MO.isGlobal()) {
StubSym =
MachineModuleInfoImpl::
- StubValueTy(Mang->getSymbol(MO.getGlobal()),
+ StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
} else {
Name.erase(Name.end()-5, Name.end());
@@ -591,18 +593,6 @@ ReSimplify:
case X86::MOVSX64rr32:
SimplifyMOVSX(OutMI);
break;
-
- case X86::MORESTACK_RET:
- OutMI.setOpcode(X86::RET);
- break;
-
- case X86::MORESTACK_RET_RESTORE_R10:
- OutMI.setOpcode(X86::MOV64rr);
- OutMI.addOperand(MCOperand::CreateReg(X86::R10));
- OutMI.addOperand(MCOperand::CreateReg(X86::RAX));
-
- AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
- break;
}
}
@@ -685,8 +675,140 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
.addExpr(tlsRef));
}
+static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+parseMemoryOperand(StackMaps::Location::LocationType LocTy, unsigned Size,
+ MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE) {
+
+ typedef StackMaps::Location Location;
+
+ assert(std::distance(MOI, MOE) >= 5 && "Too few operands to encode mem op.");
+
+ const MachineOperand &Base = *MOI;
+ const MachineOperand &Scale = *(++MOI);
+ const MachineOperand &Index = *(++MOI);
+ const MachineOperand &Disp = *(++MOI);
+ const MachineOperand &ZeroReg = *(++MOI);
+
+ // Sanity check for supported operand format.
+ assert(Base.isReg() &&
+ Scale.isImm() && Scale.getImm() == 1 &&
+ Index.isReg() && Index.getReg() == 0 &&
+ Disp.isImm() && ZeroReg.isReg() && (ZeroReg.getReg() == 0) &&
+ "Unsupported x86 memory operand sequence.");
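+  // That is, only a plain [BaseReg + Disp] address is expected here; any
+  // scaled-index form would trip the assertion above.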
+ (void)Scale;
+ (void)Index;
+ (void)ZeroReg;
+
+ return std::make_pair(
+ Location(LocTy, Size, Base.getReg(), Disp.getImm()), ++MOI);
+}
+
+std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+X86AsmPrinter::stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE,
+ const TargetMachine &TM) {
+
+ typedef StackMaps::Location Location;
+
+ const MachineOperand &MOP = *MOI;
+ assert(!MOP.isRegMask() && (!MOP.isReg() || !MOP.isImplicit()) &&
+ "Register mask and implicit operands should not be processed.");
+
+ if (MOP.isImm()) {
+ // Verify anyregcc
+ // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+
+ switch (MOP.getImm()) {
+ default: llvm_unreachable("Unrecognized operand type.");
+ case StackMaps::DirectMemRefOp: {
+ unsigned Size = TM.getDataLayout()->getPointerSizeInBits();
+ assert((Size % 8) == 0 && "Need pointer size in bytes.");
+ Size /= 8;
+ return parseMemoryOperand(StackMaps::Location::Direct, Size,
+ llvm::next(MOI), MOE);
+ }
+ case StackMaps::IndirectMemRefOp: {
+ ++MOI;
+ int64_t Size = MOI->getImm();
+ assert(Size > 0 && "Need a valid size for indirect memory locations.");
+ return parseMemoryOperand(StackMaps::Location::Indirect, Size,
+ llvm::next(MOI), MOE);
+ }
+ case StackMaps::ConstantOp: {
+ ++MOI;
+ assert(MOI->isImm() && "Expected constant operand.");
+ int64_t Imm = MOI->getImm();
+ return std::make_pair(
+ Location(Location::Constant, sizeof(int64_t), 0, Imm), ++MOI);
+ }
+ }
+ }
+
+ // Otherwise this is a reg operand. The physical register number will
+ // ultimately be encoded as a DWARF regno. The stack map also records the size
+ // of a spill slot that can hold the register content. (The runtime can
+ // track the actual size of the data type if it needs to.)
+ assert(MOP.isReg() && "Expected register operand here.");
+ assert(TargetRegisterInfo::isPhysicalRegister(MOP.getReg()) &&
+ "Virtreg operands should have been rewritten before now.");
+ const TargetRegisterClass *RC =
+ TM.getRegisterInfo()->getMinimalPhysRegClass(MOP.getReg());
+ assert(!MOP.getSubReg() && "Physical subreg still around.");
+ return std::make_pair(
+ Location(Location::Register, RC->getSize(), MOP.getReg(), 0), ++MOI);
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+static void LowerSTACKMAP(MCStreamer &OutStreamer,
+ StackMaps &SM,
+ const MachineInstr &MI)
+{
+ unsigned NumNOPBytes = MI.getOperand(1).getImm();
+ SM.recordStackMap(MI);
+ // Emit padding.
+ // FIXME: These nops ensure that the stackmap's shadow is covered by
+ // instructions from the same basic block, but the nops should not be
+ // necessary if instructions from the same block follow the stackmap.
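+  // X86::NOOP is the single-byte 0x90 nop, so the loop below emits exactly
+  // NumNOPBytes bytes of padding.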
+ for (unsigned i = 0; i < NumNOPBytes; ++i)
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+static void LowerPATCHPOINT(MCStreamer &OutStreamer,
+ StackMaps &SM,
+ const MachineInstr &MI) {
+ SM.recordPatchPoint(MI);
+
+ PatchPointOpers opers(&MI);
+ unsigned ScratchIdx = opers.getNextScratchIdx();
+ unsigned EncodedBytes = 0;
+ int64_t CallTarget = opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+ if (CallTarget) {
+ // Emit MOV to materialize the target address and the CALL to target.
+ // This is encoded with 12-13 bytes, depending on which register is used.
+  // We conservatively assume that it is 12 bytes and, in the worst case,
+  // emit one extra NOP byte.
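+  // (MOV64ri with a 64-bit immediate is 10 bytes: REX.W, opcode, imm64;
+  // CALL64r is 2 bytes, or 3 when the scratch register needs a REX prefix.)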
+ EncodedBytes = 12;
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri)
+ .addReg(MI.getOperand(ScratchIdx).getReg())
+ .addImm(CallTarget));
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r)
+ .addReg(MI.getOperand(ScratchIdx).getReg()));
+ }
+ // Emit padding.
+ unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+ assert(NumBytes >= EncodedBytes &&
+ "Patchpoint can't request size less than the length of a call.");
+
+ for (unsigned i = EncodedBytes; i < NumBytes; ++i)
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+}
+
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- X86MCInstLower MCInstLowering(Mang, *MF, *this);
+ X86MCInstLower MCInstLowering(*MF, *this);
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
@@ -774,6 +896,24 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addExpr(DotExpr));
return;
}
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(OutStreamer, SM, *MI);
+
+ case X86::MORESTACK_RET:
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+ return;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ // Return, then restore R10.
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+ OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64rr)
+ .addReg(X86::R10)
+ .addReg(X86::RAX));
+ return;
}
MCInst TmpInst;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 0923310..dbda556 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -101,8 +101,8 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // Only enable when post-RA scheduling is enabled and this is needed.
- return TM.getSubtargetImpl()->postRAScheduler();
+ // ExeDepsFixer and PostRAScheduler require liveness.
+ return true;
}
int
@@ -239,6 +239,11 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::HiPE:
return CSR_NoRegs_SaveList;
+ case CallingConv::WebKit_JS:
+ return CSR_64_SaveList;
+ case CallingConv::AnyReg:
+ return CSR_MostRegs_64_SaveList;
+
case CallingConv::Intel_OCL_BI: {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
@@ -296,6 +301,8 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
}
if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
return CSR_NoRegs_RegMask;
+ if (CC == CallingConv::WebKit_JS || CC == CallingConv::AnyReg)
+ return CSR_MostRegs_64_RegMask;
if (!Is64Bit)
return CSR_32_RegMask;
if (CC == CallingConv::Cold)
@@ -512,14 +519,6 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return TFI->hasFP(MF) ? FramePtr : StackPtr;
}
-unsigned X86RegisterInfo::getEHExceptionRegister() const {
- llvm_unreachable("What is the exception register");
-}
-
-unsigned X86RegisterInfo::getEHHandlerRegister() const {
- llvm_unreachable("What is the exception handler register");
-}
-
namespace llvm {
unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
bool High) {
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index fb17682..22251b2 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -126,10 +126,6 @@ public:
unsigned getBaseRegister() const { return BasePtr; }
// FIXME: Move to FrameInfo
unsigned getSlotSize() const { return SlotSize; }
-
- // Exception handling queries.
- unsigned getEHExceptionRegister() const;
- unsigned getEHHandlerRegister() const;
};
// getX86SubSuperRegister - X86 utility function. It returns the sub or super
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 62ba2bc..9748261 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -19,6 +19,10 @@ def HaswellModel : SchedMachineModel {
let MicroOpBufferSize = 192; // Based on the reorder buffer.
let LoadLatency = 4;
let MispredictPenalty = 16;
+
+ // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
}
let SchedModel = HaswellModel in {
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index 52ead94..3011c6d 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -20,6 +20,10 @@ def SandyBridgeModel : SchedMachineModel {
let MicroOpBufferSize = 168; // Based on the reorder buffer.
let LoadLatency = 4;
let MispredictPenalty = 16;
+
+ // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+ // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0;
}
let SchedModel = SandyBridgeModel in {
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index ceb2e05..0556437 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -141,9 +141,12 @@ def IIC_IDIV64 : InstrItinClass;
// neg/not/inc/dec
def IIC_UNARY_REG : InstrItinClass;
def IIC_UNARY_MEM : InstrItinClass;
-// add/sub/and/or/xor/adc/sbc/cmp/test
+// add/sub/and/or/xor/cmp/test
def IIC_BIN_MEM : InstrItinClass;
def IIC_BIN_NONMEM : InstrItinClass;
+// adc/sbb
+def IIC_BIN_CARRY_MEM : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM : InstrItinClass;
// shift/rotate
def IIC_SR : InstrItinClass;
// shift double
@@ -250,11 +253,11 @@ def IIC_SSE_INTSH_P_RR : InstrItinClass;
def IIC_SSE_INTSH_P_RM : InstrItinClass;
def IIC_SSE_INTSH_P_RI : InstrItinClass;
-def IIC_SSE_CMPP_RR : InstrItinClass;
-def IIC_SSE_CMPP_RM : InstrItinClass;
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
def IIC_SSE_SHUFP : InstrItinClass;
-def IIC_SSE_PSHUF : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
def IIC_SSE_UNPCK : InstrItinClass;
@@ -316,7 +319,8 @@ def IIC_SSE_PSIGN_RM : InstrItinClass;
def IIC_SSE_PMADD : InstrItinClass;
def IIC_SSE_PMULHRSW : InstrItinClass;
-def IIC_SSE_PALIGNR : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
def IIC_SSE_MWAIT : InstrItinClass;
def IIC_SSE_MONITOR : InstrItinClass;
@@ -492,8 +496,8 @@ def IIC_PUSH_REG : InstrItinClass;
def IIC_PUSH_F : InstrItinClass;
def IIC_PUSH_A : InstrItinClass;
def IIC_BSWAP : InstrItinClass;
-def IIC_BSF : InstrItinClass;
-def IIC_BSR : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
def IIC_MOVS : InstrItinClass;
def IIC_STOS : InstrItinClass;
def IIC_SCAS : InstrItinClass;
@@ -540,6 +544,33 @@ def IIC_BOUND : InstrItinClass;
def IIC_ARPL_REG : InstrItinClass;
def IIC_ARPL_MEM : InstrItinClass;
def IIC_MOVBE : InstrItinClass;
+def IIC_AES : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
def IIC_NOP : InstrItinClass;
@@ -561,7 +592,7 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
-// The GenericModel contains no instruciton itineraries.
+// The GenericModel contains no instruction itineraries.
def GenericModel : SchedMachineModel {
let IssueWidth = 4;
let MicroOpBufferSize = 32;
@@ -572,3 +603,4 @@ def GenericModel : SchedMachineModel {
include "X86ScheduleAtom.td"
include "X86SchedSandyBridge.td"
include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 14a1471..ba72f29 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines the itinerary class data for the Intel Atom (Bonnell)
-// processors.
+// This file defines the itinerary class data for the Intel Atom
+// in-order (Saltwell-32nm/Bonnell-45nm) processors.
//
//===----------------------------------------------------------------------===//
@@ -79,9 +79,12 @@ def AtomItineraries : ProcessorItineraries<
// neg/not/inc/dec
InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
- // add/sub/and/or/xor/adc/sbc/cmp/test
+ // add/sub/and/or/xor/cmp/test
InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+  // adc/sbb
+ InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
// shift/rotate
InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
// shift double
@@ -203,11 +206,11 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >,
- InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
@@ -278,7 +281,8 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
- InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
@@ -470,8 +474,8 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
- InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
- InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+ InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
@@ -518,6 +522,8 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+ InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
]>;
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 0000000..6c2a304
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,668 @@
+//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// (Silvermont) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def IEC_RSV0 : FuncUnit;
+def IEC_RSV1 : FuncUnit;
+def FPC_RSV0 : FuncUnit;
+def FPC_RSV1 : FuncUnit;
+def MEC_RSV : FuncUnit;
+
+def SLMItineraries : ProcessorItineraries<
+ [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ],
+ [], [
+ // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>]
+ // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>]
+ // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>]
+ // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>]
+ //
+ // Default is 1 cycle, IEC_RSV0 or IEC_RSV1
+ //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // mul
+ InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ // imul by al, ax, eax, rax
+ InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<6, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<6, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ // imul reg by reg|mem
+ InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ // imul reg = reg/mem * imm
+ InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ // idiv - min latency
+ InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >,
+ // div - min latency
+ InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<25, [MEC_RSV]>] >,
+ InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >,
+ // neg/not/inc/dec
+ InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ // add/sub/and/or/xor/adc/sbc/cmp/test
+ InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ // adc/sbb
+ InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ // shift/rotate
+ InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ // shift double
+ InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+ InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+ InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+ InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+ InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+ InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+ InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ // cmov
+ InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+ // set
+ InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // jcc
+ InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // jcxz/jecxz/jrcxz
+ InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // jmp rel
+ InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // jmp indirect
+ InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ // jmp far
+ InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // loop/loope/loopne
+ InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LOOPNE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // call - all but reg/imm
+ InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ //ret
+ InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ //sign extension movs
+ InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ //zero extension movs
+ InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ // SSE binary operations
+ // arithmetic fp scalar
+ InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<13, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<13, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<6, [MEC_RSV]>] >,
+
+ // arithmetic fp parallel
+ InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<27, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<27, [MEC_RSV]>] >,
+
+ // bitwise parallel
+ InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ // arithmetic int parallel
+ InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+
+ // multiply int parallel
+ InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>,
+ InstrStage<5, [MEC_RSV]>] >,
+
+ // shift parallel
+ InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>,
+ InstrStage<2, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
+
+ InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
+
+ InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >,
+
+ InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>,
+ InstrStage<26, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>,
+ InstrStage<13, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>,
+ InstrStage<26, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>,
+ InstrStage<13, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 0>,
+ InstrStage<9, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >,
+ InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<6, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<9, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<9, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<9, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<5, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ // conversions
+ // to/from PD ...
+ InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<5, [MEC_RSV]>] >,
+ // to/from PS except to/from PD and PS2PI
+ InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+
+ // MMX MOVs
+ InstrItinData<IIC_MMX_MOV_MM_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // other MMX
+ InstrItinData<IIC_MMX_ALU_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_ALU_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PSADBW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PCK_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PCK_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PSHUF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PEXTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // conversions
+ // from/to PD
+ InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // from/to PI
+ InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_XADD_LOCK_MEM8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_FST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FIST, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_FLDZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FUCOM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FLDCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FFREE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_WAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FXAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FNOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FLDL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_F2XM1, [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FYL2X, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FPTAN, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FPATAN, [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FXTRACT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FPREM1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FPSTP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FPREM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FYL2XP1, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FSINCOS, [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FRNDINT, [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FSCALE, [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_FCOMPP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FXSAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FXRSTOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+ // System instructions
+ InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_INT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_INT3, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_INVD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IRET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_HLT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LXS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_RSM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SLDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_STR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_IN_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_IN_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_INS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // worst case for mov REG_CRx
+ InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // LAR
+ InstrItinData<IIC_LAR_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LAR_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // LSL
+ InstrItinData<IIC_LSL_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LSL_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // push CS and other segment registers
+ InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // pop segment registers
+ InstrItinData<IIC_POP_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // VERR, VERW
+ InstrItinData<IIC_VERR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // WRMSR, RDMSR
+ InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ // SMSW, LMSW
+ InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+ InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<10, [MEC_RSV]>] >,
+ InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<5, [MEC_RSV]>] >,
+ InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<5, [MEC_RSV]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<6, [MEC_RSV]>] >,
+ InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<6, [MEC_RSV]>] >,
+ InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_STI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >,
+ InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<10, [MEC_RSV]>] >,
+ InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+ InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+ InstrStage<3, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<12, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<15, [MEC_RSV]>] >,
+ InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<1, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<11, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<5, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+ InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>,
+ InstrStage<4, [MEC_RSV]>] >,
+ InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >,
+ InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>,
+ InstrStage<10, [MEC_RSV]>] >,
+
+ InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >
+ ]>;
+
+// Silvermont machine model.
+def SLMModel : SchedMachineModel {
+ let IssueWidth = 2; // Allows 2 instructions per scheduling group.
+ let MinLatency = 1; // InstrStage cycles override MinLatency.
+ // OperandCycles may be used for expected latency.
+ let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
+ let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+
+ let Itineraries = SLMItineraries;
+}
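
A model defined this way only takes effect once a CPU definition in X86.td selects it. A minimal TableGen sketch of that hookup (ProcessorModel comes from Target.td; the processor name and feature list below are illustrative assumptions, not taken from this patch):

  // Hypothetical CPU definition wiring up the new Silvermont model; the
  // itineraries above are picked up automatically through SLMModel.Itineraries.
  def : ProcessorModel<"slm", SLMModel, [FeatureSSE42, FeatureMOVBE,
                                         FeaturePOPCNT]>;
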
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index d1db79f..b9c620f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,8 +46,6 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
!ConstantSize ||
ConstantSize->getZExtValue() >
Subtarget->getMaxInlineSizeThreshold()) {
- SDValue InFlag(0, 0);
-
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index fae90f2..01353b2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -276,20 +276,29 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
(Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
(Family == 6 && Model == 0x2A) || // SandyBridge
(Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
- (Family == 6 && Model == 0x3A))) {// IvyBridge
+ (Family == 6 && Model == 0x3A) || // IvyBridge
+ (Family == 6 && Model == 0x3E) || // IvyBridge EP
+ (Family == 6 && Model == 0x3C) || // Haswell
+ (Family == 6 && Model == 0x3F) || // ...
+ (Family == 6 && Model == 0x45) || // ...
+ (Family == 6 && Model == 0x46))) { // ...
IsUAMemFast = true;
ToggleFeature(X86::FeatureFastUAMem);
}
- // Set processor type. Currently only Atom is detected.
+ // Set processor type. Currently only Atom or Silvermont (SLM) is detected.
if (Family == 6 &&
- (Model == 28 || Model == 38 || Model == 39
- || Model == 53 || Model == 54)) {
+ (Model == 28 || Model == 38 || Model == 39 ||
+ Model == 53 || Model == 54)) {
X86ProcFamily = IntelAtom;
UseLeaForSP = true;
ToggleFeature(X86::FeatureLeaForSP);
}
+ else if (Family == 6 &&
+ (Model == 55 || Model == 74 || Model == 77)) {
+ X86ProcFamily = IntelSLM;
+ }
unsigned MaxExtLevel;
X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -351,14 +360,38 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasRTM = true;
ToggleFeature(X86::FeatureRTM);
}
- if (IsIntel && ((EBX >> 19) & 0x1)) {
- HasADX = true;
- ToggleFeature(X86::FeatureADX);
+ if (IsIntel && ((EBX >> 16) & 0x1)) {
+ X86SSELevel = AVX512F;
+ ToggleFeature(X86::FeatureAVX512);
}
if (IsIntel && ((EBX >> 18) & 0x1)) {
HasRDSEED = true;
ToggleFeature(X86::FeatureRDSEED);
}
+ if (IsIntel && ((EBX >> 19) & 0x1)) {
+ HasADX = true;
+ ToggleFeature(X86::FeatureADX);
+ }
+ if (IsIntel && ((EBX >> 26) & 0x1)) {
+ HasPFI = true;
+ ToggleFeature(X86::FeaturePFI);
+ }
+ if (IsIntel && ((EBX >> 27) & 0x1)) {
+ HasERI = true;
+ ToggleFeature(X86::FeatureERI);
+ }
+ if (IsIntel && ((EBX >> 28) & 0x1)) {
+ HasCDI = true;
+ ToggleFeature(X86::FeatureCDI);
+ }
+ if (IsIntel && ((EBX >> 29) & 0x1)) {
+ HasSHA = true;
+ ToggleFeature(X86::FeatureSHA);
+ }
+ }
+ if (IsAMD && ((ECX >> 21) & 0x1)) {
+ HasTBM = true;
+ ToggleFeature(X86::FeatureTBM);
}
}
}
@@ -416,8 +449,8 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// Make sure 64-bit features are available in 64-bit mode.
if (In64BitMode) {
- HasX86_64 = true; ToggleFeature(X86::Feature64Bit);
- HasCMov = true; ToggleFeature(X86::FeatureCMOV);
+ if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); }
+ if (!HasCMov) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); }
if (X86SSELevel < SSE2) {
X86SSELevel = SSE2;
@@ -429,9 +462,9 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
// CPUName may have been set by the CPU detection code. Make sure the
// new MCSchedModel is used.
- InitMCProcessorInfo(CPUName, FS);
+ InitCPUSchedModel(CPUName);
- if (X86ProcFamily == IntelAtom)
+ if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM)
PostRAScheduler = true;
InstrItins = getInstrItineraryForCPU(CPUName);
@@ -468,6 +501,7 @@ void X86Subtarget::initializeEnvironment() {
HasFMA = false;
HasFMA4 = false;
HasXOP = false;
+ HasTBM = false;
HasMOVBE = false;
HasRDRAND = false;
HasF16C = false;
@@ -479,8 +513,9 @@ void X86Subtarget::initializeEnvironment() {
HasHLE = false;
HasERI = false;
HasCDI = false;
- HasPFI=false;
+ HasPFI = false;
HasADX = false;
+ HasSHA = false;
HasPRFCHW = false;
HasRDSEED = false;
IsBTMemSlow = false;
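
The new detection code above tests one CPUID bit per feature. A compressed C++ restatement of the Intel leaf-7 checks, purely as an illustration of the pattern and not part of the patch (it assumes EBX already holds CPUID.(EAX=07H,ECX=0):EBX as queried earlier in AutoDetectSubtargetFeatures(), and that IsIntel is set as in the surrounding code):

  // Map each CPUID.(EAX=07H,ECX=0):EBX bit to its subtarget flag and feature.
  struct Leaf7Feature { unsigned Bit; bool *Flag; uint64_t Feature; };
  Leaf7Feature IntelEBXBits[] = {
    { 18, &HasRDSEED, X86::FeatureRDSEED }, // RDSEED
    { 19, &HasADX,    X86::FeatureADX    }, // ADX
    { 26, &HasPFI,    X86::FeaturePFI    }, // AVX-512 prefetch
    { 27, &HasERI,    X86::FeatureERI    }, // AVX-512 exponential/reciprocal
    { 28, &HasCDI,    X86::FeatureCDI    }, // AVX-512 conflict detection
    { 29, &HasSHA,    X86::FeatureSHA    }, // SHA extensions
  };
  for (unsigned i = 0, e = array_lengthof(IntelEBXBits); i != e; ++i) {
    if (IsIntel && ((EBX >> IntelEBXBits[i].Bit) & 0x1)) {
      *IntelEBXBits[i].Flag = true;
      ToggleFeature(IntelEBXBits[i].Feature);
    }
  }

AVX-512 Foundation (EBX bit 16) is handled separately above because it raises X86SSELevel rather than setting a boolean flag.
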
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 8793238..dd8c081 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -42,7 +42,7 @@ enum Style {
class X86Subtarget : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
- NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
};
enum X863DNowEnum {
@@ -50,7 +50,7 @@ protected:
};
enum X86ProcFamilyEnum {
- Others, IntelAtom
+ Others, IntelAtom, IntelSLM
};
/// X86ProcFamily - X86 processor family: Intel Atom, and others
@@ -97,6 +97,9 @@ protected:
/// HasXOP - Target has XOP instructions
bool HasXOP;
+ /// HasTBM - Target has TBM instructions.
+ bool HasTBM;
+
/// HasMOVBE - True if the processor has the MOVBE instruction.
bool HasMOVBE;
@@ -127,6 +130,9 @@ protected:
/// HasADX - Processor has ADX instructions.
bool HasADX;
+ /// HasSHA - Processor has SHA instructions.
+ bool HasSHA;
+
/// HasPRFCHW - Processor has PRFCHW instructions.
bool HasPRFCHW;
@@ -258,7 +264,7 @@ public:
bool hasSSE42() const { return X86SSELevel >= SSE42; }
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
- bool hasAVX512() const { return X86SSELevel >= AVX512; }
+ bool hasAVX512() const { return X86SSELevel >= AVX512F; }
bool hasFp256() const { return hasAVX(); }
bool hasInt256() const { return hasAVX2(); }
bool hasSSE4A() const { return HasSSE4A; }
@@ -271,6 +277,7 @@ public:
// FIXME: Favor FMA when both are enabled. Is this the right thing to do?
bool hasFMA4() const { return HasFMA4 && !HasFMA; }
bool hasXOP() const { return HasXOP; }
+ bool hasTBM() const { return HasTBM; }
bool hasMOVBE() const { return HasMOVBE; }
bool hasRDRAND() const { return HasRDRAND; }
bool hasF16C() const { return HasF16C; }
@@ -281,6 +288,7 @@ public:
bool hasRTM() const { return HasRTM; }
bool hasHLE() const { return HasHLE; }
bool hasADX() const { return HasADX; }
+ bool hasSHA() const { return HasSHA; }
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool isBTMemSlow() const { return IsBTMemSlow; }
@@ -311,10 +319,8 @@ public:
return (TargetTriple.getEnvironment() == Triple::ELF ||
TargetTriple.isOSBinFormatELF());
}
- bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
- bool isTargetNaCl() const {
- return TargetTriple.getOS() == Triple::NaCl;
- }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
@@ -327,15 +333,14 @@ public:
}
bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
+ bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
bool isTargetWin64() const {
- // FIXME: x86_64-cygwin has not been released yet.
return In64BitMode && TargetTriple.isOSWindows();
}
bool isTargetWin32() const {
- // FIXME: Cygwin is included for isTargetWin64 -- should it be included
- // here too?
- return !In64BitMode && (isTargetMingw() || isTargetWindows());
+ return !In64BitMode && (isTargetCygMing() || isTargetWindows());
}
bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
@@ -380,11 +385,14 @@ public:
/// memset with zero passed as the second argument. Otherwise it
/// returns null.
const char *getBZeroEntry() const;
-
+
/// This function returns true if the target has sincos() routine in its
/// compiler runtime or math libraries.
bool hasSinCos() const;
+ /// Enable the MachineScheduler pass for all X86 subtargets.
+ bool enableMachineScheduler() const LLVM_OVERRIDE { return true; }
+
/// enablePostRAScheduler - run for Atom optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 49ebd1a..ddf580f 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -92,7 +92,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
} else if (Subtarget.is64Bit()) {
// PIC in 64 bit mode is always rip-rel.
Subtarget.setPICStyle(PICStyles::RIPRel);
- } else if (Subtarget.isTargetCygMing()) {
+ } else if (Subtarget.isTargetCOFF()) {
Subtarget.setPICStyle(PICStyles::None);
} else if (Subtarget.isTargetDarwin()) {
if (getRelocationModel() == Reloc::PIC_)
@@ -114,14 +114,14 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
// Command line options for x86
//===----------------------------------------------------------------------===//
static cl::opt<bool>
-UseVZeroUpper("x86-use-vzeroupper",
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
cl::desc("Minimize AVX to SSE transition penalty"),
cl::init(true));
// Temporary option to control early if-conversion for x86 while adding machine
// models.
static cl::opt<bool>
-X86EarlyIfConv("x86-early-ifcvt",
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
cl::desc("Enable early if-conversion on X86"));
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index a19c5a6..086cd4d 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -25,7 +25,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
// On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
// is an indirect pc-relative reference.
if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
- const MCSymbol *Sym = Mang->getSymbol(GV);
+ const MCSymbol *Sym = getSymbol(*Mang, GV);
const MCExpr *Res =
MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
const MCExpr *Four = MCConstantExpr::Create(4, getContext());
@@ -39,7 +39,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
MCSymbol *X86_64MachoTargetObjectFile::
getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI) const {
- return Mang->getSymbol(GV);
+ return getSymbol(*Mang, GV);
}
void
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 3bbddad..f88a666 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -101,6 +101,9 @@ public:
unsigned AddressSpace) const;
virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
+
+ virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm) const;
/// @}
};
@@ -127,8 +130,8 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
// TODO: Currently the __builtin_popcount() implementation using SSE3
// instructions is inefficient. Once the problem is fixed, we should
- // call ST->hasSSE3() instead of ST->hasSSE4().
- return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
+ // call ST->hasSSE3() instead of ST->hasPOPCNT().
+ return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}
unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
@@ -174,7 +177,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- static const CostTblEntry<MVT> AVX2CostTable[] = {
+ static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
{ ISD::SHL, MVT::v4i32, 1 },
@@ -211,13 +214,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
- int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
- ISD, LT.second);
+ int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * AVX2CostTable[Idx].Cost;
}
- static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+ static const CostTblEntry<MVT::SimpleValueType>
+ SSE2UniformConstCostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
// Constant splats are cheaper for the following instructions.
@@ -238,15 +241,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
- int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
- array_lengthof(SSE2UniformConstCostTable),
- ISD, LT.second);
+ int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * SSE2UniformConstCostTable[Idx].Cost;
}
- static const CostTblEntry<MVT> SSE2CostTable[] = {
+ static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
// For some cases, where the shift amount is a scalar we would be able
@@ -287,13 +288,12 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
};
if (ST->hasSSE2()) {
- int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
- ISD, LT.second);
+ int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * SSE2CostTable[Idx].Cost;
}
- static const CostTblEntry<MVT> AVX1CostTable[] = {
+ static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
@@ -312,21 +312,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Look for AVX1 lowering tricks.
if (ST->hasAVX() && !ST->hasAVX2()) {
- int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
- ISD, LT.second);
+ int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
if (Idx != -1)
return LT.first * AVX1CostTable[Idx].Cost;
}
// Custom lowering of vectors.
- static const CostTblEntry<MVT> CustomLowered[] = {
+ static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
// A v2i64/v4i64 and multiply is custom lowered as a series of long
// multiplies(3), shifts(4) and adds(2).
{ ISD::MUL, MVT::v2i64, 9 },
{ ISD::MUL, MVT::v4i64, 9 },
};
- int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
- ISD, LT.second);
+ int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
if (Idx != -1)
return LT.first * CustomLowered[Idx].Cost;
@@ -363,7 +361,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
- static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ SSE2ConvTbl[] = {
// These are somewhat magic numbers justified by looking at the output of
// Intel's IACA, running some kernels and making sure when we take
// legalization into account the throughput will be overestimated.
@@ -387,9 +386,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
};
if (ST->hasSSE2() && !ST->hasAVX()) {
- int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
- array_lengthof(SSE2ConvTbl),
- ISD, LTDest.second, LTSrc.second);
+ int Idx =
+ ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
if (Idx != -1)
return LTSrc.first * SSE2ConvTbl[Idx].Cost;
}
@@ -401,13 +399,17 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
if (!SrcTy.isSimple() || !DstTy.isSimple())
return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
- static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
+ static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+ AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
@@ -446,9 +448,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
};
if (ST->hasAVX()) {
- int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
- array_lengthof(AVXConversionTbl),
- ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT());
if (Idx != -1)
return AVXConversionTbl[Idx].Cost;
}
@@ -466,7 +467,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- static const CostTblEntry<MVT> SSE42CostTbl[] = {
+ static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
{ ISD::SETCC, MVT::v2f64, 1 },
{ ISD::SETCC, MVT::v4f32, 1 },
{ ISD::SETCC, MVT::v2i64, 1 },
@@ -475,7 +476,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v16i8, 1 },
};
- static const CostTblEntry<MVT> AVX1CostTbl[] = {
+ static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
{ ISD::SETCC, MVT::v4f64, 1 },
{ ISD::SETCC, MVT::v8f32, 1 },
// AVX1 does not support 8-wide integer compare.
@@ -485,7 +486,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
{ ISD::SETCC, MVT::v32i8, 4 },
};
- static const CostTblEntry<MVT> AVX2CostTbl[] = {
+ static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
{ ISD::SETCC, MVT::v4i64, 1 },
{ ISD::SETCC, MVT::v8i32, 1 },
{ ISD::SETCC, MVT::v16i16, 1 },
@@ -493,22 +494,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
};
if (ST->hasAVX2()) {
- int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl),
- ISD, MTy);
+ int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
if (Idx != -1)
return LT.first * AVX2CostTbl[Idx].Cost;
}
if (ST->hasAVX()) {
- int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl),
- ISD, MTy);
+ int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
if (Idx != -1)
return LT.first * AVX1CostTbl[Idx].Cost;
}
if (ST->hasSSE42()) {
- int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl),
- ISD, MTy);
+ int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
if (Idx != -1)
return LT.first * SSE42CostTbl[Idx].Cost;
}
@@ -613,3 +611,84 @@ unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
}
+
+unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwise) const {
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
+ // and use it as the cost.
+
+ static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v8i16, 5 },
+ };
+
+ static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::FADD, MVT::v4f64, 5 },
+ { ISD::FADD, MVT::v8f32, 7 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
+ { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
+ { ISD::ADD, MVT::v8i16, 5 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ };
+
+ static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v4f32, 4 },
+ { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
+ { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
+ };
+
+ static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v4f32, 3 },
+ { ISD::FADD, MVT::v4f64, 3 },
+ { ISD::FADD, MVT::v8f32, 4 },
+ { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
+ { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
+ { ISD::ADD, MVT::v4i64, 3 },
+ { ISD::ADD, MVT::v8i16, 4 },
+ { ISD::ADD, MVT::v8i32, 5 },
+ };
+
+ if (IsPairwise) {
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX1CostTblPairWise[Idx].Cost;
+ }
+
+ if (ST->hasSSE42()) {
+ int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
+ if (Idx != -1)
+ return LT.first * SSE42CostTblPairWise[Idx].Cost;
+ }
+ } else {
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
+ }
+
+ if (ST->hasSSE42()) {
+ int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
+ if (Idx != -1)
+ return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
+ }
+ }
+
+ return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
+}
+
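As a usage sketch of the new hook (assumptions: TTI is a TargetTransformInfo reference obtained from the pass manager and Ctx is an LLVMContext; this mirrors how a vectorizer would be expected to consult it, and is not code from this patch):

  // Cost of reducing a <4 x i32> vector to a single i32 with integer adds.
  Type *VecTy = VectorType::get(Type::getInt32Ty(Ctx), 4);
  unsigned SplitCost    = TTI.getReductionCost(Instruction::Add, VecTy,
                                               /*IsPairwiseForm=*/false);
  unsigned PairwiseCost = TTI.getReductionCost(Instruction::Add, VecTy,
                                               /*IsPairwiseForm=*/true);
  // On a plain SSE4.2 subtarget both queries map to the v4i32 rows above
  // and return a cost of 3.
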
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 477f75a..66ae9c2 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -122,11 +122,11 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
}
static bool clobbersAllYmmRegs(const MachineOperand &MO) {
- for (unsigned reg = X86::YMM0; reg < X86::YMM31; ++reg) {
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
- for (unsigned reg = X86::ZMM0; reg < X86::ZMM31; ++reg) {
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
@@ -148,6 +148,25 @@ static bool hasYmmReg(MachineInstr *MI) {
return false;
}
+/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
+/// instruction.
+static bool clobbersAnyYmmReg(MachineInstr *MI) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isRegMask())
+ continue;
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
+ if (MO.clobbersPhysReg(reg))
+ return true;
+ }
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
+ if (MO.clobbersPhysReg(reg))
+ return true;
+ }
+ }
+ return false;
+}
+
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// vzero upper instructions before function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
@@ -231,8 +250,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
bool BBHasCall = false;
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
- MachineInstr *MI = I;
DebugLoc dl = I->getDebugLoc();
+ MachineInstr *MI = I;
+
bool isControlFlow = MI->isCall() || MI->isReturn();
// Shortcut: don't need to check regular instructions in dirty state.
@@ -251,6 +271,14 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
if (!isControlFlow)
continue;
+ // If the call won't clobber any YMM register, skip it as well. This usually
+ // happens with helper function calls (such as '_chkstk', '_ftol2') where the
+ // standard calling convention is not used (RegMask is not used to mark the
+ // clobbered registers, and register usage (def/imp-def/use) is well-defined
+ // and explicitly specified).
+ if (MI->isCall() && !clobbersAnyYmmReg(MI))
+ continue;
+
BBHasCall = true;
// The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index 85d2a1d..3fa3b34 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_target(XCoreCodeGen
XCoreSubtarget.cpp
XCoreTargetMachine.cpp
XCoreTargetObjectFile.cpp
+ XCoreTargetTransformInfo.cpp
XCoreSelectionDAGInfo.cpp
)
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 6f44551..3d1c474 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -23,10 +23,14 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
PrivateGlobalPrefix = ".L";
AscizDirective = ".asciiz";
- WeakDefDirective = "\t.weak\t";
- WeakRefDirective = "\t.weak\t";
+
+ HiddenVisibilityAttr = MCSA_Invalid;
+ HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+ ProtectedVisibilityAttr = MCSA_Invalid;
// Debug
HasLEB128 = true;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
+ DwarfRegNumForCFI = true;
}
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index b5a9660..e53c96b 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -14,13 +14,13 @@
#ifndef XCORETARGETASMINFO_H
#define XCORETARGETASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
class StringRef;
class Target;
- class XCoreMCAsmInfo : public MCAsmInfo {
+ class XCoreMCAsmInfo : public MCAsmInfoELF {
virtual void anchor();
public:
explicit XCoreMCAsmInfo(StringRef TT);
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index 2f375fc..73c310b 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -31,6 +31,8 @@ namespace llvm {
CodeGenOpt::Level OptLevel);
ModulePass *createXCoreLowerThreadLocalPass();
+ ImmutablePass *createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM);
+
} // end namespace llvm;
#endif
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 35ba299..c03dfe6 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -36,6 +36,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -88,14 +89,12 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
MCSymbol *SymGlob = OutContext.GetOrCreateSymbol(
Twine(Sym->getName() + StringRef(".globound")));
OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Global);
-
- OutStreamer.EmitRawText("\t.set\t" + Twine(Sym->getName()) +
- ".globound," + Twine(ATy->getNumElements()));
-
+ OutStreamer.EmitAssignment(SymGlob,
+ MCConstantExpr::Create(ATy->getNumElements(),
+ OutContext));
if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
// TODO Use COMDAT groups for LinkOnceLinkage
- OutStreamer.EmitRawText(MAI->getWeakDefDirective() +Twine(Sym->getName())+
- ".globound");
+ OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Weak);
}
}
}
@@ -110,7 +109,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
- MCSymbol *GVSym = Mang->getSymbol(GV);
+ MCSymbol *GVSym = getSymbol(GV);
const Constant *C = GV->getInitializer();
unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
@@ -217,7 +216,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
O << *MO.getMBB()->getSymbol();
break;
case MachineOperand::MO_GlobalAddress:
- O << *Mang->getSymbol(MO.getGlobal());
+ O << *getSymbol(MO.getGlobal());
break;
case MachineOperand::MO_ExternalSymbol:
O << MO.getSymbolName();
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index b57cf9d..c34b35c 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -30,10 +30,6 @@
using namespace llvm;
// helper functions. FIXME: Eliminate.
-static inline bool isImmUs(unsigned val) {
- return val <= 11;
-}
-
static inline bool isImmU6(unsigned val) {
return val < (1 << 6);
}
@@ -92,11 +88,16 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo *MMI = &MF.getMMI();
+ const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
const XCoreInstrInfo &TII =
*static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+ if (MFI->getMaxAlignment() > getStackAlignment())
+ report_fatal_error("emitPrologue unsupported alignment: "
+ + Twine(MFI->getMaxAlignment()));
+
bool FP = hasFP(MF);
const AttributeSet &PAL = MF.getFunction()->getAttributes();
@@ -119,21 +120,28 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
bool saveLR = XFI->getUsesLR();
// Do we need to allocate space on the stack?
if (FrameSize) {
+ bool LRSavedOnEntry = false;
int Opcode;
if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
MBB.addLiveIn(XCore::LR);
saveLR = false;
+ LRSavedOnEntry = true;
} else {
Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
}
BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
if (emitFrameMoves) {
-
// Show update of SP.
MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
+ MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(FrameLabel,
+ -FrameSize*4));
+ if (LRSavedOnEntry) {
+ unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
+ MMI->addFrameInst(MCCFIInstruction::createOffset(FrameLabel, Reg, 0));
+ }
}
}
if (saveLR) {
@@ -144,6 +152,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
if (emitFrameMoves) {
MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
+ unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
+ MMI->addFrameInst(MCCFIInstruction::createOffset(SaveLRLabel, Reg,
+ LRSpillOffset));
}
}
@@ -156,15 +167,34 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
if (emitFrameMoves) {
MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label);
+ unsigned Reg = MRI->getDwarfRegNum(XCore::R10, true);
+ MMI->addFrameInst(MCCFIInstruction::createOffset(SaveR10Label, Reg,
+ FPSpillOffset));
}
// Set the FP from the SP.
unsigned FramePtr = XCore::R10;
- BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr)
- .addImm(0);
+ BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
if (emitFrameMoves) {
// Show FP is now valid.
MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
+ unsigned Reg = MRI->getDwarfRegNum(FramePtr, true);
+ MMI->addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
+ Reg));
+ }
+ }
+
+ if (emitFrameMoves) {
+ // Frame moves for callee saved.
+ std::vector<std::pair<MCSymbol*, CalleeSavedInfo> >&SpillLabels =
+ XFI->getSpillLabels();
+ for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
+ MCSymbol *SpillLabel = SpillLabels[I].first;
+ CalleeSavedInfo &CSI = SpillLabels[I].second;
+ int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
+ unsigned Reg = MRI->getDwarfRegNum(CSI.getReg(), true);
+ MMI->addFrameInst(MCCFIInstruction::createOffset(SpillLabel, Reg,
+ Offset));
}
}
}
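With the prologue now describing its own stack adjustments, unwind information no longer relies on hand-maintained frame-move tables: each temporary label is paired with an MCCFIInstruction recorded through MachineModuleInfo. A hedged sketch of that idiom, assuming the usual CodeGen headers; the parameter names are illustrative, not the committed interface.

// Sketch only: emit a label at the point the stack pointer moves, then
// record the new CFA offset and a callee-saved register spill.
static void recordPrologueCFI(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI, DebugLoc dl,
                              const TargetInstrInfo &TII, unsigned LabelOpc,
                              int FrameBytes, unsigned DwarfReg, int SpillOff) {
  MachineModuleInfo *MMI = &MF.getMMI();
  MCSymbol *Label = MMI->getContext().CreateTempSymbol();
  BuildMI(MBB, MBBI, dl, TII.get(LabelOpc)).addSym(Label);
  // Record the CFA offset after the stack adjustment (mirrors -FrameSize*4
  // in the prologue above).
  MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(Label, -FrameBytes));
  // Record where DwarfReg was spilled.
  MMI->addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, SpillOff));
}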
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 6fc7eef5..89ad27d 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -59,6 +59,7 @@ getTargetNodeName(unsigned Opcode) const
case XCoreISD::CRC8 : return "XCoreISD::CRC8";
case XCoreISD::BR_JT : return "XCoreISD::BR_JT";
case XCoreISD::BR_JT32 : return "XCoreISD::BR_JT32";
+ case XCoreISD::MEMBARRIER : return "XCoreISD::MEMBARRIER";
default : return NULL;
}
}
@@ -79,7 +80,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setStackPointerRegisterToSaveRestore(XCore::SP);
- setSchedulingPreference(Sched::RegPressure);
+ setSchedulingPreference(Sched::Source);
// Use i32 for setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
@@ -148,6 +149,13 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+ // Exception handling
+ setExceptionPointerRegister(XCore::R0);
+ setExceptionSelectorRegister(XCore::R1);
+
+ // Atomic operations
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
// TRAMPOLINE is custom lowered.
setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
@@ -166,6 +174,24 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setMinFunctionAlignment(1);
}
+bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+ if (Val.getOpcode() != ISD::LOAD)
+ return false;
+
+ EVT VT1 = Val.getValueType();
+ if (!VT1.isSimple() || !VT1.isInteger() ||
+ !VT2.isSimple() || !VT2.isInteger())
+ return false;
+
+ switch (VT1.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::i8:
+ return true;
+ }
+
+ return false;
+}
+
SDValue XCoreTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode())
@@ -188,6 +214,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG);
default:
llvm_unreachable("unimplemented operand");
}
@@ -259,11 +286,6 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
return GA;
}
-static inline SDValue BuildGetId(SelectionDAG &DAG, SDLoc dl) {
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
- DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
-}
-
SDValue XCoreTargetLowering::
LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
{
@@ -834,6 +856,12 @@ LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue XCoreTargetLowering::
+LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -1200,7 +1228,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
ArgDI != ArgDE; ++ArgDI) {
if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
unsigned Size = ArgDI->Flags.getByValSize();
- unsigned Align = ArgDI->Flags.getByValAlign();
+ unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
// Create a new object on the stack and copy the pointee into it.
int FI = MFI->CreateStackObject(Size, Align, false, false);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 7761b7c..bc08497 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -70,7 +70,10 @@ namespace llvm {
BR_JT,
// Jumptable branch using long branches for each entry.
- BR_JT32
+ BR_JT32,
+
+ // Memory barrier.
+ MEMBARRIER
};
}
@@ -83,6 +86,10 @@ namespace llvm {
explicit XCoreTargetLowering(XCoreTargetMachine &TM);
+ using TargetLowering::isZExtFree;
+ virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+
virtual unsigned getJumpTableEncoding() const;
virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
@@ -154,6 +161,7 @@ namespace llvm {
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
// Inline asm support
std::pair<unsigned, const TargetRegisterClass*>
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index d6b8c2d..33c7f31 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -22,7 +22,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
#include "XCoreGenInstrInfo.inc"
namespace llvm {
@@ -39,6 +39,10 @@ namespace XCore {
using namespace llvm;
+
+// Pin the vtable to this file.
+void XCoreInstrInfo::anchor() {}
+
XCoreInstrInfo::XCoreInstrInfo()
: XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
RI() {
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 51d66a1..4429b07 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -24,6 +24,7 @@ namespace llvm {
class XCoreInstrInfo : public XCoreGenInstrInfo {
const XCoreRegisterInfo RI;
+ virtual void anchor();
public:
XCoreInstrInfo();
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 81fa84d..934a707 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -70,6 +70,11 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_XCoreCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def SDT_XCoreMEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def XCoreMemBarrier : SDNode<"XCoreISD::MEMBARRIER", SDT_XCoreMEMBARRIER,
+ [SDNPHasChain]>;
+
//===----------------------------------------------------------------------===//
// Instruction Pattern Stuff
//===----------------------------------------------------------------------===//
@@ -343,6 +348,10 @@ let usesCustomInserter = 1 in {
(select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
}
+let hasSideEffects = 1 in
+def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
+ [(XCoreMemBarrier)]>;
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 2e328b4..afce753 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -22,6 +22,9 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/NoFolder.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#define DEBUG_TYPE "xcore-lower-thread-local"
@@ -71,13 +74,104 @@ createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
return ConstantArray::get(NewType, Elements);
}
-static bool hasNonInstructionUse(GlobalVariable *GV) {
- for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;
- ++UI)
- if (!isa<Instruction>(*UI))
- return true;
+static Instruction *
+createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
+ IRBuilder<true,NoFolder> Builder(Instr);
+ unsigned OpCode = CE->getOpcode();
+ switch (OpCode) {
+ case Instruction::GetElementPtr: {
+ SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
+ ArrayRef<Value *> CEOps(CEOpVec);
+ return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(CEOps[0],
+ CEOps.slice(1)));
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return dyn_cast<Instruction>(
+ Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
+ CE->getOperand(0), CE->getOperand(1),
+ CE->getName()));
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ return dyn_cast<Instruction>(
+ Builder.CreateCast((Instruction::CastOps)OpCode,
+ CE->getOperand(0), CE->getType(),
+ CE->getName()));
+ default:
+ llvm_unreachable("Unhandled constant expression!\n");
+ }
+}
+
+static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
+ do {
+ SmallVector<WeakVH,8> WUsers;
+ for (Value::use_iterator I = CE->use_begin(), E = CE->use_end();
+ I != E; ++I)
+ WUsers.push_back(WeakVH(*I));
+ std::sort(WUsers.begin(), WUsers.end());
+ WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+ while (!WUsers.empty())
+ if (WeakVH WU = WUsers.pop_back_val()) {
+ if (PHINode *PN = dyn_cast<PHINode>(WU)) {
+ for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
+ if (PN->getIncomingValue(I) == CE) {
+ BasicBlock *PredBB = PN->getIncomingBlock(I);
+ if (PredBB->getTerminator()->getNumSuccessors() > 1)
+ PredBB = SplitEdge(PredBB, PN->getParent(), P);
+ Instruction *InsertPos = PredBB->getTerminator();
+ Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+ PN->setOperand(I, NewInst);
+ }
+ } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
+ Instruction *NewInst = createReplacementInstr(CE, Instr);
+ Instr->replaceUsesOfWith(CE, NewInst);
+ } else {
+ ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
+ if (!CExpr || !replaceConstantExprOp(CExpr, P))
+ return false;
+ }
+ }
+ } while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
+ // sibling may have used 'CE' when createReplacementInstr was called.
+ CE->destroyConstant();
+ return true;
+}
- return false;
+static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
+ SmallVector<WeakVH,8> WUsers;
+ for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I)
+ if (!isa<Instruction>(*I))
+ WUsers.push_back(WeakVH(*I));
+ while (!WUsers.empty())
+ if (WeakVH WU = WUsers.pop_back_val()) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
+ if (!CE || !replaceConstantExprOp(CE, P))
+ return false;
+ }
+ return true;
}
static bool isZeroLengthArray(Type *Ty) {
@@ -92,14 +186,16 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
return false;
// Skip globals that we can't lower and leave it for the backend to error.
- if (hasNonInstructionUse(GV) ||
+ if (!rewriteNonInstructionUses(GV, this) ||
!GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
return false;
// Create replacement global.
ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
- Constant *NewInitializer = createLoweredInitializer(NewType,
- GV->getInitializer());
+ Constant *NewInitializer = 0;
+ if (GV->hasInitializer())
+ NewInitializer = createLoweredInitializer(NewType,
+ GV->getInitializer());
GlobalVariable *NewGV =
new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
NewInitializer, "", 0, GlobalVariable::NotThreadLocal,
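lowerGlobal can only rewrite instruction users of a thread-local global, so instead of bailing out on constant-expression users (the old hasNonInstructionUse check) the pass now materialises them as instructions first. A hedged sketch of the core step for a single GEP constant expression; other opcodes follow the switch in createReplacementInstr above, and the function name here is illustrative.

// Sketch only: turn one ConstantExpr operand of an instruction into a real
// instruction so later per-instruction rewrites can see it.
static void materializeGEPConstantExpr(ConstantExpr *CE, Instruction *UserInstr) {
  assert(CE->getOpcode() == Instruction::GetElementPtr && "GEPs only here");
  // NoFolder stops IRBuilder from folding the result straight back into a
  // constant expression, which would defeat the rewrite.
  IRBuilder<true, NoFolder> Builder(UserInstr);
  SmallVector<Value *, 4> Ops(CE->op_begin(), CE->op_end());
  Value *NewGEP = Builder.CreateInBoundsGEP(Ops[0],
                                            ArrayRef<Value *>(Ops).slice(1));
  UserInstr->replaceUsesOfWith(CE, NewGEP);
}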
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index f96eda9..def2673 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -43,7 +43,7 @@ MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
Symbol = MO.getMBB()->getSymbol();
break;
case MachineOperand::MO_GlobalAddress:
- Symbol = Mang->getSymbol(MO.getGlobal());
+ Symbol = Printer.getSymbol(MO.getGlobal());
Offset += MO.getOffset();
break;
case MachineOperand::MO_BlockAddress:
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 3ef1520..9ae0b86 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -70,3 +70,11 @@ bool XCorePassConfig::addInstSelector() {
extern "C" void LLVMInitializeXCoreTarget() {
RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
}
+
+void XCoreTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+ // Add first the target-independent BasicTTI pass, then our XCore pass. This
+ // allows the XCore pass to delegate to the target independent layer when
+ // appropriate.
+ PM.add(createBasicTargetTransformInfoPass(this));
+ PM.add(createXCoreTargetTransformInfoPass(this));
+}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index eb9a1aa..a19a677 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -57,6 +57,8 @@ public:
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+ virtual void addAnalysisPasses(PassManagerBase &PM);
};
} // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
new file mode 100644
index 0000000..cc165f7
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
@@ -0,0 +1,83 @@
+//===-- XCoreTargetTransformInfo.cpp - XCore specific TTI pass ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// XCore target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcoretti"
+#include "XCore.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeXCoreTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class XCoreTTI : public ImmutablePass, public TargetTransformInfo {
+public:
+ XCoreTTI() : ImmutablePass(ID) {
+ llvm_unreachable("This pass cannot be directly constructed");
+ }
+
+ XCoreTTI(const XCoreTargetMachine *TM)
+ : ImmutablePass(ID) {
+ initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void initializePass() {
+ pushTTIStack(this);
+ }
+
+ virtual void finalizePass() {
+ popTTIStack();
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ TargetTransformInfo::getAnalysisUsage(AU);
+ }
+
+ static char ID;
+
+ virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ if (ID == &TargetTransformInfo::ID)
+ return (TargetTransformInfo*)this;
+ return this;
+ }
+
+ unsigned getNumberOfRegisters(bool Vector) const {
+ if (Vector) {
+ return 0;
+ }
+ return 12;
+ }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(XCoreTTI, TargetTransformInfo, "xcoretti",
+ "XCore Target Transform Info", true, true, false)
+char XCoreTTI::ID = 0;
+
+
+ImmutablePass *
+llvm::createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM) {
+ return new XCoreTTI(TM);
+}
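The new file wires the XCore backend into the TargetTransformInfo analysis group, so client passes see target numbers instead of the generic defaults. A hedged sketch of the consuming side, with a purely illustrative pass name; the TargetTransformInfo calls themselves match this revision's API.

// Sketch only: how a pass added after addAnalysisPasses() queries TTI.
struct MyCostAwarePass : public FunctionPass {
  static char ID;
  MyCostAwarePass() : FunctionPass(ID) {}

  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<TargetTransformInfo>();
    AU.setPreservesAll();
  }

  virtual bool runOnFunction(Function &F) {
    const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
    // On XCore this now reports 12 scalar registers and no vector registers,
    // per the getNumberOfRegisters override above.
    unsigned ScalarRegs = TTI.getNumberOfRegisters(/*Vector=*/false);
    (void)ScalarRegs;
    return false;
  }
};
char MyCostAwarePass::ID = 0;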
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
index 9f2343b..9251783 100644
--- a/lib/Transforms/Hello/Hello.cpp
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -52,7 +52,7 @@ namespace {
return false;
}
- // We don't modify the program, so we preserve all analyses
+ // We don't modify the program, so we preserve all analyses.
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index c42d506..df08091 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -88,7 +88,7 @@ char ArgPromotion::ID = 0;
INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
"Promote 'by reference' arguments to scalars", false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
"Promote 'by reference' arguments to scalars", false, false)
@@ -504,7 +504,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// OriginalLoads - Keep track of a representative load instruction from the
// original function so that we can tell the alias analysis implementation
// what the new GEP/Load instructions we are inserting look like.
- std::map<IndicesVector, LoadInst*> OriginalLoads;
+ // We need to keep the original loads for each argument and the elements
+ // of the argument that are accessed.
+ std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads;
// Attribute - Keep track of the parameter attributes for the arguments
// that we are *not* promoting. For the ones that we do promote, the parameter
@@ -569,7 +571,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
else
// Take any load, we will use it only to update Alias Analysis
OrigLoad = cast<LoadInst>(User->use_back());
- OriginalLoads[Indices] = OrigLoad;
+ OriginalLoads[std::make_pair(I, Indices)] = OrigLoad;
}
// Add a parameter to the function for each element passed in.
@@ -676,7 +678,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
for (ScalarizeTable::iterator SI = ArgIndices.begin(),
E = ArgIndices.end(); SI != E; ++SI) {
Value *V = *AI;
- LoadInst *OrigLoad = OriginalLoads[*SI];
+ LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)];
if (!SI->empty()) {
Ops.reserve(SI->size());
Type *ElTy = V->getType();
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index a7bf188..d94c0f4 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -93,9 +93,12 @@ bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
}
unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+ unsigned Align = GV->getAlignment();
+ if (Align)
+ return Align;
if (TD)
return TD->getPreferredAlignment(GV);
- return GV->getAlignment();
+ return 0;
}
bool ConstantMerge::runOnModule(Module &M) {
@@ -210,9 +213,9 @@ bool ConstantMerge::runOnModule(Module &M) {
// Bump the alignment if necessary.
if (Replacements[i].first->getAlignment() ||
Replacements[i].second->getAlignment()) {
- Replacements[i].second->setAlignment(std::max(
- Replacements[i].first->getAlignment(),
- Replacements[i].second->getAlignment()));
+ Replacements[i].second->setAlignment(
+ std::max(getAlignment(Replacements[i].first),
+ getAlignment(Replacements[i].second)));
}
// Eliminate any uses of the dead global.
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 6ee6162..911c14e 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -357,6 +357,19 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
return false;
+ // If a function seen at compile time is not necessarily the one linked to
+ // the binary being built, it is illegal to change the actual arguments
+ // passed to it. These functions can be captured by isWeakForLinker().
+ // *NOTE* that mayBeOverridden() is insufficient for this purpose as it
+ // doesn't include linkage types like AvailableExternallyLinkage and
+ // LinkOnceODRLinkage. Take link_odr* as an example: it indicates a set of
+ // *EQUIVALENT* globals that can be merged at link-time. However, the
+ // semantic of *EQUIVALENT*-functions includes parameters. Changing
+ // parameters breaks this assumption.
+ //
+ if (Fn.isWeakForLinker())
+ return false;
+
if (Fn.use_empty())
return false;
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index fa3d72d..50fb3e6 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -21,6 +21,38 @@
#include <algorithm>
using namespace llvm;
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+ bool Local = GV.hasLocalLinkage();
+ if (Local)
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+
+ if (Local || Delete) {
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ return;
+ }
+
+ if (!GV.hasLinkOnceLinkage()) {
+ assert(!GV.isDiscardableIfUnused());
+ return;
+ }
+
+ // Map linkonce* to weak* so that llvm doesn't drop this GV.
+ switch(GV.getLinkage()) {
+ default:
+ llvm_unreachable("Unexpected linkage");
+ case GlobalValue::LinkOnceAnyLinkage:
+ GV.setLinkage(GlobalValue::WeakAnyLinkage);
+ return;
+ case GlobalValue::LinkOnceODRLinkage:
+ GV.setLinkage(GlobalValue::WeakODRLinkage);
+ return;
+ }
+}
+
namespace {
/// @brief A pass to extract specific functions and their dependencies.
class GVExtractorPass : public ModulePass {
@@ -60,12 +92,7 @@ namespace {
continue;
}
- bool Local = I->isDiscardableIfUnused();
- if (Local)
- I->setVisibility(GlobalValue::HiddenVisibility);
-
- if (Local || Delete)
- I->setLinkage(GlobalValue::ExternalLinkage);
+ makeVisible(*I, Delete);
if (Delete)
I->setInitializer(0);
@@ -80,12 +107,7 @@ namespace {
continue;
}
- bool Local = I->isDiscardableIfUnused();
- if (Local)
- I->setVisibility(GlobalValue::HiddenVisibility);
-
- if (Local || Delete)
- I->setLinkage(GlobalValue::ExternalLinkage);
+ makeVisible(*I, Delete);
if (Delete)
I->deleteBody();
@@ -97,12 +119,10 @@ namespace {
Module::alias_iterator CurI = I;
++I;
- if (CurI->isDiscardableIfUnused()) {
- CurI->setVisibility(GlobalValue::HiddenVisibility);
- CurI->setLinkage(GlobalValue::ExternalLinkage);
- }
+ bool Delete = deleteStuff == (bool)Named.count(CurI);
+ makeVisible(*CurI, Delete);
- if (deleteStuff == (bool)Named.count(CurI)) {
+ if (Delete) {
Type *Ty = CurI->getType()->getElementType();
CurI->removeFromParent();
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 1366883..60e5f06 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -136,7 +136,8 @@ namespace {
char FunctionAttrs::ID = 0;
INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
"Deduce function attributes", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
"Deduce function attributes", false, false)
@@ -366,6 +367,7 @@ namespace {
}
}
assert(Found && "Capturing call-site captured nothing?");
+ (void)Found;
return false;
}
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 201f320..901295d 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -179,6 +179,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
// any globals used will be marked as needed.
Function *F = cast<Function>(G);
+ if (F->hasPrefixData())
+ MarkUsedGlobalsAsNeeded(F->getPrefixData());
+
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 64cd515..2ea89a1 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -37,7 +37,9 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
using namespace llvm;
@@ -60,7 +62,6 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
namespace {
- struct GlobalStatus;
struct GlobalOpt : public ModulePass {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfo>();
@@ -80,7 +81,6 @@ namespace {
bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
- const SmallPtrSet<const PHINode*, 16> &PHIUsers,
const GlobalStatus &GS);
bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
@@ -98,209 +98,6 @@ INITIALIZE_PASS_END(GlobalOpt, "globalopt",
ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
-namespace {
-
-/// GlobalStatus - As we analyze each global, keep track of some information
-/// about it. If we find out that the address of the global is taken, none of
-/// this info will be accurate.
-struct GlobalStatus {
- /// isCompared - True if the global's address is used in a comparison.
- bool isCompared;
-
- /// isLoaded - True if the global is ever loaded. If the global isn't ever
- /// loaded it can be deleted.
- bool isLoaded;
-
- /// StoredType - Keep track of what stores to the global look like.
- ///
- enum StoredType {
- /// NotStored - There is no store to this global. It can thus be marked
- /// constant.
- NotStored,
-
- /// isInitializerStored - This global is stored to, but the only thing
- /// stored is the constant it was initialized with. This is only tracked
- /// for scalar globals.
- isInitializerStored,
-
- /// isStoredOnce - This global is stored to, but only its initializer and
- /// one other value is ever stored to it. If this global isStoredOnce, we
- /// track the value stored to it in StoredOnceValue below. This is only
- /// tracked for scalar globals.
- isStoredOnce,
-
- /// isStored - This global is stored to by multiple values or something else
- /// that we cannot track.
- isStored
- } StoredType;
-
- /// StoredOnceValue - If only one value (besides the initializer constant) is
- /// ever stored to this global, keep track of what value it is.
- Value *StoredOnceValue;
-
- /// AccessingFunction/HasMultipleAccessingFunctions - These start out
- /// null/false. When the first accessing function is noticed, it is recorded.
- /// When a second different accessing function is noticed,
- /// HasMultipleAccessingFunctions is set to true.
- const Function *AccessingFunction;
- bool HasMultipleAccessingFunctions;
-
- /// HasNonInstructionUser - Set to true if this global has a user that is not
- /// an instruction (e.g. a constant expr or GV initializer).
- bool HasNonInstructionUser;
-
- /// AtomicOrdering - Set to the strongest atomic ordering requirement.
- AtomicOrdering Ordering;
-
- GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
- StoredOnceValue(0), AccessingFunction(0),
- HasMultipleAccessingFunctions(false),
- HasNonInstructionUser(false), Ordering(NotAtomic) {}
-};
-
-}
-
-/// StrongerOrdering - Return the stronger of the two ordering. If the two
-/// orderings are acquire and release, then return AcquireRelease.
-///
-static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
- if (X == Acquire && Y == Release) return AcquireRelease;
- if (Y == Acquire && X == Release) return AcquireRelease;
- return (AtomicOrdering)std::max(X, Y);
-}
-
-/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
-/// by constants itself. Note that constants cannot be cyclic, so this test is
-/// pretty easy to implement recursively.
-///
-static bool SafeToDestroyConstant(const Constant *C) {
- if (isa<GlobalValue>(C)) return false;
-
- for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
- ++UI)
- if (const Constant *CU = dyn_cast<Constant>(*UI)) {
- if (!SafeToDestroyConstant(CU)) return false;
- } else
- return false;
- return true;
-}
-
-
-/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
-/// structure. If the global has its address taken, return true to indicate we
-/// can't do anything with it.
-///
-static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
- SmallPtrSet<const PHINode*, 16> &PHIUsers) {
- for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
- ++UI) {
- const User *U = *UI;
- if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
- GS.HasNonInstructionUser = true;
-
- // If the result of the constantexpr isn't pointer type, then we won't
- // know to expect it in various places. Just reject early.
- if (!isa<PointerType>(CE->getType())) return true;
-
- if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
- } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
- if (!GS.HasMultipleAccessingFunctions) {
- const Function *F = I->getParent()->getParent();
- if (GS.AccessingFunction == 0)
- GS.AccessingFunction = F;
- else if (GS.AccessingFunction != F)
- GS.HasMultipleAccessingFunctions = true;
- }
- if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
- GS.isLoaded = true;
- // Don't hack on volatile loads.
- if (LI->isVolatile()) return true;
- GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering());
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Don't allow a store OF the address, only stores TO the address.
- if (SI->getOperand(0) == V) return true;
-
- // Don't hack on volatile stores.
- if (SI->isVolatile()) return true;
-
- GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
-
- // If this is a direct store to the global (i.e., the global is a scalar
- // value, not an aggregate), keep more specific information about
- // stores.
- if (GS.StoredType != GlobalStatus::isStored) {
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
- SI->getOperand(1))) {
- Value *StoredVal = SI->getOperand(0);
-
- if (Constant *C = dyn_cast<Constant>(StoredVal)) {
- if (C->isThreadDependent()) {
- // The stored value changes between threads; don't track it.
- return true;
- }
- }
-
- if (StoredVal == GV->getInitializer()) {
- if (GS.StoredType < GlobalStatus::isInitializerStored)
- GS.StoredType = GlobalStatus::isInitializerStored;
- } else if (isa<LoadInst>(StoredVal) &&
- cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
- if (GS.StoredType < GlobalStatus::isInitializerStored)
- GS.StoredType = GlobalStatus::isInitializerStored;
- } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
- GS.StoredType = GlobalStatus::isStoredOnce;
- GS.StoredOnceValue = StoredVal;
- } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
- GS.StoredOnceValue == StoredVal) {
- // noop.
- } else {
- GS.StoredType = GlobalStatus::isStored;
- }
- } else {
- GS.StoredType = GlobalStatus::isStored;
- }
- }
- } else if (isa<BitCastInst>(I)) {
- if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
- } else if (isa<GetElementPtrInst>(I)) {
- if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
- } else if (isa<SelectInst>(I)) {
- if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
- } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
- // PHI nodes we can check just like select or GEP instructions, but we
- // have to be careful about infinite recursion.
- if (PHIUsers.insert(PN)) // Not already visited.
- if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
- } else if (isa<CmpInst>(I)) {
- GS.isCompared = true;
- } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
- if (MTI->isVolatile()) return true;
- if (MTI->getArgOperand(0) == V)
- GS.StoredType = GlobalStatus::isStored;
- if (MTI->getArgOperand(1) == V)
- GS.isLoaded = true;
- } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
- assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
- if (MSI->isVolatile()) return true;
- GS.StoredType = GlobalStatus::isStored;
- } else {
- return true; // Any other non-load instruction might take address!
- }
- } else if (const Constant *C = dyn_cast<Constant>(U)) {
- GS.HasNonInstructionUser = true;
- // We might have a dead and dangling constant hanging off of here.
- if (!SafeToDestroyConstant(C))
- return true;
- } else {
- GS.HasNonInstructionUser = true;
- // Otherwise must be some other user.
- return true;
- }
- }
-
- return false;
-}
-
/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker
/// as a root? If so, we might not really want to eliminate the stores to it.
static bool isLeakCheckerRoot(GlobalVariable *GV) {
@@ -434,7 +231,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
Changed = true;
}
} else if (Constant *C = dyn_cast<Constant>(U)) {
- if (SafeToDestroyConstant(C)) {
+ if (isSafeToDestroyConstant(C)) {
C->destroyConstant();
// This could have invalidated UI, start over from scratch.
Dead.clear();
@@ -471,9 +268,17 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
DataLayout *TD, TargetLibraryInfo *TLI) {
bool Changed = false;
- SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+ // Note that we need to use a weak value handle for the worklist items. When
+ // we delete a constant array, we may also be holding a pointer to one of its
+ // elements (or an element of one of its elements if we're dealing with an
+ // array of arrays) in the worklist.
+ SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end());
while (!WorkList.empty()) {
- User *U = WorkList.pop_back_val();
+ Value *UV = WorkList.pop_back_val();
+ if (!UV)
+ continue;
+
+ User *U = cast<User>(UV);
if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
if (Init) {
@@ -534,7 +339,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
} else if (Constant *C = dyn_cast<Constant>(U)) {
// If we have a chain of dead constantexprs or other things dangling from
// us, and if they are all dead, nuke them without remorse.
- if (SafeToDestroyConstant(C)) {
+ if (isSafeToDestroyConstant(C)) {
C->destroyConstant();
CleanupConstantGlobalUsers(V, Init, TD, TLI);
return true;
@@ -549,7 +354,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
static bool isSafeSROAElementUse(Value *V) {
// We might have a dead and dangling constant hanging off of here.
if (Constant *C = dyn_cast<Constant>(V))
- return SafeToDestroyConstant(C);
+ return isSafeToDestroyConstant(C);
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
@@ -1373,8 +1178,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
} else if (PHINode *PN = dyn_cast<PHINode>(V)) {
// PN's type is pointer to struct. Make a new PHI of pointer to struct
// field.
- StructType *ST =
- cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+ StructType *ST = cast<StructType>(PN->getType()->getPointerElementType());
PHINode *NewPN =
PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
@@ -1505,7 +1309,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
if (StructType *ST = dyn_cast<StructType>(FieldTy))
TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
- Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(CI->getType());
Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
ConstantInt::get(IntPtrTy, TypeSize),
NElems, 0,
@@ -1735,7 +1539,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// If this is a fixed size array, transform the Malloc to be an alloc of
// structs. malloc [100 x struct],1 -> malloc struct, 100
if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
- Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(CI->getType());
unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
@@ -1917,13 +1721,12 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
if (!GV->hasLocalLinkage())
return false;
- SmallPtrSet<const PHINode*, 16> PHIUsers;
GlobalStatus GS;
- if (AnalyzeGlobal(GV, GS, PHIUsers))
+ if (GlobalStatus::analyzeGlobal(GV, GS))
return false;
- if (!GS.isCompared && !GV->hasUnnamedAddr()) {
+ if (!GS.IsCompared && !GV->hasUnnamedAddr()) {
GV->setUnnamedAddr(true);
NumUnnamed++;
}
@@ -1931,14 +1734,13 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
if (GV->isConstant() || !GV->hasInitializer())
return false;
- return ProcessInternalGlobal(GV, GVI, PHIUsers, GS);
+ return ProcessInternalGlobal(GV, GVI, GS);
}
/// ProcessInternalGlobal - Analyze the specified global variable and optimize
/// it if possible. If we make a change, return true.
bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
Module::global_iterator &GVI,
- const SmallPtrSet<const PHINode*, 16> &PHIUsers,
const GlobalStatus &GS) {
// If this is a first class global and has only one accessing function
// and this function is main (which we know is not recursive), we replace
@@ -1971,7 +1773,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
// If the global is never loaded (but may be stored to), it is dead.
// Delete it now.
- if (!GS.isLoaded) {
+ if (!GS.IsLoaded) {
DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV);
bool Changed;
@@ -1992,7 +1794,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
}
return Changed;
- } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+ } else if (GS.StoredType <= GlobalStatus::InitializerStored) {
DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
GV->setConstant(true);
@@ -2015,7 +1817,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
GVI = FirstNewGV; // Don't skip the newly produced globals!
return true;
}
- } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+ } else if (GS.StoredType == GlobalStatus::StoredOnce) {
// If the initial value for the global was an undef value, and if only
// one other value was stored into it, we can just change the
// initializer to be the stored value, then delete all stores to the
@@ -2048,11 +1850,14 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
// Otherwise, if the global was not a boolean, we can shrink it to be a
// boolean.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
- if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
- ++NumShrunkToBool;
- return true;
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
+ if (GS.Ordering == NotAtomic) {
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+ ++NumShrunkToBool;
+ return true;
+ }
}
+ }
}
return false;
@@ -2210,8 +2015,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
CSVals[1] = 0;
StructType *StructTy =
- cast <StructType>(
- cast<ArrayType>(GCL->getType()->getElementType())->getElementType());
+ cast<StructType>(GCL->getType()->getElementType()->getArrayElementType());
// Create the new init list.
std::vector<Constant*> CAList;
@@ -3041,14 +2845,8 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
return true;
}
-static int compareNames(const void *A, const void *B) {
- const GlobalValue *VA = *reinterpret_cast<GlobalValue* const*>(A);
- const GlobalValue *VB = *reinterpret_cast<GlobalValue* const*>(B);
- if (VA->getName() < VB->getName())
- return -1;
- if (VB->getName() < VA->getName())
- return 1;
- return 0;
+static int compareNames(Constant *const *A, Constant *const *B) {
+ return (*A)->getName().compare((*B)->getName());
}
static void setUsedInitializer(GlobalVariable &V,
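The worklist in CleanupConstantGlobalUsers switches from plain User pointers to WeakVH because destroying one queued constant can recursively destroy other queued users; a weak handle simply goes null instead of dangling. A hedged sketch of that idiom in isolation; the function name and loop body are illustrative.

// Sketch only: a deletion-tolerant worklist over the users of V.
static void visitUsersSafely(Value *V) {
  SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end());
  while (!WorkList.empty()) {
    Value *UV = WorkList.pop_back_val();
    if (!UV)                       // Deleted while it sat on the worklist.
      continue;
    User *U = cast<User>(UV);
    if (Constant *C = dyn_cast<Constant>(U))
      if (isSafeToDestroyConstant(C))
        C->destroyConstant();      // May null out later WorkList entries.
  }
}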
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index a0095da..437597e 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -63,7 +63,7 @@ public:
char AlwaysInliner::ID = 0;
INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
"Inliner for always_inline functions", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
"Inliner for always_inline functions", false, false)
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index a4f7026..57379a3 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
namespace {
-/// \brief Actaul inliner pass implementation.
+/// \brief Actual inliner pass implementation.
///
/// The common implementation of the inlining logic is shared between this
/// inliner pass and the always inliner pass. The two passes use different cost
@@ -61,7 +61,7 @@ public:
char SimpleInliner::ID = 0;
INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
"Function Integration/Inlining", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
INITIALIZE_PASS_END(SimpleInliner, "inline",
"Function Integration/Inlining", false, false)
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index d56a06f..64e2ced 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -11,6 +11,12 @@
// If the function or variable is not in the list of external names given to
// the pass it is marked as internal.
//
+// This transformation would not be legal in a regular compilation, but it gets
+// extra information from the linker about what is safe.
+//
+// For example, internalizing a function with external linkage is only safe
+// when the linker tells us it is used solely from within this module.
+//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "internalize"
@@ -23,6 +29,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <fstream>
#include <set>
@@ -50,10 +57,8 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid
explicit InternalizePass();
- explicit InternalizePass(ArrayRef<const char *> exportList);
+ explicit InternalizePass(ArrayRef<const char *> ExportList);
void LoadFile(const char *Filename);
- void ClearExportList();
- void AddToExportList(const std::string &val);
virtual bool runOnModule(Module &M);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -72,15 +77,14 @@ InternalizePass::InternalizePass()
initializeInternalizePassPass(*PassRegistry::getPassRegistry());
if (!APIFile.empty()) // If a filename is specified, use it.
LoadFile(APIFile.c_str());
- if (!APIList.empty()) // If a list is specified, use it as well.
- ExternalNames.insert(APIList.begin(), APIList.end());
+ ExternalNames.insert(APIList.begin(), APIList.end());
}
-InternalizePass::InternalizePass(ArrayRef<const char *> exportList)
+InternalizePass::InternalizePass(ArrayRef<const char *> ExportList)
: ModulePass(ID){
initializeInternalizePassPass(*PassRegistry::getPassRegistry());
- for(ArrayRef<const char *>::const_iterator itr = exportList.begin();
- itr != exportList.end(); itr++) {
+ for(ArrayRef<const char *>::const_iterator itr = ExportList.begin();
+ itr != ExportList.end(); itr++) {
ExternalNames.insert(*itr);
}
}
@@ -101,12 +105,25 @@ void InternalizePass::LoadFile(const char *Filename) {
}
}
-void InternalizePass::ClearExportList() {
- ExternalNames.clear();
-}
+static bool shouldInternalize(const GlobalValue &GV,
+ const std::set<std::string> &ExternalNames) {
+ // Function must be defined here
+ if (GV.isDeclaration())
+ return false;
+
+ // Available externally is really just a "declaration with a body".
+ if (GV.hasAvailableExternallyLinkage())
+ return false;
-void InternalizePass::AddToExportList(const std::string &val) {
- ExternalNames.insert(val);
+ // Already has internal linkage
+ if (GV.hasLocalLinkage())
+ return false;
+
+ // Marked to keep external?
+ if (ExternalNames.count(GV.getName()))
+ return false;
+
+ return true;
}
bool InternalizePass::runOnModule(Module &M) {
@@ -114,11 +131,6 @@ bool InternalizePass::runOnModule(Module &M) {
CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
bool Changed = false;
- // Never internalize functions which code-gen might insert.
- // FIXME: We should probably add this (and the __stack_chk_guard) via some
- // type of call-back in CodeGen.
- ExternalNames.insert("__stack_chk_fail");
-
SmallPtrSet<GlobalValue *, 8> Used;
collectUsedGlobalVariables(M, Used, false);
@@ -139,19 +151,20 @@ bool InternalizePass::runOnModule(Module &M) {
// Mark all functions not in the api as internal.
// FIXME: maybe use private linkage?
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
- if (!I->isDeclaration() && // Function must be defined here
- // Available externally is really just a "declaration with a body".
- !I->hasAvailableExternallyLinkage() &&
- !I->hasLocalLinkage() && // Can't already have internal linkage
- !ExternalNames.count(I->getName())) {// Not marked to keep external?
- I->setLinkage(GlobalValue::InternalLinkage);
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (!shouldInternalize(*I, ExternalNames))
+ continue;
+
+ I->setLinkage(GlobalValue::InternalLinkage);
+
+ if (ExternalNode)
// Remove a callgraph edge from the external node to this function.
- if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
- Changed = true;
- ++NumFunctions;
- DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
- }
+ ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
+
+ Changed = true;
+ ++NumFunctions;
+ DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
+ }
// Never internalize the llvm.used symbol. It is used to implement
// attribute((used)).
@@ -166,35 +179,36 @@ bool InternalizePass::runOnModule(Module &M) {
ExternalNames.insert("llvm.global.annotations");
// Never internalize symbols code-gen inserts.
+ // FIXME: We should probably add this (and the __stack_chk_guard) via some
+ // type of call-back in CodeGen.
+ ExternalNames.insert("__stack_chk_fail");
ExternalNames.insert("__stack_chk_guard");
// Mark all global variables with initializers that are not in the api as
// internal as well.
// FIXME: maybe use private linkage?
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- if (!I->isDeclaration() && !I->hasLocalLinkage() &&
- // Available externally is really just a "declaration with a body".
- !I->hasAvailableExternallyLinkage() &&
- !ExternalNames.count(I->getName())) {
- I->setLinkage(GlobalValue::InternalLinkage);
- Changed = true;
- ++NumGlobals;
- DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
- }
+ I != E; ++I) {
+ if (!shouldInternalize(*I, ExternalNames))
+ continue;
+
+ I->setLinkage(GlobalValue::InternalLinkage);
+ Changed = true;
+ ++NumGlobals;
+ DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
+ }
// Mark all aliases that are not in the api as internal as well.
for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I)
- if (!I->isDeclaration() && !I->hasInternalLinkage() &&
- // Available externally is really just a "declaration with a body".
- !I->hasAvailableExternallyLinkage() &&
- !ExternalNames.count(I->getName())) {
- I->setLinkage(GlobalValue::InternalLinkage);
- Changed = true;
- ++NumAliases;
- DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
- }
+ I != E; ++I) {
+ if (!shouldInternalize(*I, ExternalNames))
+ continue;
+
+ I->setLinkage(GlobalValue::InternalLinkage);
+ Changed = true;
+ ++NumAliases;
+ DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
+ }
return Changed;
}
@@ -203,6 +217,6 @@ ModulePass *llvm::createInternalizePass() {
return new InternalizePass();
}
-ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) {
- return new InternalizePass(el);
+ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) {
+ return new InternalizePass(ExportList);
}
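With the ad-hoc ClearExportList/AddToExportList mutators gone, callers express the export set once, through the constructor. A hedged usage sketch of the renamed createInternalizePass(ExportList) overload from an LTO-style driver; the symbol names are illustrative.

// Sketch only: keep two entry points external, internalize the rest.
void addInternalization(PassManager &PM) {
  const char *KeepExternal[] = { "main", "my_plugin_entry" };
  PM.add(llvm::createInternalizePass(KeepExternal));
}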
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 4ce749c..3861421 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -210,16 +210,20 @@ private:
// Any two pointers in the same address space are equivalent, intptr_t and
// pointers are equivalent. Otherwise, standard type equivalence rules apply.
bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
+
+ PointerType *PTy1 = dyn_cast<PointerType>(Ty1);
+ PointerType *PTy2 = dyn_cast<PointerType>(Ty2);
+
+ if (TD) {
+ if (PTy1 && PTy1->getAddressSpace() == 0) Ty1 = TD->getIntPtrType(Ty1);
+ if (PTy2 && PTy2->getAddressSpace() == 0) Ty2 = TD->getIntPtrType(Ty2);
+ }
+
if (Ty1 == Ty2)
return true;
- if (Ty1->getTypeID() != Ty2->getTypeID()) {
- if (TD) {
- LLVMContext &Ctx = Ty1->getContext();
- if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true;
- if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true;
- }
+
+ if (Ty1->getTypeID() != Ty2->getTypeID())
return false;
- }
switch (Ty1->getTypeID()) {
default:
@@ -241,8 +245,7 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
return true;
case Type::PointerTyID: {
- PointerType *PTy1 = cast<PointerType>(Ty1);
- PointerType *PTy2 = cast<PointerType>(Ty2);
+ assert(PTy1 && PTy2 && "Both types must be pointers here.");
return PTy1->getAddressSpace() == PTy2->getAddressSpace();
}
@@ -352,14 +355,19 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
// Determine whether two GEP operations perform the same underlying arithmetic.
bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
const GEPOperator *GEP2) {
- // When we have target data, we can reduce the GEP down to the value in bytes
- // added to the address.
- unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1;
- APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
- if (TD &&
- GEP1->accumulateConstantOffset(*TD, Offset1) &&
- GEP2->accumulateConstantOffset(*TD, Offset2)) {
- return Offset1 == Offset2;
+ unsigned AS = GEP1->getPointerAddressSpace();
+ if (AS != GEP2->getPointerAddressSpace())
+ return false;
+
+ if (TD) {
+ // When we have target data, we can reduce the GEP down to the value in bytes
+ // added to the address.
+ unsigned BitWidth = TD ? TD->getPointerSizeInBits(AS) : 1;
+ APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
+ if (GEP1->accumulateConstantOffset(*TD, Offset1) &&
+ GEP2->accumulateConstantOffset(*TD, Offset2)) {
+ return Offset1 == Offset2;
+ }
}
if (GEP1->getPointerOperand()->getType() !=
@@ -713,6 +721,19 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
writeThunk(F, G);
}
+// Helper for writeThunk,
+// Selects proper bitcast operation,
+// but a bit simpler than CastInst::getCastOpcode.
+static Value* createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) {
+ Type *SrcTy = V->getType();
+ if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+ return Builder.CreateIntToPtr(V, DestTy);
+ else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+ return Builder.CreatePtrToInt(V, DestTy);
+ else
+ return Builder.CreateBitCast(V, DestTy);
+}
+
// Replace G with a simple tail call to bitcast(F). Also replace direct uses
// of G with bitcast(F). Deletes G.
void MergeFunctions::writeThunk(Function *F, Function *G) {
@@ -738,7 +759,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
FunctionType *FFTy = F->getFunctionType();
for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end();
AI != AE; ++AI) {
- Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i)));
+ Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i)));
++i;
}
@@ -748,13 +769,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
if (NewG->getReturnType()->isVoidTy()) {
Builder.CreateRetVoid();
} else {
- Type *RetTy = NewG->getReturnType();
- if (CI->getType()->isIntegerTy() && RetTy->isPointerTy())
- Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy));
- else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy())
- Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy));
- else
- Builder.CreateRet(Builder.CreateBitCast(CI, RetTy));
+ Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType()));
}
NewG->copyAttributesFrom(G);
@@ -829,6 +844,18 @@ bool MergeFunctions::insert(ComparableFunction &NewF) {
const ComparableFunction &OldF = *Result.first;
+ // Don't merge tiny functions, since it can just end up making the function
+ // larger.
+ // FIXME: Should still merge them if they are unnamed_addr and produce an
+ // alias.
+ if (NewF.getFunc()->size() == 1) {
+ if (NewF.getFunc()->front().size() <= 2) {
+ DEBUG(dbgs() << NewF.getFunc()->getName()
+ << " is to small to bother merging\n");
+ return false;
+ }
+ }
+
// Never thunk a strong function to a weak function.
assert(!OldF.getFunc()->mayBeOverridden() ||
NewF.getFunc()->mayBeOverridden());
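The isEquivalentType change above canonicalises address-space-0 pointers to the module's intptr_t type before comparing, so functions that differ only in pointer-versus-intptr_t signatures can now be merged, with createCast bridging the thunk. A hedged, self-contained illustration of that canonicalisation; the DataLayout string is an assumption, not taken from the patch.

// Sketch only: with 64-bit pointers, i8* canonicalises to i64.
bool pointerMatchesIntPtr(LLVMContext &Ctx) {
  DataLayout TD("e-p:64:64");              // illustrative layout string
  Type *PtrTy = Type::getInt8PtrTy(Ctx);   // i8* in address space 0
  Type *IntTy = Type::getInt64Ty(Ctx);
  // The comparator rewrites PtrTy to TD.getIntPtrType(PtrTy) before the
  // equality check, which yields i64 here.
  return TD.getIntPtrType(PtrTy) == IntTy;
}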
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index a6b3f4e..24c5018 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -29,20 +29,20 @@
using namespace llvm;
static cl::opt<bool>
-RunLoopVectorization("vectorize-loops",
+RunLoopVectorization("vectorize-loops", cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
static cl::opt<bool>
-LateVectorization("late-vectorize", cl::init(false), cl::Hidden,
+LateVectorization("late-vectorize", cl::init(true), cl::Hidden,
cl::desc("Run the vectorization pasess late in the pass "
"pipeline (after the inliner)"));
static cl::opt<bool>
-RunSLPVectorization("vectorize-slp",
+RunSLPVectorization("vectorize-slp", cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
static cl::opt<bool>
-RunBBVectorization("vectorize-slp-aggressive",
+RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
cl::desc("Run the BB vectorization passes"));
static cl::opt<bool>
@@ -54,6 +54,10 @@ static cl::opt<bool> UseNewSROA("use-new-sroa",
cl::init(true), cl::Hidden,
cl::desc("Enable the new, experimental SROA pass"));
+static cl::opt<bool>
+RunLoopRerolling("reroll-loops", cl::Hidden,
+ cl::desc("Run the loop rerolling pass"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@@ -65,6 +69,7 @@ PassManagerBuilder::PassManagerBuilder() {
SLPVectorize = RunSLPVectorization;
LoopVectorize = RunLoopVectorization;
LateVectorize = LateVectorization;
+ RerollLoops = RunLoopRerolling;
}
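
As a usage sketch (assuming the LLVM 3.4-era PassManagerBuilder API this file exposes), a frontend can set these members directly instead of relying on the cl::opt defaults; RerollLoops is the member wired to the new -reroll-loops option above.

#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

void buildPipeline(llvm::PassManager &MPM) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 3;
  PMB.LoopVectorize = true;   // corresponds to -vectorize-loops
  PMB.SLPVectorize = true;    // corresponds to -vectorize-slp
  PMB.RerollLoops = false;    // corresponds to -reroll-loops (off by default)
  PMB.populateModulePassManager(MPM);
}
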
PassManagerBuilder::~PassManagerBuilder() {
@@ -195,8 +200,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
- if (!LateVectorize && LoopVectorize && OptLevel > 1 && SizeLevel < 2)
- MPM.add(createLoopVectorizePass());
+ if (!LateVectorize && LoopVectorize)
+ MPM.add(createLoopVectorizePass(DisableUnrollLoops));
if (!DisableUnrollLoops)
MPM.add(createLoopUnrollPass()); // Unroll small loops
@@ -216,22 +221,22 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
- if (!LateVectorize) {
- if (SLPVectorize)
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
- if (BBVectorize) {
- MPM.add(createBBVectorizePass());
- MPM.add(createInstructionCombiningPass());
- if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(createGVNPass()); // Remove redundancies
- else
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
-
- // BBVectorize may have significantly shortened a loop body; unroll again.
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass());
- }
+ if (RerollLoops)
+ MPM.add(createLoopRerollPass());
+ if (SLPVectorize)
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+ if (BBVectorize) {
+ MPM.add(createBBVectorizePass());
+ MPM.add(createInstructionCombiningPass());
+ if (OptLevel > 1 && UseGVNAfterVectorization)
+ MPM.add(createGVNPass()); // Remove redundancies
+ else
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+
+ // BBVectorize may have significantly shortened a loop body; unroll again.
+ if (!DisableUnrollLoops)
+ MPM.add(createLoopUnrollPass());
}
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
@@ -241,7 +246,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
// As an experimental mode, run any vectorization passes in a separate
// pipeline from the CGSCC pass manager that runs iteratively with the
// inliner.
- if (LateVectorize) {
+ if (LateVectorize && LoopVectorize) {
// FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
// pass manager that we are specifically trying to avoid. To prevent this
// we must insert a no-op module pass to reset the pass manager.
@@ -249,35 +254,9 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
// Add the various vectorization passes and relevant cleanup passes for
// them since we are no longer in the middle of the main scalar pipeline.
- if (LoopVectorize && OptLevel > 1 && SizeLevel < 2) {
- MPM.add(createLoopVectorizePass());
-
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass()); // Unroll small loops
-
- // FIXME: Is this necessary/useful? Should we also do SimplifyCFG?
- MPM.add(createInstructionCombiningPass());
- }
-
- if (SLPVectorize) {
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
- // FIXME: Is this necessary/useful? Should we also do SimplifyCFG?
- MPM.add(createInstructionCombiningPass());
- }
-
- if (BBVectorize) {
- MPM.add(createBBVectorizePass());
- MPM.add(createInstructionCombiningPass());
- if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(createGVNPass()); // Remove redundancies
- else
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
-
- // BBVectorize may have significantly shortened a loop body; unroll again.
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass());
- }
+ MPM.add(createLoopVectorizePass(DisableUnrollLoops));
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createCFGSimplificationPass());
}
if (!DisableUnitAtATime) {
@@ -304,11 +283,8 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
// Now that composite has been compiled, scan through the module, looking
// for a main function. If main is defined, mark all other functions
// internal.
- if (Internalize) {
- std::vector<const char*> E;
- E.push_back("main");
- PM.add(createInternalizePass(E));
- }
+ if (Internalize)
+ PM.add(createInternalizePass("main"));
// Propagate constants at call sites into the functions they call. This
// opens opportunities for globalopt (and inlining) by substituting function
@@ -349,6 +325,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
// The IPO passes may leave cruft around. Clean up after them.
PM.add(createInstructionCombiningPass());
PM.add(createJumpThreadingPass());
+
// Break up allocas
if (UseNewSROA)
PM.add(createSROAPass());
@@ -362,6 +339,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
PM.add(createLICMPass()); // Hoist loop invariants.
PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+
// Nuke dead stores.
PM.add(createDeadStoreEliminationPass());
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 89529de..b160913 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -51,7 +51,7 @@ namespace {
char PruneEH::ID = 0;
INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
"Remove unused exception handling info", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
INITIALIZE_PASS_END(PruneEH, "prune-eh",
"Remove unused exception handling info", false, false)
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 2791106..c4f5cfc 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -9,7 +9,7 @@
//
// The StripSymbols transformation implements code stripping. Specifically, it
// can delete:
-//
+//
// * names for virtual registers
// * symbols for internal globals and functions
// * debug information
@@ -39,7 +39,7 @@ namespace {
bool OnlyDebugInfo;
public:
static char ID; // Pass identification, replacement for typeid
- explicit StripSymbols(bool ODI = false)
+ explicit StripSymbols(bool ODI = false)
: ModulePass(ID), OnlyDebugInfo(ODI) {
initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
}
@@ -144,7 +144,7 @@ static void RemoveDeadConstant(Constant *C) {
assert(C->use_empty() && "Constant is not dead!");
SmallPtrSet<Constant*, 4> Operands;
for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
- if (OnlyUsedBy(C->getOperand(i), C))
+ if (OnlyUsedBy(C->getOperand(i), C))
Operands.insert(cast<Constant>(C->getOperand(i)));
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
if (!GV->hasLocalLinkage()) return; // Don't delete non static globals.
@@ -182,7 +182,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
StructType *STy = StructTypes[i];
if (STy->isLiteral() || STy->getName().empty()) continue;
-
+
if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
continue;
@@ -199,7 +199,7 @@ static void findUsedValues(GlobalVariable *LLVMUsed,
ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
- if (GlobalValue *GV =
+ if (GlobalValue *GV =
dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
UsedValues.insert(GV);
}
@@ -217,71 +217,20 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
I->setName(""); // Internal symbols can't participate in linkage
}
-
+
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
I->setName(""); // Internal symbols can't participate in linkage
StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
}
-
+
// Remove all names from types.
StripTypeNames(M, PreserveDbgInfo);
return true;
}
-// StripDebugInfo - Strip debug info in the module if it exists.
-// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and
-// llvm.dbg.region.end calls, and any globals they point to if now dead.
-static bool StripDebugInfo(Module &M) {
-
- bool Changed = false;
-
- // Remove all of the calls to the debugger intrinsics, and remove them from
- // the module.
- if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
- while (!Declare->use_empty()) {
- CallInst *CI = cast<CallInst>(Declare->use_back());
- CI->eraseFromParent();
- }
- Declare->eraseFromParent();
- Changed = true;
- }
-
- if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
- while (!DbgVal->use_empty()) {
- CallInst *CI = cast<CallInst>(DbgVal->use_back());
- CI->eraseFromParent();
- }
- DbgVal->eraseFromParent();
- Changed = true;
- }
-
- for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
- NME = M.named_metadata_end(); NMI != NME;) {
- NamedMDNode *NMD = NMI;
- ++NMI;
- if (NMD->getName().startswith("llvm.dbg.")) {
- NMD->eraseFromParent();
- Changed = true;
- }
- }
-
- for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
- for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
- ++FI)
- for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
- ++BI) {
- if (!BI->getDebugLoc().isUnknown()) {
- Changed = true;
- BI->setDebugLoc(DebugLoc());
- }
- }
-
- return Changed;
-}
-
bool StripSymbols::runOnModule(Module &M) {
bool Changed = false;
Changed |= StripDebugInfo(M);
@@ -307,13 +256,13 @@ bool StripDebugDeclare::runOnModule(Module &M) {
assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
CI->eraseFromParent();
if (Arg1->use_empty()) {
- if (Constant *C = dyn_cast<Constant>(Arg1))
+ if (Constant *C = dyn_cast<Constant>(Arg1))
DeadConstants.push_back(C);
- else
+ else
RecursivelyDeleteTriviallyDeadInstructions(Arg1);
}
if (Arg2->use_empty())
- if (Constant *C = dyn_cast<Constant>(Arg2))
+ if (Constant *C = dyn_cast<Constant>(Arg2))
DeadConstants.push_back(C);
}
Declare->eraseFromParent();
@@ -332,76 +281,107 @@ bool StripDebugDeclare::runOnModule(Module &M) {
return true;
}
+/// Remove any debug info for global variables/functions in the given module for
+/// which said global variable/function no longer exists (i.e. is null).
+///
+/// Debugging information is encoded in LLVM IR using metadata. This is designed
+/// in such a way that debug info for symbols is preserved even if the symbols
+/// are optimized away by the optimizer. This special pass removes debug info
+/// for such symbols.
bool StripDeadDebugInfo::runOnModule(Module &M) {
bool Changed = false;
- // Debugging infomration is encoded in llvm IR using metadata. This is designed
- // such a way that debug info for symbols preserved even if symbols are
- // optimized away by the optimizer. This special pass removes debug info for
- // such symbols.
-
- // llvm.dbg.gv keeps track of debug info for global variables.
- if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) {
- SmallVector<MDNode *, 8> MDs;
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
- if (NMD->getOperand(i)) {
- assert(DIGlobalVariable(NMD->getOperand(i)).isGlobalVariable() &&
- "A MDNode in llvm.dbg.gv should be a DIGlobalVariable.");
- MDs.push_back(NMD->getOperand(i));
- }
- else
- Changed = true;
- NMD->eraseFromParent();
- NMD = NULL;
-
- for (SmallVectorImpl<MDNode *>::iterator I = MDs.begin(),
- E = MDs.end(); I != E; ++I) {
- GlobalVariable *GV = DIGlobalVariable(*I).getGlobal();
- if (GV && M.getGlobalVariable(GV->getName(), true)) {
- if (!NMD)
- NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
- NMD->addOperand(*I);
- }
+ LLVMContext &C = M.getContext();
+
+ // Gather all debug info in the module into the finder F. This is actually
+ // overkill in terms of what we want to do, but we want to try and be as
+ // resilient as possible in the face of potential debug info changes by using
+ // the formal interfaces given to us as much as possible.
+ DebugInfoFinder F;
+ F.processModule(M);
+
+ // For each compile unit, find the live set of global variables/functions and
+ // replace the current list of potentially dead global variables/functions
+ // with the live list.
+ SmallVector<Value *, 64> LiveGlobalVariables;
+ SmallVector<Value *, 64> LiveSubprograms;
+ DenseSet<const MDNode *> VisitedSet;
+
+ for (DebugInfoFinder::iterator CI = F.compile_unit_begin(),
+ CE = F.compile_unit_end(); CI != CE; ++CI) {
+ // Create our compile unit.
+ DICompileUnit DIC(*CI);
+ assert(DIC.Verify() && "DIC must verify as a DICompileUnit.");
+
+ // Create our live subprogram list.
+ DIArray SPs = DIC.getSubprograms();
+ bool SubprogramChange = false;
+ for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+ DISubprogram DISP(SPs.getElement(i));
+ assert(DISP.Verify() && "DISP must verify as a DISubprogram.");
+
+ // Make sure we visit each subprogram only once.
+ if (!VisitedSet.insert(DISP).second)
+ continue;
+
+ // If the function referenced by DISP is not null, the function is live.
+ if (DISP.getFunction())
+ LiveSubprograms.push_back(DISP);
else
- Changed = true;
+ SubprogramChange = true;
}
- }
- // llvm.dbg.sp keeps track of debug info for subprograms.
- if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) {
- SmallVector<MDNode *, 8> MDs;
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
- if (NMD->getOperand(i)) {
- assert(DISubprogram(NMD->getOperand(i)).isSubprogram() &&
- "A MDNode in llvm.dbg.sp should be a DISubprogram.");
- MDs.push_back(NMD->getOperand(i));
- }
+ // Create our live global variable list.
+ DIArray GVs = DIC.getGlobalVariables();
+ bool GlobalVariableChange = false;
+ for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
+ DIGlobalVariable DIG(GVs.getElement(i));
+ assert(DIG.Verify() && "DIG must verify as DIGlobalVariable.");
+
+ // Make sure we visit each global variable only once.
+ if (!VisitedSet.insert(DIG).second)
+ continue;
+
+ // If the global variable referenced by DIG is not null, the global
+ // variable is live.
+ if (DIG.getGlobal())
+ LiveGlobalVariables.push_back(DIG);
else
- Changed = true;
- NMD->eraseFromParent();
- NMD = NULL;
-
- for (SmallVectorImpl<MDNode *>::iterator I = MDs.begin(),
- E = MDs.end(); I != E; ++I) {
- bool FnIsLive = false;
- if (Function *F = DISubprogram(*I).getFunction())
- if (M.getFunction(F->getName()))
- FnIsLive = true;
- if (FnIsLive) {
- if (!NMD)
- NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
- NMD->addOperand(*I);
- } else {
- // Remove llvm.dbg.lv.fnname named mdnode which may have been used
- // to hold debug info for dead function's local variables.
- StringRef FName = DISubprogram(*I).getLinkageName();
- if (FName.empty())
- FName = DISubprogram(*I).getName();
- if (NamedMDNode *LVNMD = M.getNamedMetadata(
- "llvm.dbg.lv." + Function::getRealLinkageName(FName)))
- LVNMD->eraseFromParent();
- }
+ GlobalVariableChange = true;
+ }
+
+ // If we found dead subprograms or global variables, replace the current
+ // subprogram list/global variable list with our new live subprogram/global
+ // variable list.
+ if (SubprogramChange) {
+ // Make sure that 9 is still the index of the subprogram array, so that an
+ // assert fires if its location within the DICompileUnit ever changes and
+ // this code needs to be updated.
+ assert(DIC->getNumOperands() >= 10 &&
+ SPs == DIC->getOperand(9) &&
+ "DICompileUnits is expected to store Subprograms in operand "
+ "9.");
+ DIC->replaceOperandWith(9, MDNode::get(C, LiveSubprograms));
+ Changed = true;
}
+
+ if (GlobalVariableChange) {
+ // Make sure that 10 is still the index of the global variable array, so
+ // that an assert fires if its location within the DICompileUnit ever
+ // changes and this code needs to be updated.
+ assert(DIC->getNumOperands() >= 11 &&
+ GVs == DIC->getOperand(10) &&
+ "DICompileUnits is expected to store Global Variables in operand "
+ "10.");
+ DIC->replaceOperandWith(10, MDNode::get(C, LiveGlobalVariables));
+ Changed = true;
+ }
+
+ // Reset lists for the next iteration.
+ LiveSubprograms.clear();
+ LiveGlobalVariables.clear();
}
return Changed;
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index b3084cc..a5eddc2 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -158,8 +158,8 @@ public:
ConstantInt *DivRHS);
Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
ConstantInt *DivRHS);
- Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI,
- ICmpInst::Predicate Pred, Value *TheAdd);
+ Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
+ ICmpInst::Predicate Pred);
Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
ICmpInst::Predicate Cond, Instruction &I);
Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
@@ -178,6 +178,7 @@ public:
Instruction *visitPtrToInt(PtrToIntInst &CI);
Instruction *visitIntToPtr(IntToPtrInst &CI);
Instruction *visitBitCast(BitCastInst &CI);
+ Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI);
Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
@@ -212,8 +213,8 @@ private:
bool ShouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const;
- Type *FindElementAtOffset(Type *Ty, int64_t Offset,
- SmallVectorImpl<Value*> &NewIndices);
+ Type *FindElementAtOffset(Type *PtrTy, int64_t Offset,
+ SmallVectorImpl<Value*> &NewIndices);
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
@@ -271,7 +272,7 @@ public:
if (&I == V)
V = UndefValue::get(I.getType());
- DEBUG(errs() << "IC: Replacing " << I << "\n"
+ DEBUG(dbgs() << "IC: Replacing " << I << "\n"
" with " << *V << '\n');
I.replaceAllUsesWith(V);
@@ -283,7 +284,7 @@ public:
// instruction. Instead, visit methods should return the value returned by
// this function.
Instruction *EraseInstFromFunction(Instruction &I) {
- DEBUG(errs() << "IC: ERASE " << I << '\n');
+ DEBUG(dbgs() << "IC: ERASE " << I << '\n');
assert(I.use_empty() && "Cannot erase instruction that is used!");
// Make sure that we reprocess all operands now that we reduced their
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b474bd8..88bb69b 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -488,6 +488,26 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
return result;
}
+/// Convert an analysis of a masked ICmp into its equivalent if all boolean
+/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
+/// is adjacent to the corresponding normal flag (recording ==), this just
+/// involves swapping those bits over.
+static unsigned conjugateICmpMask(unsigned Mask) {
+ unsigned NewMask;
+ NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes |
+ FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed |
+ FoldMskICmp_BMask_Mixed))
+ << 1;
+
+ NewMask |=
+ (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes |
+ FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed |
+ FoldMskICmp_BMask_NotMixed))
+ >> 1;
+
+ return NewMask;
+}
+
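
To see the bit-pair swap in isolation, the following standalone check (illustrative only; the flag values are hypothetical stand-ins mirroring the layout assumed by the shifts above, not taken from this file) shows that shifting the "==" group left by one and the "!=" group right by one exchanges the two senses and is its own inverse.

#include <cassert>
#include <cstdio>

// Hypothetical flag layout: each "Not" flag sits one bit above its positive twin.
enum : unsigned {
  AMask_AllOnes  = 1u << 0, AMask_NotAllOnes  = 1u << 1,
  BMask_AllOnes  = 1u << 2, BMask_NotAllOnes  = 1u << 3,
  Mask_AllZeroes = 1u << 4, Mask_NotAllZeroes = 1u << 5,
  AMask_Mixed    = 1u << 6, AMask_NotMixed    = 1u << 7,
  BMask_Mixed    = 1u << 8, BMask_NotMixed    = 1u << 9,
};

static unsigned conjugate(unsigned Mask) {
  unsigned Pos = Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeroes |
                         AMask_Mixed | BMask_Mixed);
  unsigned Neg = Mask & (AMask_NotAllOnes | BMask_NotAllOnes |
                         Mask_NotAllZeroes | AMask_NotMixed | BMask_NotMixed);
  return (Pos << 1) | (Neg >> 1);
}

int main() {
  unsigned M = Mask_AllZeroes | BMask_NotMixed;
  assert(conjugate(M) == (Mask_NotAllZeroes | BMask_Mixed));
  assert(conjugate(conjugate(M)) == M);  // swapping twice restores the mask
  std::puts("ok");
}
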
/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z)
/// if possible. The returned predicate is either == or !=. Returns false if
/// decomposition fails.
@@ -548,14 +568,22 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
L21 = L22 = L1 = 0;
} else {
// Look for ANDs in the LHS icmp.
- if (match(L1, m_And(m_Value(L11), m_Value(L12)))) {
- if (!match(L2, m_And(m_Value(L21), m_Value(L22))))
- L21 = L22 = 0;
- } else {
- if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
- return 0;
- std::swap(L1, L2);
+ if (!L1->getType()->isIntegerTy()) {
+ // You can icmp pointers, for example. They really aren't masks.
+ L11 = L12 = 0;
+ } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+ // Any icmp can be viewed as being trivially masked; if it allows us to
+ // remove one, it's worth it.
+ L11 = L1;
+ L12 = Constant::getAllOnesValue(L1->getType());
+ }
+
+ if (!L2->getType()->isIntegerTy()) {
+ // You can icmp pointers, for example. They really aren't masks.
L21 = L22 = 0;
+ } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+ L21 = L2;
+ L22 = Constant::getAllOnesValue(L2->getType());
}
}
@@ -576,7 +604,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
return 0;
}
E = R2; R1 = 0; ok = true;
- } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+ } else if (R1->getType()->isIntegerTy()) {
+ if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+ // As before, model no mask as a trivial mask if it'll let us do an
+ // optimisation.
+ R11 = R1;
+ R12 = Constant::getAllOnesValue(R1->getType());
+ }
+
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
A = R11; D = R12; E = R2; ok = true;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -589,7 +624,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
return 0;
// Look for ANDs in on the right side of the RHS icmp.
- if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+ if (!ok && R2->getType()->isIntegerTy()) {
+ if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+ R11 = R2;
+ R12 = Constant::getAllOnesValue(R2->getType());
+ }
+
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
A = R11; D = R12; E = R1; ok = true;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -618,8 +658,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
/// foldLogOpOfMaskedICmps:
/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
/// into a single (icmp(A & X) ==/!= Y)
-static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
- ICmpInst::Predicate NEWCC,
+static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
llvm::InstCombiner::BuilderTy* Builder) {
Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0;
ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -629,8 +668,24 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
"foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
- if (NEWCC == ICmpInst::ICMP_NE)
- mask >>= 1; // treat "Not"-states as normal states
+ // In full generality:
+ // (icmp (A & B) Op C) | (icmp (A & D) Op E)
+ // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+ //
+ // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+ // equivalent to (icmp (A & X) !Op Y).
+ //
+ // Therefore, we can pretend for the rest of this function that we're dealing
+ // with the conjunction, provided we flip the sense of any comparisons (both
+ // input and output).
+
+ // In most cases we're going to produce an EQ for the "&&" case.
+ ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ if (!IsAnd) {
+ // Convert the masking analysis into its equivalent with negated
+ // comparisons.
+ mask = conjugateICmpMask(mask);
+ }
if (mask & FoldMskICmp_Mask_AllZeroes) {
// (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
@@ -657,6 +712,40 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
Value* newAnd = Builder->CreateAnd(A, newAnd1);
return Builder->CreateICmp(NEWCC, newAnd, A);
}
+
+ // Remaining cases assume at least that B and D are constant, and depend on
+ // their actual values. This isn't strictly necessary, just a "handle the
+ // easy cases for now" decision.
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ if (BCst == 0) return 0;
+ ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+ if (DCst == 0) return 0;
+
+ if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+ // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+ // Only valid if one of the masks is a superset of the other (check "B&D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() & DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+ if (mask & FoldMskICmp_AMask_NotAllOnes) {
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+ // Only valid if one of the masks is a superset of the other (check "B|D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() | DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
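
A quick exhaustive check (illustrative only, not part of the patch) of the superset condition used above, with B = 0x0C chosen as a subset of D = 0x3C so that B & D == B and the left-hand comparison alone gives the answer.

#include <cassert>
#include <cstdio>

int main() {
  const unsigned B = 0x0C, D = 0x3C;  // B & D == B, i.e. B is a subset of D
  for (unsigned A = 0; A < 256; ++A) {
    bool KeepLHS1 = (((A & B) != 0) && ((A & D) != 0)) == ((A & B) != 0);
    bool KeepLHS2 = (((A & B) != B) && ((A & D) != D)) == ((A & B) != B);
    assert(KeepLHS1 && KeepLHS2);
  }
  std::puts("ok");
}
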
if (mask & FoldMskICmp_BMask_Mixed) {
// (icmp eq (A & B), C) & (icmp eq (A & D), E)
// We already know that B & C == C && D & E == E.
@@ -665,14 +754,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
// contradict, then we can transform to
// -> (icmp eq (A & (B|D)), (C|E))
// Currently, we only handle the case of B, C, D, and E being constant.
- ConstantInt *BCst = dyn_cast<ConstantInt>(B);
- if (BCst == 0) return 0;
- ConstantInt *DCst = dyn_cast<ConstantInt>(D);
- if (DCst == 0) return 0;
// we can't simply use C and E, because we might actually handle
// (icmp ne (A & B), B) & (icmp eq (A & D), D)
// with B and D, having a single bit set
-
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
if (CCst == 0) return 0;
if (LHSCC != NEWCC)
@@ -715,7 +799,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
}
// handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
- if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder))
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
return V;
// This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
@@ -849,10 +933,15 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
return RHS;
case ICmpInst::ICMP_NE:
+ // Special case to get the ordering right when the values wrap around
+ // zero.
+ if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
+ std::swap(LHSCst, RHSCst);
if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
Constant *AddCST = ConstantExpr::getNeg(LHSCst);
Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
- return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1));
+ return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
+ Val->getName()+".cmp");
}
break; // (X != 13 & X != 15) -> no change
}
@@ -1454,10 +1543,60 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
return 0;
}
+/// IsOneHotValue - Returns true for "one-hot" values (values where at most
+/// one bit can be set).
+static bool IsOneHotValue(Value *V) {
+ // Match 1<<K.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+ if (BO->getOpcode() == Instruction::Shl) {
+ ConstantInt *One = dyn_cast<ConstantInt>(BO->getOperand(0));
+ return One && One->isOne();
+ }
+
+ // Check for power of two integer constants.
+ if (ConstantInt *K = dyn_cast<ConstantInt>(V))
+ return K->getValue().isPowerOf2();
+
+ return false;
+}
+
/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+ // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+ // if K1 and K2 are a one-bit mask.
+ ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+
+ if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
+ RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+
+ BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
+ BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
+ if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
+ LAnd->getOpcode() == Instruction::And &&
+ RAnd->getOpcode() == Instruction::And) {
+
+ Value *Mask = 0;
+ Value *Masked = 0;
+ if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
+ IsOneHotValue(LAnd->getOperand(1)) &&
+ IsOneHotValue(RAnd->getOperand(1))) {
+ Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
+ Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
+ } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
+ IsOneHotValue(LAnd->getOperand(0)) &&
+ IsOneHotValue(RAnd->getOperand(0))) {
+ Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
+ Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
+ }
+
+ if (Masked)
+ return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
+ }
+ }
+
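
The new fold can be checked exhaustively for 8-bit values (illustrative only): whenever K1 and K2 are single-bit masks, testing either bit for zero is the same as asking whether A fails to contain both bits.

#include <cassert>
#include <cstdio>

int main() {
  for (unsigned i = 0; i < 8; ++i)
    for (unsigned j = 0; j < 8; ++j) {
      unsigned K1 = 1u << i, K2 = 1u << j;       // one-hot masks
      for (unsigned A = 0; A < 256; ++A) {
        bool Orig   = ((A & K1) == 0) || ((A & K2) == 0);
        bool Folded = (A & (K1 | K2)) != (K1 | K2);
        assert(Orig == Folded);
      }
    }
  std::puts("ok");
}
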
// (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
if (PredicatesFoldable(LHSCC, RHSCC)) {
if (LHS->getOperand(0) == RHS->getOperand(1) &&
@@ -1474,13 +1613,10 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// handle (roughly):
// (icmp ne (A & B), C) | (icmp ne (A & D), E)
- if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder))
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
return V;
Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
- ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
- ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
-
if (LHS->hasOneUse() || RHS->hasOneUse()) {
// (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
// (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 9f74fd6..0cd7b14 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -999,20 +999,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// Check to see if we are changing the return type...
if (OldRetTy != NewRetTy) {
- if (Callee->isDeclaration() &&
- // Conversion is ok if changing from one pointer type to another or from
- // a pointer to an integer of the same size.
- !((OldRetTy->isPointerTy() || !TD ||
- OldRetTy == TD->getIntPtrType(Caller->getContext())) &&
- (NewRetTy->isPointerTy() || !TD ||
- NewRetTy == TD->getIntPtrType(Caller->getContext()))))
- return false; // Cannot transform this return value.
+ if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) {
+ if (Callee->isDeclaration())
+ return false; // Cannot transform this return value.
- if (!Caller->use_empty() &&
- // void -> non-void is handled specially
- !NewRetTy->isVoidTy() &&
- !CastInst::isBitCastable(NewRetTy, OldRetTy))
+ if (!Caller->use_empty() &&
+ // void -> non-void is handled specially
+ !NewRetTy->isVoidTy())
return false; // Cannot transform this return value.
+ }
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
@@ -1045,9 +1040,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
Type *ParamTy = FT->getParamType(i);
Type *ActTy = (*AI)->getType();
- if (!CastInst::isBitCastable(ActTy, ParamTy)) {
+ if (!CastInst::isBitCastable(ActTy, ParamTy))
return false; // Cannot transform this parameter value.
- }
if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
hasAttributes(AttributeFuncs::
@@ -1063,21 +1057,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
return false;
- Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
+ Type *CurElTy = ActTy->getPointerElementType();
if (TD->getTypeAllocSize(CurElTy) !=
TD->getTypeAllocSize(ParamPTy->getElementType()))
return false;
}
-
- // Converting from one pointer type to another or between a pointer and an
- // integer of the same size is safe even if we do not have a body.
- bool isConvertible = ActTy == ParamTy ||
- (TD && ((ParamTy->isPointerTy() ||
- ParamTy == TD->getIntPtrType(Caller->getContext())) &&
- (ActTy->isPointerTy() ||
- ActTy == TD->getIntPtrType(Caller->getContext()))));
- if (Callee->isDeclaration() && !isConvertible)
- return false;
}
if (Callee->isDeclaration()) {
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 361acdd..72377dc 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1229,6 +1229,19 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
}
}
+ // (fptrunc (select cond, R1, Cst)) -->
+ // (select cond, (fptrunc R1), (fptrunc Cst))
+ SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));
+ if (SI &&
+ (isa<ConstantFP>(SI->getOperand(1)) ||
+ isa<ConstantFP>(SI->getOperand(2)))) {
+ Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1),
+ CI.getType());
+ Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2),
+ CI.getType());
+ return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
+ }
+
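
A source-level shape of the new select fold (illustrative, assuming one select arm is a floating-point constant): pushing the truncation into both arms lets the constant arm fold to a float immediately.

float pick(bool c, double x) {
  return (float)(c ? x : 1.0);   // becomes: c ? (float)x : 1.0f
}
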
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
if (II) {
switch (II->getIntrinsicID()) {
@@ -1249,9 +1262,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
}
// Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
+ // Note that we restrict this transformation based on
+ // TLI->has(LibFunc::sqrtf), even for the sqrt intrinsic, because
+ // TLI->has(LibFunc::sqrtf) is sufficient to guarantee that the
+ // single-precision intrinsic can be expanded in the backend.
CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) &&
- Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) &&
+ (Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) ||
+ Call->getCalledFunction()->getIntrinsicID() == Intrinsic::sqrt) &&
Call->getNumArgOperands() == 1 &&
Call->hasOneUse()) {
CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
@@ -1262,11 +1280,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
Arg->getOperand(0)->getType()->isFloatTy()) {
Function *Callee = Call->getCalledFunction();
Module *M = CI.getParent()->getParent()->getParent();
- Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
- Callee->getAttributes(),
- Builder->getFloatTy(),
- Builder->getFloatTy(),
- NULL);
+ Constant *SqrtfFunc = (Callee->getIntrinsicID() == Intrinsic::sqrt) ?
+ Intrinsic::getDeclaration(M, Intrinsic::sqrt, Builder->getFloatTy()) :
+ M->getOrInsertFunction("sqrtf", Callee->getAttributes(),
+ Builder->getFloatTy(), Builder->getFloatTy(),
+ NULL);
CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
"sqrtfcall");
ret->setAttributes(Callee->getAttributes());
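
The source pattern this fold targets (illustrative only) is a double-precision square root of a widened float that is immediately truncated back; with sqrtf available, both the libm call and the llvm.sqrt intrinsic form can become a single-precision call.

#include <cmath>

float root(float x) {
  return (float)std::sqrt((double)x);   // can be folded to sqrtf(x)
}
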
@@ -1338,14 +1356,18 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
// If the source integer type is not the intptr_t type for this target, do a
// trunc or zext to the intptr_t type, then inttoptr of it. This allows the
// cast to be exposed to other transforms.
- if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() !=
- TD->getPointerSizeInBits()) {
- Type *Ty = TD->getIntPtrType(CI.getContext());
- if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
- Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
-
- Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
- return new IntToPtrInst(P, CI.getType());
+
+ if (TD) {
+ unsigned AS = CI.getAddressSpace();
+ if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+ TD->getPointerSizeInBits(AS)) {
+ Type *Ty = TD->getIntPtrType(CI.getContext(), AS);
+ if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+ Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+ Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ return new IntToPtrInst(P, CI.getType());
+ }
}
if (Instruction *I = commonCastTransforms(CI))
@@ -1370,25 +1392,32 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
return &CI;
}
+ if (!TD)
+ return commonCastTransforms(CI);
+
// If the GEP has a single use, and the base pointer is a bitcast, and the
// GEP computes a constant offset, see if we can convert these three
// instructions into fewer. This typically happens with unions and other
// non-type-safe code.
- APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
- if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
+ unsigned AS = GEP->getPointerAddressSpace();
+ unsigned OffsetBits = TD->getPointerSizeInBits(AS);
+ APInt Offset(OffsetBits, 0);
+ BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0));
+ if (GEP->hasOneUse() &&
+ BCI &&
GEP->accumulateConstantOffset(*TD, Offset)) {
// Get the base pointer input of the bitcast, and the type it points to.
- Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
- Type *GEPIdxTy =
- cast<PointerType>(OrigBase->getType())->getElementType();
+ Value *OrigBase = BCI->getOperand(0);
SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) {
+ if (FindElementAtOffset(OrigBase->getType(),
+ Offset.getSExtValue(),
+ NewIndices)) {
// If we were able to index down into an element, create the GEP
// and bitcast the result. This eliminates one bitcast, potentially
// two.
Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
- Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
- Builder->CreateGEP(OrigBase, NewIndices);
+ Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
+ Builder->CreateGEP(OrigBase, NewIndices);
NGEP->takeName(GEP);
if (isa<BitCastInst>(CI))
@@ -1406,16 +1435,22 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
// If the destination integer type is not the intptr_t type for this target,
// do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
// to be exposed to other transforms.
- if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) {
- Type *Ty = TD->getIntPtrType(CI.getContext());
- if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
- Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
- Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty);
- return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false);
- }
+ if (!TD)
+ return commonPointerCastTransforms(CI);
+
+ Type *Ty = CI.getType();
+ unsigned AS = CI.getPointerAddressSpace();
+
+ if (Ty->getScalarSizeInBits() == TD->getPointerSizeInBits(AS))
+ return commonPointerCastTransforms(CI);
- return commonPointerCastTransforms(CI);
+ Type *PtrTy = TD->getIntPtrType(CI.getContext(), AS);
+ if (Ty->isVectorTy()) // Handle vectors of pointers.
+ PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
+
+ Value *P = Builder->CreatePtrToInt(CI.getOperand(0), PtrTy);
+ return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
}
/// OptimizeVectorResize - This input value (which is known to have vector type)
@@ -1488,12 +1523,17 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
/// insertions into the vector. See the example in the comment for
/// OptimizeIntegerToVectorInsertions for the pattern this handles.
/// The type of V is always a non-zero multiple of VecEltTy's size.
+/// Shift is the number of bits between the lsb of V and the lsb of
+/// the vector.
///
/// This returns false if the pattern can't be matched or true if it can,
/// filling in Elements with the elements found here.
-static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
+static bool CollectInsertionElements(Value *V, unsigned Shift,
SmallVectorImpl<Value*> &Elements,
- Type *VecEltTy) {
+ Type *VecEltTy, InstCombiner &IC) {
+ assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
+ "Shift should be a multiple of the element type size");
+
// Undef values never contribute useful bits to the result.
if (isa<UndefValue>(V)) return true;
@@ -1505,8 +1545,12 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
if (C->isNullValue())
return true;
+ unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
+ if (IC.getDataLayout()->isBigEndian())
+ ElementIndex = Elements.size() - ElementIndex - 1;
+
// Fail if multiple elements are inserted into this slot.
- if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
+ if (Elements[ElementIndex] != 0)
return false;
Elements[ElementIndex] = V;
@@ -1522,7 +1566,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
// it to the right type so it gets properly inserted.
if (NumElts == 1)
return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
- ElementIndex, Elements, VecEltTy);
+ Shift, Elements, VecEltTy, IC);
// Okay, this is a constant that covers multiple elements. Slice it up into
// pieces and insert each element-sized piece into the vector.
@@ -1533,10 +1577,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
for (unsigned i = 0; i != NumElts; ++i) {
+ unsigned ShiftI = Shift+i*ElementSize;
Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
- i*ElementSize));
+ ShiftI));
Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
- if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy))
+ if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC))
return false;
}
return true;
@@ -1549,29 +1594,28 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
switch (I->getOpcode()) {
default: return false; // Unhandled case.
case Instruction::BitCast:
- return CollectInsertionElements(I->getOperand(0), ElementIndex,
- Elements, VecEltTy);
+ return CollectInsertionElements(I->getOperand(0), Shift,
+ Elements, VecEltTy, IC);
case Instruction::ZExt:
if (!isMultipleOfTypeSize(
I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
VecEltTy))
return false;
- return CollectInsertionElements(I->getOperand(0), ElementIndex,
- Elements, VecEltTy);
+ return CollectInsertionElements(I->getOperand(0), Shift,
+ Elements, VecEltTy, IC);
case Instruction::Or:
- return CollectInsertionElements(I->getOperand(0), ElementIndex,
- Elements, VecEltTy) &&
- CollectInsertionElements(I->getOperand(1), ElementIndex,
- Elements, VecEltTy);
+ return CollectInsertionElements(I->getOperand(0), Shift,
+ Elements, VecEltTy, IC) &&
+ CollectInsertionElements(I->getOperand(1), Shift,
+ Elements, VecEltTy, IC);
case Instruction::Shl: {
// Must be shifting by a constant that is a multiple of the element size.
ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
if (CI == 0) return false;
- if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
- unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
-
- return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
- Elements, VecEltTy);
+ Shift += CI->getZExtValue();
+ if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+ return CollectInsertionElements(I->getOperand(0), Shift,
+ Elements, VecEltTy, IC);
}
}
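
A standalone illustration (host code, not LLVM) of why the element index must be flipped on big-endian targets: the piece at bit offset 0 of the integer lands in element 0 of the vector on a little-endian layout but in the last element on a big-endian one.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint16_t Lo = 0x1111, Hi = 0x2222;
  uint32_t V = ((uint32_t)Hi << 16) | Lo;   // piece at Shift 0 is Lo, at Shift 16 is Hi
  uint16_t Elts[2];
  std::memcpy(Elts, &V, sizeof V);          // reinterpret the i32 as <2 x i16>
  if (Elts[0] == Lo)
    std::puts("little-endian: bit offset 0 maps to element 0");
  else
    std::puts("big-endian: bit offset 0 maps to the last element");
}
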
@@ -1594,12 +1638,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
/// Into two insertelements that do "buildvector{%inc, %inc5}".
static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
InstCombiner &IC) {
+ // We need to know the target byte order to perform this optimization.
+ if (!IC.getDataLayout()) return 0;
+
VectorType *DestVecTy = cast<VectorType>(CI.getType());
Value *IntInput = CI.getOperand(0);
SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
if (!CollectInsertionElements(IntInput, 0, Elements,
- DestVecTy->getElementType()))
+ DestVecTy->getElementType(), IC))
return 0;
// If we succeeded, we know that all of the element are specified by Elements
@@ -1785,10 +1832,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// Okay, we have (bitcast (shuffle ..)). Check to see if this is
// a bitcast to a vector with the same # elts.
if (SVI->hasOneUse() && DestTy->isVectorTy() &&
- cast<VectorType>(DestTy)->getNumElements() ==
- SVI->getType()->getNumElements() &&
+ DestTy->getVectorNumElements() == SVI->getType()->getNumElements() &&
SVI->getType()->getNumElements() ==
- cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+ SVI->getOperand(0)->getType()->getVectorNumElements()) {
BitCastInst *Tmp;
// If either of the operands is a cast from CI.getType(), then
// evaluating the shuffle in the casted destination's type will allow
@@ -1810,3 +1856,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
return commonPointerCastTransforms(CI);
return commonCastTransforms(CI);
}
+
+Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+ return commonCastTransforms(CI);
+}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c0225ae..9bb65ef 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -227,7 +227,8 @@ Instruction *InstCombiner::
FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
CmpInst &ICI, ConstantInt *AndCst) {
// We need TD information to know the pointer size unless this is inbounds.
- if (!GEP->isInBounds() && TD == 0) return 0;
+ if (!GEP->isInBounds() && TD == 0)
+ return 0;
Constant *Init = GV->getInitializer();
if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
@@ -393,9 +394,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// If the index is larger than the pointer size of the target, truncate the
// index down like the GEP would do implicitly. We don't have to do this for
// an inbounds GEP because the index can't be out of range.
- if (!GEP->isInBounds() &&
- Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
- Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
+ if (!GEP->isInBounds()) {
+ Type *IntPtrTy = TD->getIntPtrType(GEP->getType());
+ unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+ if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
+ Idx = Builder->CreateTrunc(Idx, IntPtrTy);
+ }
// If the comparison is only true for one or two elements, emit direct
// comparisons.
@@ -562,16 +566,18 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
}
}
+
+
// Okay, we know we have a single variable index, which must be a
// pointer/array/vector index. If there is no offset, life is simple, return
// the index.
- unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ Type *IntPtrTy = TD.getIntPtrType(GEP->getOperand(0)->getType());
+ unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
if (Offset == 0) {
// Cast to intptr_t in case a truncation occurs. If an extension is needed,
// we don't need to bother extending: the extension won't affect where the
// computation crosses zero.
if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
- Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
}
return VariableIdx;
@@ -593,7 +599,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
return 0;
// Okay, we can do this evaluation. Start by converting the index to intptr.
- Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
if (VariableIdx->getType() != IntPtrTy)
VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
true /*Signed*/);
@@ -737,10 +742,9 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
}
/// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
-Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
+Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
Value *X, ConstantInt *CI,
- ICmpInst::Predicate Pred,
- Value *TheAdd) {
+ ICmpInst::Predicate Pred) {
// If we have X+0, exit early (simplifying logic below) and let it get folded
// elsewhere. icmp X+0, X -> icmp X, X
if (CI->isZero()) {
@@ -1194,11 +1198,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
Type *AndTy = AndCST->getType(); // Type of the and.
// We can fold this as long as we can't shift unknown bits
- // into the mask. This can only happen with signed shift
- // rights, as they sign-extend.
+ // into the mask. This can happen with signed right
+ // shifts, as they sign-extend. With logical shifts,
+ // we must still make sure the comparison is not signed
+ // because we are effectively changing the
+ // position of the sign bit (PR17827).
+ // TODO: We can relax these constraints a bit more.
if (ShAmt) {
- bool CanFold = Shift->isLogicalShift();
- if (!CanFold) {
+ bool CanFold = false;
+ unsigned ShiftOpcode = Shift->getOpcode();
+ if (ShiftOpcode == Instruction::AShr) {
// To test for the bad case of the signed shr, see if any
// of the bits shifted in could be tested after the mask.
uint32_t TyBits = Ty->getPrimitiveSizeInBits();
@@ -1208,6 +1217,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
AndCST->getValue()) == 0)
CanFold = true;
+ } else if (ShiftOpcode == Instruction::Shl ||
+ ShiftOpcode == Instruction::LShr) {
+ CanFold = !ICI.isSigned();
}
if (CanFold) {
@@ -1781,8 +1793,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
// Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
// integer type is the same size as the pointer type.
if (TD && LHSCI->getOpcode() == Instruction::PtrToInt &&
- TD->getPointerSizeInBits() ==
- cast<IntegerType>(DestTy)->getBitWidth()) {
+ TD->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
Value *RHSOp = 0;
if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
@@ -2035,14 +2046,59 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,
}
+/// \brief Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
+/// should be swapped.
+/// The decision is based on how many times these two operands are reused
+/// as subtract operands and their positions in those instructions.
+/// The rationale is that several architectures use the same instruction for
+/// both subtract and cmp, so it is better if the order of those operands
+/// matches.
+/// \return true if Op0 and Op1 should be swapped.
+static bool swapMayExposeCSEOpportunities(const Value * Op0,
+ const Value * Op1) {
+ // Filter out pointer values as those cannot appear directly in a subtract.
+ // FIXME: we may want to go through inttoptrs or bitcasts.
+ if (Op0->getType()->isPointerTy())
+ return false;
+ // Count every use of both Op0 and Op1 in a subtract.
+ // Each time Op0 is the first operand, count -1: swapping is bad, the
+ // subtract already has the same layout as the compare.
+ // Each time Op0 is the second operand, count +1: swapping is good, the
+ // subtract has a different layout than the compare.
+ // At the end, if the benefit is greater than 0, Op0 should come second to
+ // expose more CSE opportunities.
+ int GlobalSwapBenefits = 0;
+ for (Value::const_use_iterator UI = Op0->use_begin(), UIEnd = Op0->use_end(); UI != UIEnd; ++UI) {
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(*UI);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
+ continue;
+ // If Op0 is the first argument, it is not beneficial to swap the
+ // arguments.
+ int LocalSwapBenefits = -1;
+ unsigned Op1Idx = 1;
+ if (BinOp->getOperand(Op1Idx) == Op0) {
+ Op1Idx = 0;
+ LocalSwapBenefits = 1;
+ }
+ if (BinOp->getOperand(Op1Idx) != Op1)
+ continue;
+ GlobalSwapBenefits += LocalSwapBenefits;
+ }
+ return GlobalSwapBenefits > 0;
+}
+
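
An illustrative source pattern (hypothetical) behind this heuristic: when the compare is rewritten to use the same operand order as an existing subtract, a target whose subtract also sets condition flags (ARM's SUBS, for instance) can serve both with one instruction.

int clampedDiff(int a, int b) {
  int d = b - a;        // sub b, a
  if (a < b)            // icmp slt a, b; swapping gives icmp sgt b, a,
    return d;           // matching the subtract's operand order
  return 0;
}
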
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
bool Changed = false;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ unsigned Op0Cplxity = getComplexity(Op0);
+ unsigned Op1Cplxity = getComplexity(Op1);
/// Orders the operands of the compare so that they are listed from most
/// complex to least complex. This puts constants before unary operators,
/// before binary operators.
- if (getComplexity(Op0) < getComplexity(Op1)) {
+ if (Op0Cplxity < Op1Cplxity ||
+ (Op0Cplxity == Op1Cplxity &&
+ swapMayExposeCSEOpportunities(Op0, Op1))) {
I.swapOperands();
std::swap(Op0, Op1);
Changed = true;
@@ -2477,7 +2533,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
case Instruction::IntToPtr:
// icmp pred inttoptr(X), null -> icmp pred X, 0
if (RHSC->isNullValue() && TD &&
- TD->getIntPtrType(RHSC->getContext()) ==
+ TD->getIntPtrType(RHSC->getType()) ==
LHSI->getOperand(0)->getType())
return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
Constant::getNullValue(LHSI->getOperand(0)->getType()));
@@ -2900,6 +2956,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Builder->CreateTrunc(B, A->getType()));
}
+ // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+ // For lshr and ashr pairs.
+ if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+ (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
+ ? ICmpInst::ICMP_UGE
+ : ICmpInst::ICMP_ULT;
+ Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+ APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+ return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
+ }
+ }
+
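
The new equality fold can be validated exhaustively for 8-bit values (illustrative only): two values have equal right-shifted forms, logical or arithmetic, exactly when their XOR stays below the shifted-out range. The arithmetic-shift line relies on the usual implementation behavior of right-shifting negative ints.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  for (unsigned C = 1; C < 8; ++C)
    for (unsigned A = 0; A < 256; ++A)
      for (unsigned B = 0; B < 256; ++B) {
        bool Folded = (A ^ B) < (1u << C);                    // (A^B) u< (1 << C)
        bool LShr   = (A >> C) == (B >> C);
        bool AShr   = ((int8_t)A >> C) == ((int8_t)B >> C);
        assert(LShr == Folded && AShr == Folded);
      }
  std::puts("ok");
}
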
// Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
// "icmp (and X, mask), cst"
uint64_t ShAmt = 0;
@@ -2930,20 +3004,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Value *X; ConstantInt *Cst;
// icmp X+Cst, X
if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
- return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0);
+ return FoldICmpAddOpCst(I, X, Cst, I.getPredicate());
// icmp X, X+Cst
if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
- return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1);
+ return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate());
}
return Changed ? &I : 0;
}
-
-
-
-
-
/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
///
Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e2d7966..4c861b3 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -154,7 +154,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Ensure that the alloca array size argument has type intptr_t, so that
// any casting is exposed early.
if (TD) {
- Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+ Type *IntPtrTy = TD->getIntPtrType(AI.getType());
if (AI.getArraySize()->getType() != IntPtrTy) {
Value *V = Builder->CreateIntCast(AI.getArraySize(),
IntPtrTy, false);
@@ -180,12 +180,13 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Now that I is pointing to the first non-allocation-inst in the block,
// insert our getelementptr instruction...
//
- Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext()));
- Value *Idx[2];
- Idx[0] = NullIdx;
- Idx[1] = NullIdx;
+ Type *IdxTy = TD
+ ? TD->getIntPtrType(AI.getType())
+ : Type::getInt64Ty(AI.getContext());
+ Value *NullIdx = Constant::getNullValue(IdxTy);
+ Value *Idx[2] = { NullIdx, NullIdx };
Instruction *GEP =
- GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub");
+ GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
InsertNewInstBefore(GEP, *It);
// Now make everything use the getelementptr instead of the original
@@ -262,9 +263,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
EraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
- Instruction *NewI
- = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
- AI.getType()));
+ Constant *Cast
+ = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
+ Instruction *NewI = ReplaceInstUsesWith(AI, Cast);
EraseInstFromFunction(*Copy);
++NumGlobalCopies;
return NewI;
@@ -302,9 +303,11 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
if (Constant *CSrc = dyn_cast<Constant>(CastOp))
if (ASrcTy->getNumElements() != 0) {
- Value *Idxs[2];
- Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
- Idxs[1] = Idxs[0];
+ Type *IdxTy = TD
+ ? TD->getIntPtrType(SrcTy)
+ : Type::getInt64Ty(SrcTy->getContext());
+ Value *Idx = Constant::getNullValue(IdxTy);
+ Value *Idxs[2] = { Idx, Idx };
CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
SrcTy = cast<PointerType>(CastOp->getType());
SrcPTy = SrcTy->getElementType();
@@ -315,7 +318,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
SrcPTy->isVectorTy()) &&
// Do not allow turning this into a load of an integer, which is then
// casted to a pointer, this pessimizes pointer analysis a lot.
- (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
+ (SrcPTy->isPtrOrPtrVectorTy() ==
+ LI.getType()->isPtrOrPtrVectorTy()) &&
IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index cc6a301..a759548 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -374,9 +374,12 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
} else {
if (C0) {
// (C0 / X) * C => (C0 * C) / X
- ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
- if (isNormalFp(F))
- R = BinaryOperator::CreateFDiv(F, Opnd1);
+ if (FMulOrDiv->hasOneUse()) {
+ // It would otherwise introduce another div.
+ ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
+ if (isNormalFp(F))
+ R = BinaryOperator::CreateFDiv(F, Opnd1);
+ }
} else {
// (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1));
@@ -460,10 +463,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (Swap && FAddSub->getOpcode() == Instruction::FSub)
std::swap(M0, M1);
- Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ?
- BinaryOperator::CreateFAdd(M0, M1) :
- BinaryOperator::CreateFSub(M0, M1);
- Instruction *RI = cast<Instruction>(R);
+ Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd)
+ ? BinaryOperator::CreateFAdd(M0, M1)
+ : BinaryOperator::CreateFSub(M0, M1);
RI->copyFastMathFlags(&I);
return RI;
}
@@ -490,13 +492,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
}
// if pattern detected emit alternate sequence
if (OpX && OpY) {
+ BuilderTy::FastMathFlagGuard Guard(*Builder);
+ Builder->SetFastMathFlags(Log2->getFastMathFlags());
Log2->setArgOperand(0, OpY);
Value *FMulVal = Builder->CreateFMul(OpX, Log2);
- Instruction *FMul = cast<Instruction>(FMulVal);
- FMul->copyFastMathFlags(Log2);
- Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX);
- FSub->copyFastMathFlags(Log2);
- return FSub;
+ Value *FSub = Builder->CreateFSub(FMulVal, OpX);
+ FSub->takeName(&I);
+ return ReplaceInstUsesWith(I, FSub);
}
}
@@ -506,6 +508,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
for (int i = 0; i < 2; i++) {
bool IgnoreZeroSign = I.hasNoSignedZeros();
if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
+ BuilderTy::FastMathFlagGuard Guard(*Builder);
+ Builder->SetFastMathFlags(I.getFastMathFlags());
+
Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
@@ -516,13 +521,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (Opnd0->hasOneUse()) {
// -X * Y => -(X*Y) (Promote negation as high as possible)
Value *T = Builder->CreateFMul(N0, Opnd1);
- cast<Instruction>(T)->setDebugLoc(I.getDebugLoc());
- Instruction *Neg = BinaryOperator::CreateFNeg(T);
- if (I.getFastMathFlags().any()) {
- cast<Instruction>(T)->copyFastMathFlags(&I);
- Neg->copyFastMathFlags(&I);
- }
- return Neg;
+ Value *Neg = Builder->CreateFNeg(T);
+ Neg->takeName(&I);
+ return ReplaceInstUsesWith(I, Neg);
}
}
@@ -545,13 +546,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Y = Opnd0_0;
if (Y) {
- Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1));
- T->copyFastMathFlags(&I);
- T->setDebugLoc(I.getDebugLoc());
+ BuilderTy::FastMathFlagGuard Guard(*Builder);
+ Builder->SetFastMathFlags(I.getFastMathFlags());
+ Value *T = Builder->CreateFMul(Opnd1, Opnd1);
- Instruction *R = BinaryOperator::CreateFMul(T, Y);
- R->copyFastMathFlags(&I);
- return R;
+ Value *R = Builder->CreateFMul(T, Y);
+ R->takeName(&I);
+ return ReplaceInstUsesWith(I, R);
}
}
}
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index bd14e81..4c6d0c4 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -604,8 +604,6 @@ namespace llvm {
LHS.Width == RHS.Width;
}
};
- template <>
- struct isPodLike<LoweredPHIRecord> { static const bool value = true; };
}
@@ -688,10 +686,10 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// extracted out of it. First, sort the users by their offset and size.
array_pod_sort(PHIUsers.begin(), PHIUsers.end());
- DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
- errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
- );
+ DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';
+ );
// PredValues - This is a temporary used when rewriting PHI nodes. It is
// hoisted out here to avoid construction/destruction thrashing.
@@ -772,7 +770,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
}
PredValues.clear();
- DEBUG(errs() << " Made element PHI for offset " << Offset << ": "
+ DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
<< *EltPHI << '\n');
ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
}
@@ -792,7 +790,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
- if (Value *V = SimplifyInstruction(&PN, TD))
+ if (Value *V = SimplifyInstruction(&PN, TD, TLI))
return ReplaceInstUsesWith(PN, V);
// If all PHI operands are the same operation, pull them through the PHI,
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 7581dbe..283bec2 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -367,7 +367,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
Value *FalseVal,
InstCombiner::BuilderTy *Builder) {
const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
- if (!IC || !IC->isEquality())
+ if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
return 0;
Value *CmpLHS = IC->getOperand(0);
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index a7bfe09..c831ddd 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -808,7 +808,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// TODO: Could compute known zero/one bits based on the input.
break;
}
- case Intrinsic::x86_sse42_crc32_64_8:
case Intrinsic::x86_sse42_crc32_64_64:
KnownZero = APInt::getHighBitsSet(64, 32);
return 0;
@@ -845,21 +844,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
- unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
- unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+ const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue();
+ const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue();
+ if (!ShlOp1 || !ShrOp1)
+ return 0; // Noop.
+
+ Value *VarX = Shr->getOperand(0);
+ Type *Ty = VarX->getType();
+ unsigned BitWidth = Ty->getIntegerBitWidth();
+ if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+ return 0; // Undef.
+
+ unsigned ShlAmt = ShlOp1.getZExtValue();
+ unsigned ShrAmt = ShrOp1.getZExtValue();
KnownOne.clearAllBits();
KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
KnownZero &= DemandedMask;
- if (ShlAmt == 0 || ShrAmt == 0)
- return 0;
-
- Value *VarX = Shr->getOperand(0);
- Type *Ty = VarX->getType();
-
- APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
- APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+ APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+ APInt BitMask2(APInt::getAllOnesValue(BitWidth));
bool isLshr = (Shr->getOpcode() == Instruction::LShr);
BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index f3de6e2..1e72410 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -106,8 +106,8 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) {
}
// If we have a PHI node with a vector type that has only 2 uses: feed
-// itself and be an operand of extractelemnt at a constant location,
-// try to replace the PHI of the vector type with a PHI of a scalar type
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
// Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
if (!PN->hasNUses(2))
@@ -282,6 +282,38 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
Worklist.AddValue(EE);
return CastInst::Create(CI->getOpcode(), EE, EI.getType());
}
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ if (SI->hasOneUse()) {
+ // TODO: For a select on vectors, it might be useful to do this if it
+      // has multiple extractelement uses. For a vector select, that seems to
+ // fight the vectorizer.
+
+ // If we are extracting an element from a vector select or a select on
+      // vectors, create a select on the scalars extracted from the vector
+      // arguments.
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+
+ Value *Cond = SI->getCondition();
+ if (Cond->getType()->isVectorTy()) {
+ Cond = Builder->CreateExtractElement(Cond,
+ EI.getIndexOperand(),
+ Cond->getName() + ".elt");
+ }
+
+ Value *V1Elem
+ = Builder->CreateExtractElement(TrueVal,
+ EI.getIndexOperand(),
+ TrueVal->getName() + ".elt");
+
+ Value *V2Elem
+ = Builder->CreateExtractElement(FalseVal,
+ EI.getIndexOperand(),
+ FalseVal->getName() + ".elt");
+ return SelectInst::Create(Cond,
+ V1Elem,
+ V2Elem,
+ SI->getName() + ".elt");
+ }
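In scalar terms, the transformation above says that taking lane i of select(C, T, F) is the same as selecting between lane i of T and lane i of F. A tiny illustration with ordinary arrays (not LLVM code; scalar-condition case only):

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> T{1, 2, 3, 4}, F{5, 6, 7, 8};
  bool C = true;                          // scalar select condition
  unsigned i = 2;                         // constant extraction index
  std::array<int, 4> Sel = C ? T : F;     // "select" over the whole vectors
  assert(Sel[i] == (C ? T[i] : F[i]));    // equals a select over the extracted lanes
  return 0;
}
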
}
}
return 0;
@@ -294,7 +326,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
SmallVectorImpl<Constant*> &Mask) {
assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
"Invalid CollectSingleShuffleElements");
- unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+ unsigned NumElts = V->getType()->getVectorNumElements();
if (isa<UndefValue>(V)) {
Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index 19959c0..f84db27 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -37,7 +37,7 @@ public:
/// in it.
void Add(Instruction *I) {
if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
- DEBUG(errs() << "IC: ADD: " << *I << '\n');
+ DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
Worklist.push_back(I);
}
}
@@ -54,7 +54,7 @@ public:
assert(Worklist.empty() && "Worklist must be empty to add initial group");
Worklist.reserve(NumEntries+16);
WorklistMap.resize(NumEntries);
- DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
+ DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
for (unsigned Idx = 0; NumEntries; --NumEntries) {
Instruction *I = List[NumEntries-1];
WorklistMap.insert(std::make_pair(I, Idx++));
@@ -74,8 +74,7 @@ public:
}
Instruction *RemoveOne() {
- Instruction *I = Worklist.back();
- Worklist.pop_back();
+ Instruction *I = Worklist.pop_back_val();
WorklistMap.erase(I);
return I;
}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index b34ae21..191a101 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -699,7 +699,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
Value *InV = 0;
- if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+      // Beware of ConstantExpr: it may eventually evaluate to getNullValue,
+      // even if isNullValue() currently returns false.
+ Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
+ if (InC && !isa<ConstantExpr>(InC))
InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
else
InV = Builder->CreateSelect(PN->getIncomingValue(i),
@@ -755,19 +758,25 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
return ReplaceInstUsesWith(I, NewPN);
}
-/// FindElementAtOffset - Given a type and a constant offset, determine whether
-/// or not there is a sequence of GEP indices into the type that will land us at
-/// the specified offset. If so, fill them into NewIndices and return the
-/// resultant element type, otherwise return null.
-Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
- SmallVectorImpl<Value*> &NewIndices) {
- if (!TD) return 0;
- if (!Ty->isSized()) return 0;
+/// FindElementAtOffset - Given a pointer type and a constant offset, determine
+/// whether or not there is a sequence of GEP indices into the pointed type that
+/// will land us at the specified offset. If so, fill them into NewIndices and
+/// return the resultant element type, otherwise return null.
+Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
+ SmallVectorImpl<Value*> &NewIndices) {
+ assert(PtrTy->isPtrOrPtrVectorTy());
+
+ if (!TD)
+ return 0;
+
+ Type *Ty = PtrTy->getPointerElementType();
+ if (!Ty->isSized())
+ return 0;
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
- Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(PtrTy);
int64_t FirstIdx = 0;
if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
@@ -1176,6 +1185,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
}
+ // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y))
+ // The GEP pattern is emitted by the SCEV expander for certain kinds of
+ // pointer arithmetic.
+ if (TD && GEP.getNumIndices() == 1 &&
+ match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) {
+ unsigned AS = GEP.getPointerAddressSpace();
+ if (GEP.getType() == Builder->getInt8PtrTy(AS) &&
+ GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+ TD->getPointerSizeInBits(AS)) {
+ Operator *Index = cast<Operator>(GEP.getOperand(1));
+ Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType());
+ Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1));
+ return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+ }
+ }
+
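The canonicalization above is address arithmetic modulo the pointer width: adding the negated ptrtoint of Y to the ptrtoint of X is the same as subtracting it, so the GEP can be rewritten as an integer subtraction fed back through inttoptr. In plain integer terms (a sketch assuming a 64-bit pointer width; the addresses are made-up example values):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x7fffde0100f0ULL;   // ptrtoint of the i8* base
  uint64_t Y = 0x7fffde010020ULL;   // ptrtoint of the other pointer
  // "gep i8* X, -(ptrtoint Y)" computes X + (0 - Y) with unsigned wraparound,
  // which equals (ptrtoint X) - (ptrtoint Y).
  assert(X + (0 - Y) == X - Y);
  return 0;
}
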
// Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
Value *StrippedPtr = PtrOp->stripPointerCasts();
PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
@@ -1231,13 +1256,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
// into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
Type *SrcElTy = StrippedPtrTy->getElementType();
- Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+ Type *ResElTy = PtrOp->getType()->getPointerElementType();
if (TD && SrcElTy->isArrayTy() &&
- TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+ TD->getTypeAllocSize(SrcElTy->getArrayElementType()) ==
TD->getTypeAllocSize(ResElTy)) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
- Idx[1] = GEP.getOperand(1);
+ Type *IdxType = TD->getIntPtrType(GEP.getType());
+ Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
Value *NewGEP = GEP.isInBounds() ?
Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
@@ -1261,7 +1285,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Earlier transforms ensure that the index has type IntPtrType, which
// considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+ assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) &&
"Index not cast to pointer width?");
bool NSW;
@@ -1287,8 +1311,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
- uint64_t ArrayEltSize =
- TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+ uint64_t ArrayEltSize
+ = TD->getTypeAllocSize(SrcElTy->getArrayElementType());
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -1296,7 +1320,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Earlier transforms ensure that the index has type IntPtrType, which
// considerably simplifies the logic by eliminating implicit casts.
- assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+ assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) &&
"Index not cast to pointer width?");
bool NSW;
@@ -1304,9 +1328,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
// If the multiplication NewIdx * Scale may overflow then the new
// GEP may not be "inbounds".
- Value *Off[2];
- Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
- Off[1] = NewIdx;
+ Value *Off[2] = {
+ Constant::getNullValue(TD->getIntPtrType(GEP.getType())),
+ NewIdx
+ };
+
Value *NewGEP = GEP.isInBounds() && NSW ?
Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
@@ -1318,15 +1344,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
+ if (!TD)
+ return 0;
+
/// See if we can simplify:
/// X = bitcast A* to B*
/// Y = gep X, <...constant indices...>
/// into a gep of the original struct. This is important for SROA and alias
/// analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
- APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
- if (TD &&
- !isa<BitCastInst>(BCI->getOperand(0)) &&
+ Value *Operand = BCI->getOperand(0);
+ PointerType *OpType = cast<PointerType>(Operand->getType());
+ unsigned OffsetBits = TD->getPointerTypeSizeInBits(OpType);
+ APInt Offset(OffsetBits, 0);
+ if (!isa<BitCastInst>(Operand) &&
GEP.accumulateConstantOffset(*TD, Offset) &&
StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
@@ -1335,8 +1366,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (!Offset) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(BCI->getOperand(0)) ||
- isAllocationFn(BCI->getOperand(0), TLI)) {
+ if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1347,19 +1377,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return &GEP;
}
}
- return new BitCastInst(BCI->getOperand(0), GEP.getType());
+ return new BitCastInst(Operand, GEP.getType());
}
// Otherwise, if the offset is non-zero, we need to find out if there is a
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
SmallVector<Value*, 8> NewIndices;
- Type *InTy =
- cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
- if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) {
+ if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
- Builder->CreateGEP(BCI->getOperand(0), NewIndices);
+ Builder->CreateInBoundsGEP(Operand, NewIndices) :
+ Builder->CreateGEP(Operand, NewIndices);
if (NGEP->getType() == GEP.getType())
return ReplaceInstUsesWith(GEP, NGEP);
@@ -1372,8 +1400,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
return 0;
}
-
-
static bool
isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
const TargetLibraryInfo *TLI) {
@@ -2209,7 +2235,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
// DCE instruction if trivially dead.
if (isInstructionTriviallyDead(Inst, TLI)) {
++NumDeadInst;
- DEBUG(errs() << "IC: DCE: " << *Inst << '\n');
+ DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
Inst->eraseFromParent();
continue;
}
@@ -2217,7 +2243,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
// ConstantProp instruction if trivially constant.
if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0)))
if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) {
- DEBUG(errs() << "IC: ConstFold to: " << *C << " from: "
+ DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: "
<< *Inst << '\n');
Inst->replaceAllUsesWith(C);
++NumConstProp;
@@ -2293,7 +2319,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
MadeIRChange = false;
- DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
{
@@ -2338,7 +2364,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, TLI)) {
- DEBUG(errs() << "IC: DCE: " << *I << '\n');
+ DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
EraseInstFromFunction(*I);
++NumDeadInst;
MadeIRChange = true;
@@ -2348,7 +2374,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// Instruction isn't dead, see if we can constant propagate it.
if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
- DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+ DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
// Add operands to the worklist.
ReplaceInstUsesWith(*I, C);
@@ -2396,13 +2422,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
std::string OrigI;
#endif
DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
- DEBUG(errs() << "IC: Visiting: " << OrigI << '\n');
+ DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
if (Instruction *Result = visit(*I)) {
++NumCombined;
// Should we replace the old instruction with a new one?
if (Result != I) {
- DEBUG(errs() << "IC: Old = " << *I << '\n'
+ DEBUG(dbgs() << "IC: Old = " << *I << '\n'
<< " New = " << *Result << '\n');
if (!I->getDebugLoc().isUnknown())
@@ -2431,7 +2457,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
EraseInstFromFunction(*I);
} else {
#ifndef NDEBUG
- DEBUG(errs() << "IC: Mod = " << OrigI << '\n'
+ DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
<< " New = " << *I << '\n');
#endif
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index d77e20b..d731ec5 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/DIBuilder.h"
@@ -59,6 +60,7 @@ static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000; // < 2G.
static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000;
+static const size_t kMinStackMallocSize = 1 << 6; // 64B
static const size_t kMaxStackMallocSize = 1 << 16; // 64K
static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
@@ -75,21 +77,30 @@ static const char *const kAsanUnregisterGlobalsName =
static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
static const char *const kAsanInitName = "__asan_init_v3";
+static const char *const kAsanCovName = "__sanitizer_cov";
static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
static const char *const kAsanMappingOffsetName = "__asan_mapping_offset";
static const char *const kAsanMappingScaleName = "__asan_mapping_scale";
-static const char *const kAsanStackMallocName = "__asan_stack_malloc";
-static const char *const kAsanStackFreeName = "__asan_stack_free";
+static const int kMaxAsanStackMallocSizeClass = 10;
+static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
+static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
static const char *const kAsanGenPrefix = "__asan_gen_";
static const char *const kAsanPoisonStackMemoryName =
"__asan_poison_stack_memory";
static const char *const kAsanUnpoisonStackMemoryName =
"__asan_unpoison_stack_memory";
+static const char *const kAsanOptionDetectUAR =
+ "__asan_option_detect_stack_use_after_return";
+
+// These constants must match the definitions in the run-time library.
static const int kAsanStackLeftRedzoneMagic = 0xf1;
static const int kAsanStackMidRedzoneMagic = 0xf2;
static const int kAsanStackRightRedzoneMagic = 0xf3;
static const int kAsanStackPartialRedzoneMagic = 0xf4;
+#ifndef NDEBUG
+static const int kAsanStackAfterReturnMagic = 0xf5;
+#endif
// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
static const size_t kNumberOfAccessSizes = 5;
@@ -124,6 +135,8 @@ static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
// This flag may need to be replaced with -f[no]asan-globals.
static cl::opt<bool> ClGlobals("asan-globals",
cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClCoverage("asan-coverage",
+ cl::desc("ASan coverage"), cl::Hidden, cl::init(false));
static cl::opt<bool> ClInitializers("asan-initialization-order",
cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
static cl::opt<bool> ClMemIntrin("asan-memintrin",
@@ -184,6 +197,13 @@ static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
cl::Hidden, cl::init(-1));
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOptimizedAccessesToGlobalArray,
+ "Number of optimized accesses to global arrays");
+STATISTIC(NumOptimizedAccessesToGlobalVar,
+ "Number of optimized accesses to global vars");
+
namespace {
/// A set of dynamically initialized globals extracted from metadata.
class SetOfDynamicallyInitializedGlobals {
@@ -306,6 +326,8 @@ struct AddressSanitizer : public FunctionPass {
bool ShouldInstrumentGlobal(GlobalVariable *G);
bool LooksLikeCodeInBug11395(Instruction *I);
void FindDynamicInitializers(Module &M);
+ bool GlobalIsLinkerInitialized(GlobalVariable *G);
+ bool InjectCoverage(Function &F);
bool CheckInitOrder;
bool CheckUseAfterReturn;
@@ -321,6 +343,7 @@ struct AddressSanitizer : public FunctionPass {
Function *AsanCtorFunction;
Function *AsanInitFunction;
Function *AsanHandleNoReturnFunc;
+ Function *AsanCovFunction;
OwningPtr<SpecialCaseList> BL;
// This array is indexed by AccessIsWrite and log2(AccessSize).
Function *AsanErrorCallback[2][kNumberOfAccessSizes];
@@ -396,12 +419,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
uint64_t TotalStackSize;
unsigned StackAlignment;
- Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+ Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
+ *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
// Stores a place and arguments of poisoning/unpoisoning call for alloca.
struct AllocaPoisonCall {
IntrinsicInst *InsBefore;
+ AllocaInst *AI;
uint64_t Size;
bool DoPoison;
};
@@ -480,7 +505,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
if (!AI) return;
bool DoPoison = (ID == Intrinsic::lifetime_end);
- AllocaPoisonCall APC = {&II, SizeValue, DoPoison};
+ AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
AllocaPoisonCallVec.push_back(APC);
}
@@ -488,7 +513,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
void initializeCallbacks(Module &M);
// Check if we want (and can) handle this alloca.
- bool isInterestingAlloca(AllocaInst &AI) {
+ bool isInterestingAlloca(AllocaInst &AI) const {
return (!AI.isArrayAllocation() &&
AI.isStaticAlloca() &&
AI.getAlignment() <= RedzoneSize() &&
@@ -498,24 +523,27 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
size_t RedzoneSize() const {
return RedzoneSizeForScale(Mapping.Scale);
}
- uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
+ uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
Type *Ty = AI->getAllocatedType();
uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty);
return SizeInBytes;
}
- uint64_t getAlignedSize(uint64_t SizeInBytes) {
+ uint64_t getAlignedSize(uint64_t SizeInBytes) const {
size_t RZ = RedzoneSize();
return ((SizeInBytes + RZ - 1) / RZ) * RZ;
}
- uint64_t getAlignedAllocaSize(AllocaInst *AI) {
+ uint64_t getAlignedAllocaSize(AllocaInst *AI) const {
uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
return getAlignedSize(SizeInBytes);
}
/// Finds alloca where the value comes from.
AllocaInst *findAllocaForValue(Value *V);
- void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
+ void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB,
Value *ShadowBase, bool DoPoison);
- void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison);
+ void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
+
+ void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase,
+ int Size);
};
} // namespace
@@ -642,6 +670,13 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
return NULL;
}
+bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
+  // If a global variable does not have dynamic initialization we don't
+  // have to instrument it. However, if a global does not have an initializer
+  // at all, we assume it has a dynamic initializer (in another TU).
+ return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G);
+}
+
void AddressSanitizer::instrumentMop(Instruction *I) {
bool IsWrite = false;
Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
@@ -650,13 +685,19 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
// If initialization order checking is disabled, a simple access to a
// dynamically initialized global is always valid.
- if (!CheckInitOrder)
- return;
- // If a global variable does not have dynamic initialization we don't
- // have to instrument it. However, if a global does not have initailizer
- // at all, we assume it has dynamic initializer (in other TU).
- if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G))
+ if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) {
+ NumOptimizedAccessesToGlobalVar++;
return;
+ }
+ }
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr);
+ if (CE && CE->isGEPWithNoNotionalOverIndexing()) {
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+ if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) {
+ NumOptimizedAccessesToGlobalArray++;
+ return;
+ }
+ }
}
}
@@ -668,6 +709,11 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
assert((TypeSize % 8) == 0);
+ if (IsWrite)
+ NumInstrumentedWrites++;
+ else
+ NumInstrumentedReads++;
+
// Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check.
if (TypeSize == 8 || TypeSize == 16 ||
TypeSize == 32 || TypeSize == 64 || TypeSize == 128)
@@ -883,7 +929,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
TD = getAnalysisIfAvailable<DataLayout>();
if (!TD)
return false;
- BL.reset(new SpecialCaseList(BlacklistFile));
+ BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
if (BL->isIn(M)) return false;
C = &(M.getContext());
int LongSize = TD->getPointerSizeInBits();
@@ -914,8 +960,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
IntptrTy, IntptrTy,
IntptrTy, IntptrTy, NULL);
- SmallVector<Constant *, 16> Initializers(n), DynamicInit;
-
+ SmallVector<Constant *, 16> Initializers(n);
Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
assert(CtorFunc);
@@ -1046,6 +1091,8 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+ AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanCovName, IRB.getVoidTy(), IntptrTy, NULL));
// We insert an empty inline asm after __asan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
@@ -1076,7 +1123,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
if (!TD)
return false;
- BL.reset(new SpecialCaseList(BlacklistFile));
+ BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
DynamicallyInitializedGlobals.Init(M);
C = &(M.getContext());
@@ -1117,6 +1164,47 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
return false;
}
+// Poor man's coverage that works with ASan.
+// We create a Guard boolean variable with the same linkage
+// as the function and inject this code into the entry block:
+// if (!*Guard) {
+// __sanitizer_cov(&F);
+// *Guard = 1;
+// }
+// The accesses to Guard are atomic. The rest of the logic is
+// in __sanitizer_cov (it's fine to call it more than once).
+//
+// This coverage implementation provides very limited data:
+// it only tells if a given function was ever executed.
+// No counters, no per-basic-block or per-edge data.
+// But for many use cases this is what we need and the added slowdown
+// is negligible. This simple implementation will probably be obsoleted
+// by the upcoming Clang-based coverage implementation.
+// By having it here and now we hope to
+// a) get the functionality to users earlier and
+// b) collect usage statistics to help improve Clang coverage design.
+bool AddressSanitizer::InjectCoverage(Function &F) {
+ if (!ClCoverage) return false;
+ IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt());
+ Type *Int8Ty = IRB.getInt8Ty();
+ GlobalVariable *Guard = new GlobalVariable(
+ *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName());
+ LoadInst *Load = IRB.CreateLoad(Guard);
+ Load->setAtomic(Monotonic);
+ Load->setAlignment(1);
+ Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
+ Instruction *Ins = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+ IRB.SetInsertPoint(Ins);
+ // We pass &F to __sanitizer_cov. We could avoid this and rely on
+ // GET_CALLER_PC, but having the PC of the first instruction is just nice.
+ IRB.CreateCall(AsanCovFunction, IRB.CreatePointerCast(&F, IntptrTy));
+ StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
+ Store->setAtomic(Monotonic);
+ Store->setAlignment(1);
+ return true;
+}
+
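At the source level, the injected entry-block code behaves roughly like the sketch below. This is illustrative only: the guard name and the exact runtime signature of __sanitizer_cov are assumptions, and LLVM's Monotonic ordering is rendered here as relaxed atomics.

#include <atomic>
#include <cstdint>

extern "C" void __sanitizer_cov(uintptr_t pc);   // assumed runtime entry point

static std::atomic<unsigned char> guard{0};      // one guard variable per function

void instrumented_entry(void *thisFunction) {
  if (guard.load(std::memory_order_relaxed) == 0) {       // atomic load, alignment 1
    __sanitizer_cov(reinterpret_cast<uintptr_t>(thisFunction));
    guard.store(1, std::memory_order_relaxed);             // atomic store, alignment 1
  }
}
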
bool AddressSanitizer::runOnFunction(Function &F) {
if (BL->isIn(F)) return false;
if (&F == AsanCtorFunction) return false;
@@ -1212,6 +1300,10 @@ bool AddressSanitizer::runOnFunction(Function &F) {
}
bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
+
+ if (InjectCoverage(F))
+ res = true;
+
DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n");
if (ClKeepUninstrumented) {
@@ -1271,11 +1363,15 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
void FunctionStackPoisoner::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
- AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
- AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanStackFreeName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy, NULL));
+ for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+ std::string Suffix = itostr(i);
+ AsanStackMallocFunc[i] = checkInterfaceFunction(
+ M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
+ IntptrTy, IntptrTy, NULL));
+ AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy,
+ IntptrTy, IntptrTy, NULL));
+ }
AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
@@ -1283,7 +1379,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
}
void FunctionStackPoisoner::poisonRedZones(
- const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase,
+ const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase,
bool DoPoison) {
size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale;
assert(ShadowRZSize >= 1 && ShadowRZSize <= 4);
@@ -1344,12 +1440,40 @@ void FunctionStackPoisoner::poisonRedZones(
}
}
+// The fake stack allocator (asan_fake_stack.h) has 11 size classes, one for
+// each power of 2 from kMinStackMallocSize up to kMaxStackMallocSize; the
+// largest class index is kMaxAsanStackMallocSizeClass.
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+ assert(LocalStackSize <= kMaxStackMallocSize);
+ uint64_t MaxSize = kMinStackMallocSize;
+ for (int i = 0; ; i++, MaxSize *= 2)
+ if (LocalStackSize <= MaxSize)
+ return i;
+ llvm_unreachable("impossible LocalStackSize");
+}
+
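A few concrete values for this mapping, assuming kMinStackMallocSize = 64 and kMaxStackMallocSize = 64K as declared earlier in this file (a standalone restatement, not the pass code):

#include <cassert>
#include <cstdint>

static int stackMallocSizeClass(uint64_t LocalStackSize) {
  assert(LocalStackSize <= 65536);          // kMaxStackMallocSize
  uint64_t MaxSize = 64;                    // kMinStackMallocSize
  for (int i = 0; ; i++, MaxSize *= 2)
    if (LocalStackSize <= MaxSize)
      return i;
}

int main() {
  assert(stackMallocSizeClass(1) == 0);       // anything up to 64 bytes -> class 0
  assert(stackMallocSizeClass(65) == 1);      // 65..128 bytes -> class 1
  assert(stackMallocSizeClass(4096) == 6);    // 64 * 2^6
  assert(stackMallocSizeClass(65536) == 10);  // kMaxStackMallocSize -> last class
  return 0;
}
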
+// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic.
+// We cannot use the memset intrinsic because it may end up calling the actual
+// memset. Size is a multiple of 8.
+// Currently this generates 8-byte stores on x86_64; it may be better to
+// generate wider stores.
+void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
+ IRBuilder<> &IRB, Value *ShadowBase, int Size) {
+ assert(!(Size % 8));
+ assert(kAsanStackAfterReturnMagic == 0xf5);
+ for (int i = 0; i < Size; i += 8) {
+ Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+ IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL),
+ IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo()));
+ }
+}
+
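What the inlined poisoning amounts to at run time: fill Size shadow bytes (a multiple of 8) with the 0xf5 magic using one 8-byte store per iteration. A host-side sketch, not the instrumentation itself:

#include <cassert>
#include <cstdint>
#include <cstring>

static void poisonShadow(uint8_t *ShadowBase, int Size) {
  const uint64_t Magic = 0xf5f5f5f5f5f5f5f5ULL;   // kAsanStackAfterReturnMagic x 8
  for (int i = 0; i < Size; i += 8)
    std::memcpy(ShadowBase + i, &Magic, 8);       // one 8-byte store per iteration
}

int main() {
  uint8_t Shadow[32];
  poisonShadow(Shadow, sizeof(Shadow));
  for (uint8_t B : Shadow)
    assert(B == 0xf5);
  return 0;
}
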
void FunctionStackPoisoner::poisonStack() {
uint64_t LocalStackSize = TotalStackSize +
(AllocaVec.size() + 1) * RedzoneSize();
bool DoStackMalloc = ASan.CheckUseAfterReturn
&& LocalStackSize <= kMaxStackMallocSize;
+ int StackMallocIdx = -1;
assert(AllocaVec.size() > 0);
Instruction *InsBefore = AllocaVec[0];
@@ -1367,8 +1491,28 @@ void FunctionStackPoisoner::poisonStack() {
Value *LocalStackBase = OrigStackBase;
if (DoStackMalloc) {
- LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc,
+ // LocalStackBase = OrigStackBase
+ // if (__asan_option_detect_stack_use_after_return)
+ // LocalStackBase = __asan_stack_malloc_N(LocalStackBase, OrigStackBase);
+ StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+ assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+ Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal(
+ kAsanOptionDetectUAR, IRB.getInt32Ty());
+ Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR),
+ Constant::getNullValue(IRB.getInt32Ty()));
+ Instruction *Term =
+ SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+ BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent();
+ IRBuilder<> IRBIf(Term);
+ LocalStackBase = IRBIf.CreateCall2(
+ AsanStackMallocFunc[StackMallocIdx],
ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
+ BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent();
+ IRB.SetInsertPoint(InsBefore);
+ PHINode *Phi = IRB.CreatePHI(IntptrTy, 2);
+ Phi->addIncoming(OrigStackBase, CmpBlock);
+ Phi->addIncoming(LocalStackBase, SetBlock);
+ LocalStackBase = Phi;
}
// This string will be parsed by the run-time (DescribeAddressIfStack).
@@ -1380,11 +1524,10 @@ void FunctionStackPoisoner::poisonStack() {
bool HavePoisonedAllocas = false;
for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) {
const AllocaPoisonCall &APC = AllocaPoisonCallVec[i];
- IntrinsicInst *II = APC.InsBefore;
- AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
- assert(AI);
- IRBuilder<> IRB(II);
- poisonAlloca(AI, APC.Size, IRB, APC.DoPoison);
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ IRBuilder<> IRB(APC.InsBefore);
+ poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
HavePoisonedAllocas |= APC.DoPoison;
}
@@ -1442,10 +1585,35 @@ void FunctionStackPoisoner::poisonStack() {
// Unpoison the stack.
poisonRedZones(AllocaVec, IRBRet, ShadowBase, false);
if (DoStackMalloc) {
+ assert(StackMallocIdx >= 0);
// In use-after-return mode, mark the whole stack frame unaddressable.
- IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase,
- ConstantInt::get(IntptrTy, LocalStackSize),
- OrigStackBase);
+ if (StackMallocIdx <= 4) {
+ // For small sizes inline the whole thing:
+ // if LocalStackBase != OrigStackBase:
+ // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
+ // **SavedFlagPtr(LocalStackBase) = 0
+ // FIXME: if LocalStackBase != OrigStackBase don't call poisonRedZones.
+ Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase);
+ TerminatorInst *PoisonTerm =
+ SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+ IRBuilder<> IRBPoison(PoisonTerm);
+ int ClassSize = kMinStackMallocSize << StackMallocIdx;
+ SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase,
+ ClassSize >> Mapping.Scale);
+ Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
+ LocalStackBase,
+ ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
+ Value *SavedFlagPtr = IRBPoison.CreateLoad(
+ IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+ IRBPoison.CreateStore(
+ Constant::getNullValue(IRBPoison.getInt8Ty()),
+ IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
+ } else {
+ // For larger frames call __asan_stack_free_*.
+ IRBRet.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase,
+ ConstantInt::get(IntptrTy, LocalStackSize),
+ OrigStackBase);
+ }
} else if (HavePoisonedAllocas) {
// If we poisoned some allocas in llvm.lifetime analysis,
// unpoison whole stack frame now.
@@ -1460,7 +1628,7 @@ void FunctionStackPoisoner::poisonStack() {
}
void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
- IRBuilder<> IRB, bool DoPoison) {
+ IRBuilder<> &IRB, bool DoPoison) {
// For now just insert the call to ASan runtime.
Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
Value *SizeArg = ConstantInt::get(IntptrTy, Size);
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index b094d42..7a9f0f6 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -80,7 +80,7 @@ BasicBlock *BoundsChecking::getTrapBB() {
return TrapBB;
Function *Fn = Inst->getParent()->getParent();
- BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint();
+ IRBuilder<>::InsertPointGuard Guard(*Builder);
TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
Builder->SetInsertPoint(TrapBB);
@@ -91,7 +91,6 @@ BasicBlock *BoundsChecking::getTrapBB() {
TrapCall->setDebugLoc(Inst->getDebugLoc());
Builder->CreateUnreachable();
- Builder->SetInsertPoint(PrevInsertPoint);
return TrapBB;
}
@@ -173,7 +172,8 @@ bool BoundsChecking::runOnFunction(Function &F) {
TrapBB = 0;
BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
Builder = &TheBuilder;
- ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext());
+ ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext(),
+ /*RoundToAlign=*/true);
ObjSizeEval = &TheObjSizeEval;
// check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 5e34863..3563593 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,14 +1,11 @@
add_llvm_library(LLVMInstrumentation
AddressSanitizer.cpp
BoundsChecking.cpp
+ DataFlowSanitizer.cpp
DebugIR.cpp
- EdgeProfiling.cpp
GCOVProfiling.cpp
MemorySanitizer.cpp
Instrumentation.cpp
- OptimalEdgeProfiling.cpp
- PathProfiling.cpp
- ProfilingUtils.cpp
ThreadSanitizer.cpp
)
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
new file mode 100644
index 0000000..9b9e725
--- /dev/null
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -0,0 +1,1397 @@
+//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
+/// analysis.
+///
+/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
+/// class of bugs on its own. Instead, it provides a generic dynamic data flow
+/// analysis framework to be used by clients to help detect application-specific
+/// issues within their own code.
+///
+/// The analysis is based on automatic propagation of data flow labels (also
+/// known as taint labels) through a program as it performs computation. Each
+/// byte of application memory is backed by two bytes of shadow memory which
+/// hold the label. On Linux/x86_64, memory is laid out as follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// | |
+/// | unused |
+/// | |
+/// +--------------------+ 0x200200000000 (kUnusedAddr)
+/// | union table |
+/// +--------------------+ 0x200000000000 (kUnionTableAddr)
+/// | shadow memory |
+/// +--------------------+ 0x000000010000 (kShadowAddr)
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+/// To derive a shadow memory address from an application memory address,
+/// bits 44-46 are cleared to bring the address into the range
+/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
+/// account for the double byte representation of shadow labels and move the
+/// address into the shadow memory range. See the function
+/// DataFlowSanitizer::getShadowAddress below.
+///
+/// For more information, please refer to the design document:
+/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
+
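A worked example of the shadow address computation described above, under the Linux/x86_64 layout in the diagram (clear bits 44-46, then scale by 2 for the two-byte labels). This is standalone arithmetic for illustration, not the pass's IR-building code:

#include <cassert>
#include <cstdint>

static uint64_t shadowFor(uint64_t AppAddr) {
  return (AppAddr & ~0x700000000000ULL) * 2;   // clear bits 44-46, scale by label width
}

int main() {
  // The lowest application address maps to the start of shadow memory (kShadowAddr).
  assert(shadowFor(0x700000008000ULL) == 0x000000010000ULL);
  // Shadow addresses stay below the union table at kUnionTableAddr.
  assert(shadowFor(0x7fffffffffffULL) < 0x200000000000ULL);
  return 0;
}
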
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
+#include <iterator>
+
+using namespace llvm;
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct. For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16. This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+ "dfsan-preserve-alignment",
+ cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+ cl::init(false));
+
+// The ABI list file controls how shadow parameters are passed. The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function is
+// unknown. The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::opt<std::string> ClABIListFile(
+ "dfsan-abilist",
+ cl::desc("File listing native ABI functions and how the pass treats them"),
+ cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+ "dfsan-args-abi",
+ cl::desc("Use the argument ABI rather than the TLS ABI"),
+ cl::Hidden);
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+ "dfsan-debug-nonzero-labels",
+ cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+ "load or return with a nonzero label"),
+ cl::Hidden);
+
+namespace {
+
+class DataFlowSanitizer : public ModulePass {
+ friend struct DFSanFunction;
+ friend class DFSanVisitor;
+
+ enum {
+ ShadowWidth = 16
+ };
+
+ /// Which ABI should be used for instrumented functions?
+ enum InstrumentedABI {
+ /// Argument and return value labels are passed through additional
+ /// arguments and by modifying the return type.
+ IA_Args,
+
+ /// Argument and return value labels are passed through TLS variables
+ /// __dfsan_arg_tls and __dfsan_retval_tls.
+ IA_TLS
+ };
+
+ /// How should calls to uninstrumented functions be handled?
+ enum WrapperKind {
+ /// This function is present in an uninstrumented form but we don't know
+ /// how it should be handled. Print a warning and call the function anyway.
+ /// Don't label the return value.
+ WK_Warning,
+
+ /// This function does not write to (user-accessible) memory, and its return
+ /// value is unlabelled.
+ WK_Discard,
+
+ /// This function does not write to (user-accessible) memory, and the label
+ /// of its return value is the union of the label of its arguments.
+ WK_Functional,
+
+ /// Instead of calling the function, a custom wrapper __dfsw_F is called,
+ /// where F is the name of the function. This function may wrap the
+ /// original function or provide its own implementation. This is similar to
+ /// the IA_Args ABI, except that IA_Args uses a struct return type to
+ /// pass the return value shadow in a register, while WK_Custom uses an
+ /// extra pointer argument to return the shadow. This allows the wrapped
+ /// form of the function type to be expressed in C.
+ WK_Custom
+ };
+
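To make WK_Custom concrete: for a function such as int f(int), the pass would call a wrapper named __dfsw_f that also receives the argument's label and a pointer through which to return the result's label. The label type below (a 16-bit integer, matching ShadowWidth) and the exact declaration shape are an illustration, not definitions from this patch:

#include <cstdint>

typedef uint16_t dfsan_label;   // 16-bit shadow labels (ShadowWidth = 16)

// Original function the application calls:
extern "C" int f(int x);

// Custom wrapper the instrumented code would call instead of f:
extern "C" int __dfsw_f(int x, dfsan_label x_label, dfsan_label *ret_label);
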
+ DataLayout *DL;
+ Module *Mod;
+ LLVMContext *Ctx;
+ IntegerType *ShadowTy;
+ PointerType *ShadowPtrTy;
+ IntegerType *IntptrTy;
+ ConstantInt *ZeroShadow;
+ ConstantInt *ShadowPtrMask;
+ ConstantInt *ShadowPtrMul;
+ Constant *ArgTLS;
+ Constant *RetvalTLS;
+ void *(*GetArgTLSPtr)();
+ void *(*GetRetvalTLSPtr)();
+ Constant *GetArgTLS;
+ Constant *GetRetvalTLS;
+ FunctionType *DFSanUnionFnTy;
+ FunctionType *DFSanUnionLoadFnTy;
+ FunctionType *DFSanUnimplementedFnTy;
+ FunctionType *DFSanSetLabelFnTy;
+ FunctionType *DFSanNonzeroLabelFnTy;
+ Constant *DFSanUnionFn;
+ Constant *DFSanUnionLoadFn;
+ Constant *DFSanUnimplementedFn;
+ Constant *DFSanSetLabelFn;
+ Constant *DFSanNonzeroLabelFn;
+ MDNode *ColdCallWeights;
+ OwningPtr<SpecialCaseList> ABIList;
+ DenseMap<Value *, Function *> UnwrappedFnMap;
+ AttributeSet ReadOnlyNoneAttrs;
+
+ Value *getShadowAddress(Value *Addr, Instruction *Pos);
+ Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
+ bool isInstrumented(const Function *F);
+ bool isInstrumented(const GlobalAlias *GA);
+ FunctionType *getArgsFunctionType(FunctionType *T);
+ FunctionType *getTrampolineFunctionType(FunctionType *T);
+ FunctionType *getCustomFunctionType(FunctionType *T);
+ InstrumentedABI getInstrumentedABI();
+ WrapperKind getWrapperKind(Function *F);
+ void addGlobalNamePrefix(GlobalValue *GV);
+ Function *buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT);
+ Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+
+ public:
+ DataFlowSanitizer(StringRef ABIListFile = StringRef(),
+ void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0);
+ static char ID;
+ bool doInitialization(Module &M);
+ bool runOnModule(Module &M);
+};
+
+struct DFSanFunction {
+ DataFlowSanitizer &DFS;
+ Function *F;
+ DataFlowSanitizer::InstrumentedABI IA;
+ bool IsNativeABI;
+ Value *ArgTLSPtr;
+ Value *RetvalTLSPtr;
+ AllocaInst *LabelReturnAlloca;
+ DenseMap<Value *, Value *> ValShadowMap;
+ DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+ std::vector<std::pair<PHINode *, PHINode *> > PHIFixups;
+ DenseSet<Instruction *> SkipInsts;
+ DenseSet<Value *> NonZeroChecks;
+
+ DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
+ : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()),
+ IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0),
+ LabelReturnAlloca(0) {}
+ Value *getArgTLSPtr();
+ Value *getArgTLS(unsigned Index, Instruction *Pos);
+ Value *getRetvalTLS();
+ Value *getShadow(Value *V);
+ void setShadow(Instruction *I, Value *Shadow);
+ Value *combineOperandShadows(Instruction *Inst);
+ Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
+ Instruction *Pos);
+ void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow,
+ Instruction *Pos);
+};
+
+class DFSanVisitor : public InstVisitor<DFSanVisitor> {
+ public:
+ DFSanFunction &DFSF;
+ DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+
+ void visitOperandShadowInst(Instruction &I);
+
+ void visitBinaryOperator(BinaryOperator &BO);
+ void visitCastInst(CastInst &CI);
+ void visitCmpInst(CmpInst &CI);
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+ void visitReturnInst(ReturnInst &RI);
+ void visitCallSite(CallSite CS);
+ void visitPHINode(PHINode &PN);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+ void visitExtractValueInst(ExtractValueInst &I);
+ void visitInsertValueInst(InsertValueInst &I);
+ void visitAllocaInst(AllocaInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitMemSetInst(MemSetInst &I);
+ void visitMemTransferInst(MemTransferInst &I);
+};
+
+}
+
+char DataFlowSanitizer::ID;
+INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
+ "DataFlowSanitizer: dynamic data flow analysis.", false, false)
+
+ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile,
+ void *(*getArgTLS)(),
+ void *(*getRetValTLS)()) {
+ return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS);
+}
+
+DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile,
+ void *(*getArgTLS)(),
+ void *(*getRetValTLS)())
+ : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS),
+ ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? ClABIListFile
+ : ABIListFile)) {
+}
+
+FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
+ llvm::SmallVector<Type *, 4> ArgTypes;
+ std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
+ for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+ ArgTypes.push_back(ShadowTy);
+ if (T->isVarArg())
+ ArgTypes.push_back(ShadowPtrTy);
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
+ RetType = StructType::get(RetType, ShadowTy, (Type *)0);
+ return FunctionType::get(RetType, ArgTypes, T->isVarArg());
+}
+
+FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
+ assert(!T->isVarArg());
+ llvm::SmallVector<Type *, 4> ArgTypes;
+ ArgTypes.push_back(T->getPointerTo());
+ std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
+ for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+ ArgTypes.push_back(ShadowTy);
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
+ ArgTypes.push_back(ShadowPtrTy);
+ return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+ assert(!T->isVarArg());
+ llvm::SmallVector<Type *, 4> ArgTypes;
+ for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
+ i != e; ++i) {
+ FunctionType *FT;
+ if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
+ *i)->getElementType()))) {
+ ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
+ ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
+ } else {
+ ArgTypes.push_back(*i);
+ }
+ }
+ for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+ ArgTypes.push_back(ShadowTy);
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
+ ArgTypes.push_back(ShadowPtrTy);
+ return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
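A minimal sketch of how the three helpers above rewrite an example signature int(int, int), assuming a 16-bit shadow type; the alias names are illustrative:

  using shadow_t = unsigned short;                  // stands in for ShadowTy
  struct IntAndShadow { int Val; shadow_t Label; }; // stands in for the {T, shadow} struct return

  // getArgsFunctionType: one shadow appended per parameter, return value
  // paired with its shadow.
  using ArgsABITy = IntAndShadow(int, int, shadow_t, shadow_t);

  // getTrampolineFunctionType: leading pointer to the callee, trailing
  // pointer through which the return shadow is written.
  using TrampolineTy = int(int (*)(int, int), int, int, shadow_t, shadow_t,
                           shadow_t *);

  // getCustomFunctionType: like the trampoline form minus the leading callee
  // pointer; function-pointer parameters would instead become a
  // trampoline/context pair.
  using CustomTy = int(int, int, shadow_t, shadow_t, shadow_t *);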
+
+bool DataFlowSanitizer::doInitialization(Module &M) {
+ DL = getAnalysisIfAvailable<DataLayout>();
+ if (!DL)
+ return false;
+
+ Mod = &M;
+ Ctx = &M.getContext();
+ ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
+ ShadowPtrTy = PointerType::getUnqual(ShadowTy);
+ IntptrTy = DL->getIntPtrType(*Ctx);
+ ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+ ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
+
+ Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy };
+ DFSanUnionFnTy =
+ FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false);
+ Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy };
+ DFSanUnionLoadFnTy =
+ FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false);
+ DFSanUnimplementedFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy };
+ DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ DFSanSetLabelArgs, /*isVarArg=*/false);
+ DFSanNonzeroLabelFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), ArrayRef<Type *>(), /*isVarArg=*/false);
+
+ if (GetArgTLSPtr) {
+ Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+ ArgTLS = 0;
+ GetArgTLS = ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
+ PointerType::getUnqual(
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0)));
+ }
+ if (GetRetvalTLSPtr) {
+ RetvalTLS = 0;
+ GetRetvalTLS = ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
+ PointerType::getUnqual(
+ FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0)));
+ }
+
+ ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+ return true;
+}
+
+bool DataFlowSanitizer::isInstrumented(const Function *F) {
+ return !ABIList->isIn(*F, "uninstrumented");
+}
+
+bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
+ return !ABIList->isIn(*GA, "uninstrumented");
+}
+
+DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
+ return ClArgsABI ? IA_Args : IA_TLS;
+}
+
+DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
+ if (ABIList->isIn(*F, "functional"))
+ return WK_Functional;
+ if (ABIList->isIn(*F, "discard"))
+ return WK_Discard;
+ if (ABIList->isIn(*F, "custom"))
+ return WK_Custom;
+
+ return WK_Warning;
+}
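The categories above are read from the ABI list; a hypothetical excerpt in SpecialCaseList syntax might look like the following (the specific entries are illustrative, not taken from this patch):

  # each function is marked uninstrumented plus one wrapper-kind category
  fun:memcmp=uninstrumented
  fun:memcmp=custom
  fun:ctime=uninstrumented
  fun:ctime=discard
  fun:sqrt=uninstrumented
  fun:sqrt=functional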
+
+void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
+ std::string GVName = GV->getName(), Prefix = "dfs$";
+ GV->setName(Prefix + GVName);
+
+ // Try to change the name of the function in module inline asm. We only do
+ // this for specific asm directives, currently only ".symver", to try to avoid
+ // corrupting asm which happens to contain the symbol name as a substring.
+ // Note that the substitution for .symver assumes that the versioned symbol
+ // also has an instrumented name.
+ std::string Asm = GV->getParent()->getModuleInlineAsm();
+ std::string SearchStr = ".symver " + GVName + ",";
+ size_t Pos = Asm.find(SearchStr);
+ if (Pos != std::string::npos) {
+ Asm.replace(Pos, SearchStr.size(),
+ ".symver " + Prefix + GVName + "," + Prefix);
+ GV->getParent()->setModuleInlineAsm(Asm);
+ }
+}
+
+Function *
+DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT) {
+ FunctionType *FT = F->getFunctionType();
+ Function *NewF = Function::Create(NewFT, NewFLink, NewFName,
+ F->getParent());
+ NewF->copyAttributesFrom(F);
+ NewF->removeAttributes(
+ AttributeSet::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType(),
+ AttributeSet::ReturnIndex));
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
+ std::vector<Value *> Args;
+ unsigned n = FT->getNumParams();
+ for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+ Args.push_back(&*ai);
+ CallInst *CI = CallInst::Create(F, Args, "", BB);
+ if (FT->getReturnType()->isVoidTy())
+ ReturnInst::Create(*Ctx, BB);
+ else
+ ReturnInst::Create(*Ctx, CI, BB);
+
+ return NewF;
+}
+
+Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
+ StringRef FName) {
+ FunctionType *FTT = getTrampolineFunctionType(FT);
+ Constant *C = Mod->getOrInsertFunction(FName, FTT);
+ Function *F = dyn_cast<Function>(C);
+ if (F && F->isDeclaration()) {
+ F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+ std::vector<Value *> Args;
+ Function::arg_iterator AI = F->arg_begin(); ++AI;
+ for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
+ Args.push_back(&*AI);
+ CallInst *CI =
+ CallInst::Create(&F->getArgumentList().front(), Args, "", BB);
+ ReturnInst *RI;
+ if (FT->getReturnType()->isVoidTy())
+ RI = ReturnInst::Create(*Ctx, BB);
+ else
+ RI = ReturnInst::Create(*Ctx, CI, BB);
+
+ DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+ Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
+ for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N)
+ DFSF.ValShadowMap[ValAI] = ShadowAI;
+ DFSanVisitor(DFSF).visitCallInst(*CI);
+ if (!FT->getReturnType()->isVoidTy())
+ new StoreInst(DFSF.getShadow(RI->getReturnValue()),
+ &F->getArgumentList().back(), RI);
+ }
+
+ return C;
+}
+
+bool DataFlowSanitizer::runOnModule(Module &M) {
+ if (!DL)
+ return false;
+
+ if (ABIList->isIn(M, "skip"))
+ return false;
+
+ if (!GetArgTLSPtr) {
+ Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+ ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS))
+ G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+ }
+ if (!GetRetvalTLSPtr) {
+ RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS))
+ G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+ }
+
+ DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
+ F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ F->addAttribute(1, Attribute::ZExt);
+ F->addAttribute(2, Attribute::ZExt);
+ }
+ DFSanUnionLoadFn =
+ Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
+ F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ }
+ DFSanUnimplementedFn =
+ Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
+ DFSanSetLabelFn =
+ Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy);
+ if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) {
+ F->addAttribute(1, Attribute::ZExt);
+ }
+ DFSanNonzeroLabelFn =
+ Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+
+ std::vector<Function *> FnsToInstrument;
+ llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI;
+ for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+ if (!i->isIntrinsic() &&
+ i != DFSanUnionFn &&
+ i != DFSanUnionLoadFn &&
+ i != DFSanUnimplementedFn &&
+ i != DFSanSetLabelFn &&
+ i != DFSanNonzeroLabelFn)
+ FnsToInstrument.push_back(&*i);
+ }
+
+ // Give function aliases prefixes when necessary, and build wrappers where the
+ // instrumentedness is inconsistent.
+ for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
+ GlobalAlias *GA = &*i;
+ ++i;
+ // Don't stop on weak. We assume people aren't playing games with the
+ // instrumentedness of overridden weak aliases.
+ if (Function *F = dyn_cast<Function>(
+ GA->resolveAliasedGlobal(/*stopOnWeak=*/false))) {
+ bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+ if (GAInst && FInst) {
+ addGlobalNamePrefix(GA);
+ } else if (GAInst != FInst) {
+ // Non-instrumented alias of an instrumented function, or vice versa.
+ // Replace the alias with a native-ABI wrapper of the aliasee. The pass
+ // below will take care of instrumenting it.
+ Function *NewF =
+ buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
+ GA->replaceAllUsesWith(NewF);
+ NewF->takeName(GA);
+ GA->eraseFromParent();
+ FnsToInstrument.push_back(NewF);
+ }
+ }
+ }
+
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
+ ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B);
+
+ // First, change the ABI of every function in the module. ABI-listed
+ // functions keep their original ABI and get a wrapper function.
+ for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+ e = FnsToInstrument.end();
+ i != e; ++i) {
+ Function &F = **i;
+ FunctionType *FT = F.getFunctionType();
+
+ bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
+ FT->getReturnType()->isVoidTy());
+
+ if (isInstrumented(&F)) {
+ // Instrumented functions get a 'dfs$' prefix. This allows us to more
+ // easily identify cases of mismatching ABIs.
+ if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
+ FunctionType *NewFT = getArgsFunctionType(FT);
+ Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
+ NewF->copyAttributesFrom(&F);
+ NewF->removeAttributes(
+ AttributeSet::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType(),
+ AttributeSet::ReturnIndex));
+ for (Function::arg_iterator FArg = F.arg_begin(),
+ NewFArg = NewF->arg_begin(),
+ FArgEnd = F.arg_end();
+ FArg != FArgEnd; ++FArg, ++NewFArg) {
+ FArg->replaceAllUsesWith(NewFArg);
+ }
+ NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
+
+ for (Function::use_iterator ui = F.use_begin(), ue = F.use_end();
+ ui != ue;) {
+ BlockAddress *BA = dyn_cast<BlockAddress>(ui.getUse().getUser());
+ ++ui;
+ if (BA) {
+ BA->replaceAllUsesWith(
+ BlockAddress::get(NewF, BA->getBasicBlock()));
+ delete BA;
+ }
+ }
+ F.replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
+ NewF->takeName(&F);
+ F.eraseFromParent();
+ *i = NewF;
+ addGlobalNamePrefix(NewF);
+ } else {
+ addGlobalNamePrefix(&F);
+ }
+ // Hopefully, nobody will try to indirectly call a vararg
+ // function... yet.
+ } else if (FT->isVarArg()) {
+ UnwrappedFnMap[&F] = &F;
+ *i = 0;
+ } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
+ // Build a wrapper function for F. The wrapper simply calls F, and is
+ // added to FnsToInstrument so that any instrumentation according to its
+ // WrapperKind is done in the second pass below.
+ FunctionType *NewFT = getInstrumentedABI() == IA_Args
+ ? getArgsFunctionType(FT)
+ : FT;
+ Function *NewF = buildWrapperFunction(
+ &F, std::string("dfsw$") + std::string(F.getName()),
+ GlobalValue::LinkOnceODRLinkage, NewFT);
+ if (getInstrumentedABI() == IA_TLS)
+ NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs);
+
+ Value *WrappedFnCst =
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
+ F.replaceAllUsesWith(WrappedFnCst);
+ UnwrappedFnMap[WrappedFnCst] = &F;
+ *i = NewF;
+
+ if (!F.isDeclaration()) {
+ // This function is probably defining an interposition of an
+ // uninstrumented function and hence needs to keep the original ABI.
+ // But any functions it may call need to use the instrumented ABI, so
+ // we instrument it in a mode which preserves the original ABI.
+ FnsWithNativeABI.insert(&F);
+
+ // This code needs to rebuild the iterators, as they may be invalidated
+ // by the push_back, taking care that the new range does not include
+ // any functions added by this code.
+ size_t N = i - FnsToInstrument.begin(),
+ Count = e - FnsToInstrument.begin();
+ FnsToInstrument.push_back(&F);
+ i = FnsToInstrument.begin() + N;
+ e = FnsToInstrument.begin() + Count;
+ }
+ }
+ }
+
+ for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+ e = FnsToInstrument.end();
+ i != e; ++i) {
+ if (!*i || (*i)->isDeclaration())
+ continue;
+
+ removeUnreachableBlocks(**i);
+
+ DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i));
+
+ // DFSanVisitor may create new basic blocks, which confuses df_iterator.
+ // Build a copy of the list before iterating over it.
+ llvm::SmallVector<BasicBlock *, 4> BBList;
+ std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()),
+ std::back_inserter(BBList));
+
+ for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(),
+ e = BBList.end();
+ i != e; ++i) {
+ Instruction *Inst = &(*i)->front();
+ while (1) {
+ // DFSanVisitor may split the current basic block, changing the current
+ // instruction's next pointer and moving the next instruction to the
+ // tail block from which we should continue.
+ Instruction *Next = Inst->getNextNode();
+ // DFSanVisitor may delete Inst, so keep track of whether it was a
+ // terminator.
+ bool IsTerminator = isa<TerminatorInst>(Inst);
+ if (!DFSF.SkipInsts.count(Inst))
+ DFSanVisitor(DFSF).visit(Inst);
+ if (IsTerminator)
+ break;
+ Inst = Next;
+ }
+ }
+
+ // We will not necessarily be able to compute the shadow for every phi node
+ // until we have visited every block. Therefore, the code that handles phi
+ // nodes adds them to the PHIFixups list so that they can be properly
+ // handled here.
+ for (std::vector<std::pair<PHINode *, PHINode *> >::iterator
+ i = DFSF.PHIFixups.begin(),
+ e = DFSF.PHIFixups.end();
+ i != e; ++i) {
+ for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
+ ++val) {
+ i->second->setIncomingValue(
+ val, DFSF.getShadow(i->first->getIncomingValue(val)));
+ }
+ }
+
+ // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
+ // places (i.e. instructions in basic blocks we haven't even begun visiting
+ // yet). To make our life easier, do this work in a pass after the main
+ // instrumentation.
+ if (ClDebugNonzeroLabels) {
+ for (DenseSet<Value *>::iterator i = DFSF.NonZeroChecks.begin(),
+ e = DFSF.NonZeroChecks.end();
+ i != e; ++i) {
+ Instruction *Pos;
+ if (Instruction *I = dyn_cast<Instruction>(*i))
+ Pos = I->getNextNode();
+ else
+ Pos = DFSF.F->getEntryBlock().begin();
+ while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
+ Pos = Pos->getNextNode();
+ IRBuilder<> IRB(Pos);
+ Instruction *NeInst = cast<Instruction>(
+ IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow));
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ NeInst, /*Unreachable=*/ false, ColdCallWeights));
+ IRBuilder<> ThenIRB(BI);
+ ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn);
+ }
+ }
+ }
+
+ return false;
+}
+
+Value *DFSanFunction::getArgTLSPtr() {
+ if (ArgTLSPtr)
+ return ArgTLSPtr;
+ if (DFS.ArgTLS)
+ return ArgTLSPtr = DFS.ArgTLS;
+
+ IRBuilder<> IRB(F->getEntryBlock().begin());
+ return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS);
+}
+
+Value *DFSanFunction::getRetvalTLS() {
+ if (RetvalTLSPtr)
+ return RetvalTLSPtr;
+ if (DFS.RetvalTLS)
+ return RetvalTLSPtr = DFS.RetvalTLS;
+
+ IRBuilder<> IRB(F->getEntryBlock().begin());
+ return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS);
+}
+
+Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) {
+ IRBuilder<> IRB(Pos);
+ return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx);
+}
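On the runtime side, the IA_TLS convention used by getArgTLSPtr/getRetvalTLS above can be pictured roughly as follows; this is a sketch only, the real declarations live in the DFSan runtime, and the 16-bit label type assumes the default ShadowWidth:

  typedef unsigned short dfsan_label;

  // Thread-local slots matching the globals created in runOnModule when no
  // custom accessors are supplied: one array slot per argument, one slot for
  // the return value.
  extern __thread dfsan_label __dfsan_arg_tls[64];
  extern __thread dfsan_label __dfsan_retval_tls;

  // Conceptually, a caller of an instrumented "int g(int a, int b)" emits:
  //   __dfsan_arg_tls[0] = label_of(a);
  //   __dfsan_arg_tls[1] = label_of(b);
  //   int r = g(a, b);                           // callee stores label_of(r)
  //   dfsan_label r_label = __dfsan_retval_tls;  // into __dfsan_retval_tls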
+
+Value *DFSanFunction::getShadow(Value *V) {
+ if (!isa<Argument>(V) && !isa<Instruction>(V))
+ return DFS.ZeroShadow;
+ Value *&Shadow = ValShadowMap[V];
+ if (!Shadow) {
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ if (IsNativeABI)
+ return DFS.ZeroShadow;
+ switch (IA) {
+ case DataFlowSanitizer::IA_TLS: {
+ Value *ArgTLSPtr = getArgTLSPtr();
+ Instruction *ArgTLSPos =
+ DFS.ArgTLS ? &*F->getEntryBlock().begin()
+ : cast<Instruction>(ArgTLSPtr)->getNextNode();
+ IRBuilder<> IRB(ArgTLSPos);
+ Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos));
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2;
+ Function::arg_iterator i = F->arg_begin();
+ while (ArgIdx--)
+ ++i;
+ Shadow = i;
+ assert(Shadow->getType() == DFS.ShadowTy);
+ break;
+ }
+ }
+ NonZeroChecks.insert(Shadow);
+ } else {
+ Shadow = DFS.ZeroShadow;
+ }
+ }
+ return Shadow;
+}
+
+void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
+ assert(!ValShadowMap.count(I));
+ assert(Shadow->getType() == DFS.ShadowTy);
+ ValShadowMap[I] = Shadow;
+}
+
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
+ assert(Addr != RetvalTLS && "Reinstrumenting?");
+ IRBuilder<> IRB(Pos);
+ return IRB.CreateIntToPtr(
+ IRB.CreateMul(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask),
+ ShadowPtrMul),
+ ShadowPtrTy);
+}
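A worked example of the mapping above, assuming the default 16-bit shadow width (so ShadowPtrMul is 2) and the mask set in doInitialization:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t ShadowPtrMask = ~0x700000000000ULL; // clears bits 44-46
    const uint64_t ShadowPtrMul  = 2;                  // ShadowWidth / 8 bytes of shadow per byte
    uint64_t App    = 0x700000001000ULL;               // sample application address
    uint64_t Shadow = (App & ShadowPtrMask) * ShadowPtrMul;
    std::printf("%#llx -> %#llx\n", (unsigned long long)App,
                (unsigned long long)Shadow);           // prints 0x700000001000 -> 0x2000
    return 0;
  }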
+
+// Generates IR to compute the union of the two given shadows, inserting it
+// before Pos. Returns the computed union Value.
+Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2,
+ Instruction *Pos) {
+ if (V1 == ZeroShadow)
+ return V2;
+ if (V2 == ZeroShadow)
+ return V1;
+ if (V1 == V2)
+ return V1;
+ IRBuilder<> IRB(Pos);
+ BasicBlock *Head = Pos->getParent();
+ Value *Ne = IRB.CreateICmpNE(V1, V2);
+ Instruction *NeInst = dyn_cast<Instruction>(Ne);
+ if (NeInst) {
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ NeInst, /*Unreachable=*/ false, ColdCallWeights));
+ IRBuilder<> ThenIRB(BI);
+ CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2);
+ Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ Call->addAttribute(1, Attribute::ZExt);
+ Call->addAttribute(2, Attribute::ZExt);
+
+ BasicBlock *Tail = BI->getSuccessor(0);
+ PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin());
+ Phi->addIncoming(Call, Call->getParent());
+ Phi->addIncoming(V1, Head);
+ Pos = Phi;
+ return Phi;
+ } else {
+ assert(0 && "todo");
+ return 0;
+ }
+}
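The same combining logic, lifted out of IR into a plain C++ sketch; the __dfsan_union declaration mirrors DFSanUnionFnTy above and is assumed to be provided by the runtime:

  extern "C" unsigned short __dfsan_union(unsigned short, unsigned short);

  unsigned short combineShadows(unsigned short V1, unsigned short V2) {
    if (V1 == 0)  return V2;        // the zero shadow is the identity
    if (V2 == 0)  return V1;
    if (V1 == V2) return V1;        // equal labels need no union
    return __dfsan_union(V1, V2);   // otherwise ask the runtime for a union label
  }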
+
+// A convenience function which folds the shadows of each of the operands
+// of the provided instruction Inst, inserting the IR before Inst. Returns
+// the computed union Value.
+Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
+ if (Inst->getNumOperands() == 0)
+ return DFS.ZeroShadow;
+
+ Value *Shadow = getShadow(Inst->getOperand(0));
+ for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
+ Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+ }
+ return Shadow;
+}
+
+void DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+ Value *CombinedShadow = DFSF.combineOperandShadows(&I);
+ DFSF.setShadow(&I, CombinedShadow);
+}
+
+// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
+// Addr has alignment Align, and take the union of each of those shadows.
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
+ Instruction *Pos) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+ AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
+ return IRB.CreateLoad(i->second);
+ }
+ }
+
+ uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+ SmallVector<Value *, 2> Objs;
+ GetUnderlyingObjects(Addr, Objs, DFS.DL);
+ bool AllConstants = true;
+ for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end();
+ i != e; ++i) {
+ if (isa<Function>(*i) || isa<BlockAddress>(*i))
+ continue;
+ if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant())
+ continue;
+
+ AllConstants = false;
+ break;
+ }
+ if (AllConstants)
+ return DFS.ZeroShadow;
+
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ switch (Size) {
+ case 0:
+ return DFS.ZeroShadow;
+ case 1: {
+ LoadInst *LI = new LoadInst(ShadowAddr, "", Pos);
+ LI->setAlignment(ShadowAlign);
+ return LI;
+ }
+ case 2: {
+ IRBuilder<> IRB(Pos);
+ Value *ShadowAddr1 =
+ IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1));
+ return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
+ IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign),
+ Pos);
+ }
+ }
+ if (Size % (64 / DFS.ShadowWidth) == 0) {
+    // Fast path for the common case where each byte has identical shadow: load
+    // the shadow 64 bits at a time, falling back to a __dfsan_union_load call
+    // if any of the shadows differ.
+ BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ IRBuilder<> FallbackIRB(FallbackBB);
+ CallInst *FallbackCall = FallbackIRB.CreateCall2(
+ DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size));
+ FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+
+ // Compare each of the shadows stored in the loaded 64 bits to each other,
+ // by computing (WideShadow rotl ShadowWidth) == WideShadow.
+ IRBuilder<> IRB(Pos);
+ Value *WideAddr =
+ IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+ Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+ Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy);
+ Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth);
+ Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth);
+ Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
+ Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
+
+ BasicBlock *Head = Pos->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(Pos);
+ // In the following code LastBr will refer to the previous basic block's
+ // conditional branch instruction, whose true successor is fixed up to point
+ // to the next block during the loop below or to the tail after the final
+ // iteration.
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
+ ReplaceInstWithInst(Head->getTerminator(), LastBr);
+
+ for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
+ Ofs += 64 / DFS.ShadowWidth) {
+ BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ IRBuilder<> NextIRB(NextBB);
+ WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1));
+ Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+ ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
+ LastBr->setSuccessor(0, NextBB);
+ LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
+ }
+
+ LastBr->setSuccessor(0, Tail);
+ FallbackIRB.CreateBr(Tail);
+ PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
+ Shadow->addIncoming(FallbackCall, FallbackBB);
+ Shadow->addIncoming(TruncShadow, LastBr->getParent());
+ return Shadow;
+ }
+
+ IRBuilder<> IRB(Pos);
+ CallInst *FallbackCall = IRB.CreateCall2(
+ DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size));
+ FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ return FallbackCall;
+}
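A standalone check of the rotate trick used in the fast path above: a 64-bit word made of identical 16-bit shadows is unchanged by a 16-bit rotate, while any differing shadow breaks the equality (assumes the default ShadowWidth of 16):

  #include <cassert>
  #include <cstdint>

  static uint64_t rotl64(uint64_t V, unsigned N) {
    return (V << N) | (V >> (64 - N));     // same shape as the Shl/LShr/Or above
  }

  int main() {
    uint64_t Same  = 0x000a000a000a000aULL;  // four copies of label 0x000a
    uint64_t Mixed = 0x000a000a000b000aULL;  // one 2-byte slot holds label 0x000b
    assert(rotl64(Same, 16) == Same);        // fast path: all shadows equal
    assert(rotl64(Mixed, 16) != Mixed);      // falls back to __dfsan_union_load
    return 0;
  }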
+
+void DFSanVisitor::visitLoadInst(LoadInst &LI) {
+ uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType());
+ uint64_t Align;
+ if (ClPreserveAlignment) {
+ Align = LI.getAlignment();
+ if (Align == 0)
+ Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType());
+ } else {
+ Align = 1;
+ }
+ IRBuilder<> IRB(&LI);
+ Value *LoadedShadow =
+ DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
+ Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
+ Value *CombinedShadow = DFSF.DFS.combineShadows(LoadedShadow, PtrShadow, &LI);
+ if (CombinedShadow != DFSF.DFS.ZeroShadow)
+ DFSF.NonZeroChecks.insert(CombinedShadow);
+
+ DFSF.setShadow(&LI, CombinedShadow);
+}
+
+void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
+ Value *Shadow, Instruction *Pos) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+ AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
+ IRB.CreateStore(Shadow, i->second);
+ return;
+ }
+ }
+
+ uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+ IRBuilder<> IRB(Pos);
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ if (Shadow == DFS.ZeroShadow) {
+ IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth);
+ Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
+ Value *ExtShadowAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
+ IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
+ return;
+ }
+
+ const unsigned ShadowVecSize = 128 / DFS.ShadowWidth;
+ uint64_t Offset = 0;
+ if (Size >= ShadowVecSize) {
+ VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
+ Value *ShadowVec = UndefValue::get(ShadowVecTy);
+ for (unsigned i = 0; i != ShadowVecSize; ++i) {
+ ShadowVec = IRB.CreateInsertElement(
+ ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
+ }
+ Value *ShadowVecAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
+ do {
+ Value *CurShadowVecAddr = IRB.CreateConstGEP1_32(ShadowVecAddr, Offset);
+ IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
+ Size -= ShadowVecSize;
+ ++Offset;
+ } while (Size >= ShadowVecSize);
+ Offset *= ShadowVecSize;
+ }
+ while (Size > 0) {
+ Value *CurShadowAddr = IRB.CreateConstGEP1_32(ShadowAddr, Offset);
+ IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign);
+ --Size;
+ ++Offset;
+ }
+}
+
+void DFSanVisitor::visitStoreInst(StoreInst &SI) {
+ uint64_t Size =
+ DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType());
+ uint64_t Align;
+ if (ClPreserveAlignment) {
+ Align = SI.getAlignment();
+ if (Align == 0)
+ Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType());
+ } else {
+ Align = 1;
+ }
+ DFSF.storeShadow(SI.getPointerOperand(), Size, Align,
+ DFSF.getShadow(SI.getValueOperand()), &SI);
+}
+
+void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
+ visitOperandShadowInst(BO);
+}
+
+void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ visitOperandShadowInst(GEPI);
+}
+
+void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
+ bool AllLoadsStores = true;
+ for (Instruction::use_iterator i = I.use_begin(), e = I.use_end(); i != e;
+ ++i) {
+ if (isa<LoadInst>(*i))
+ continue;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(*i)) {
+ if (SI->getPointerOperand() == &I)
+ continue;
+ }
+
+ AllLoadsStores = false;
+ break;
+ }
+ if (AllLoadsStores) {
+ IRBuilder<> IRB(&I);
+ DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy);
+ }
+ DFSF.setShadow(&I, DFSF.DFS.ZeroShadow);
+}
+
+void DFSanVisitor::visitSelectInst(SelectInst &I) {
+ Value *CondShadow = DFSF.getShadow(I.getCondition());
+ Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
+ Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
+
+ if (isa<VectorType>(I.getCondition()->getType())) {
+ DFSF.setShadow(
+ &I, DFSF.DFS.combineShadows(
+ CondShadow,
+ DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I));
+ } else {
+ Value *ShadowSel;
+ if (TrueShadow == FalseShadow) {
+ ShadowSel = TrueShadow;
+ } else {
+ ShadowSel =
+ SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
+ }
+ DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I));
+ }
+}
+
+void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *ValShadow = DFSF.getShadow(I.getValue());
+ IRB.CreateCall3(
+ DFSF.DFS.DFSanSetLabelFn, ValShadow,
+ IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(*DFSF.DFS.Ctx)),
+ IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy));
+}
+
+void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+ Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
+ Value *LenShadow = IRB.CreateMul(
+ I.getLength(),
+ ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
+ Value *AlignShadow;
+ if (ClPreserveAlignment) {
+ AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
+ ConstantInt::get(I.getAlignmentCst()->getType(),
+ DFSF.DFS.ShadowWidth / 8));
+ } else {
+ AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
+ DFSF.DFS.ShadowWidth / 8);
+ }
+ Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
+ DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
+ SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
+ IRB.CreateCall5(I.getCalledValue(), DestShadow, SrcShadow, LenShadow,
+ AlignShadow, I.getVolatileCst());
+}
+
+void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
+ if (!DFSF.IsNativeABI && RI.getReturnValue()) {
+ switch (DFSF.IA) {
+ case DataFlowSanitizer::IA_TLS: {
+ Value *S = DFSF.getShadow(RI.getReturnValue());
+ IRBuilder<> IRB(&RI);
+ IRB.CreateStore(S, DFSF.getRetvalTLS());
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ IRBuilder<> IRB(&RI);
+ Type *RT = DFSF.F->getFunctionType()->getReturnType();
+ Value *InsVal =
+ IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
+ Value *InsShadow =
+ IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
+ RI.setOperand(0, InsShadow);
+ break;
+ }
+ }
+ }
+}
+
+void DFSanVisitor::visitCallSite(CallSite CS) {
+ Function *F = CS.getCalledFunction();
+ if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) {
+ visitOperandShadowInst(*CS.getInstruction());
+ return;
+ }
+
+ IRBuilder<> IRB(CS.getInstruction());
+
+ DenseMap<Value *, Function *>::iterator i =
+ DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue());
+ if (i != DFSF.DFS.UnwrappedFnMap.end()) {
+ Function *F = i->second;
+ switch (DFSF.DFS.getWrapperKind(F)) {
+ case DataFlowSanitizer::WK_Warning: {
+ CS.setCalledFunction(F);
+ IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
+ IRB.CreateGlobalStringPtr(F->getName()));
+ DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+ return;
+ }
+ case DataFlowSanitizer::WK_Discard: {
+ CS.setCalledFunction(F);
+ DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+ return;
+ }
+ case DataFlowSanitizer::WK_Functional: {
+ CS.setCalledFunction(F);
+ visitOperandShadowInst(*CS.getInstruction());
+ return;
+ }
+ case DataFlowSanitizer::WK_Custom: {
+      // Don't try to handle invokes of custom functions; it's too complicated.
+ // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
+ // wrapper.
+ if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+ FunctionType *FT = F->getFunctionType();
+ FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
+ std::string CustomFName = "__dfsw_";
+ CustomFName += F->getName();
+ Constant *CustomF =
+ DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
+ if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
+ CustomFn->copyAttributesFrom(F);
+
+ // Custom functions returning non-void will write to the return label.
+ if (!FT->getReturnType()->isVoidTy()) {
+ CustomFn->removeAttributes(AttributeSet::FunctionIndex,
+ DFSF.DFS.ReadOnlyNoneAttrs);
+ }
+ }
+
+ std::vector<Value *> Args;
+
+ CallSite::arg_iterator i = CS.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
+ Type *T = (*i)->getType();
+ FunctionType *ParamFT;
+ if (isa<PointerType>(T) &&
+ (ParamFT = dyn_cast<FunctionType>(
+ cast<PointerType>(T)->getElementType()))) {
+ std::string TName = "dfst";
+ TName += utostr(FT->getNumParams() - n);
+ TName += "$";
+ TName += F->getName();
+ Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
+ Args.push_back(T);
+ Args.push_back(
+ IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
+ } else {
+ Args.push_back(*i);
+ }
+ }
+
+ i = CS.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(DFSF.getShadow(*i));
+
+ if (!FT->getReturnType()->isVoidTy()) {
+ if (!DFSF.LabelReturnAlloca) {
+ DFSF.LabelReturnAlloca =
+ new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn",
+ DFSF.F->getEntryBlock().begin());
+ }
+ Args.push_back(DFSF.LabelReturnAlloca);
+ }
+
+ CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
+ CustomCI->setCallingConv(CI->getCallingConv());
+ CustomCI->setAttributes(CI->getAttributes());
+
+ if (!FT->getReturnType()->isVoidTy()) {
+ LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
+ DFSF.setShadow(CustomCI, LabelLoad);
+ }
+
+ CI->replaceAllUsesWith(CustomCI);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ }
+ }
+
+ FunctionType *FT = cast<FunctionType>(
+ CS.getCalledValue()->getType()->getPointerElementType());
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) {
+ IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)),
+ DFSF.getArgTLS(i, CS.getInstruction()));
+ }
+ }
+
+ Instruction *Next = 0;
+ if (!CS.getType()->isVoidTy()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ if (II->getNormalDest()->getSinglePredecessor()) {
+ Next = II->getNormalDest()->begin();
+ } else {
+ BasicBlock *NewBB =
+ SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS);
+ Next = NewBB->begin();
+ }
+ } else {
+ Next = CS->getNextNode();
+ }
+
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ IRBuilder<> NextIRB(Next);
+ LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS());
+ DFSF.SkipInsts.insert(LI);
+ DFSF.setShadow(CS.getInstruction(), LI);
+ DFSF.NonZeroChecks.insert(LI);
+ }
+ }
+
+ // Do all instrumentation for IA_Args down here to defer tampering with the
+ // CFG in a way that SplitEdge may be able to detect.
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+ FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+ Value *Func =
+ IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
+ std::vector<Value *> Args;
+
+ CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(*i);
+
+ i = CS.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(DFSF.getShadow(*i));
+
+ if (FT->isVarArg()) {
+ unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
+ ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
+ AllocaInst *VarArgShadow =
+ new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin());
+ Args.push_back(IRB.CreateConstGEP2_32(VarArgShadow, 0, 0));
+ for (unsigned n = 0; i != e; ++i, ++n) {
+ IRB.CreateStore(DFSF.getShadow(*i),
+ IRB.CreateConstGEP2_32(VarArgShadow, 0, n));
+ Args.push_back(*i);
+ }
+ }
+
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(),
+ Args);
+ } else {
+ NewCS = IRB.CreateCall(Func, Args);
+ }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(CS.getAttributes().removeAttributes(
+ *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType(),
+ AttributeSet::ReturnIndex)));
+
+ if (Next) {
+ ExtractValueInst *ExVal =
+ ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
+ DFSF.SkipInsts.insert(ExVal);
+ ExtractValueInst *ExShadow =
+ ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
+ DFSF.SkipInsts.insert(ExShadow);
+ DFSF.setShadow(ExVal, ExShadow);
+ DFSF.NonZeroChecks.insert(ExShadow);
+
+ CS.getInstruction()->replaceAllUsesWith(ExVal);
+ }
+
+ CS.getInstruction()->eraseFromParent();
+ }
+}
+
+void DFSanVisitor::visitPHINode(PHINode &PN) {
+ PHINode *ShadowPN =
+ PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
+
+ // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+ Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
+ for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+ ++i) {
+ ShadowPN->addIncoming(UndefShadow, *i);
+ }
+
+ DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+ DFSF.setShadow(&PN, ShadowPN);
+}
diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp
index 651381d..f50a044 100644
--- a/lib/Transforms/Instrumentation/DebugIR.cpp
+++ b/lib/Transforms/Instrumentation/DebugIR.cpp
@@ -25,6 +25,7 @@
#include "llvm/InstVisitor.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -401,7 +402,7 @@ private:
Type *PointeeTy = T->getPointerElementType();
if (!(N = getType(PointeeTy)))
N = Builder.createPointerType(
- getOrCreateType(PointeeTy), Layout.getPointerSizeInBits(),
+ getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T),
Layout.getPrefTypeAlignment(T), getTypeName(T));
} else if (T->isArrayTy()) {
SmallVector<Value *, 1> Subrange;
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
deleted file mode 100644
index a2459fb..0000000
--- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments the specified program with counters for edge profiling.
-// Edge profiling can give a reasonable approximation of the hot paths through a
-// program, and is used for a wide variety of program transformations.
-//
-// Note that this implementation is very naive. We insert a counter for *every*
-// edge in the program, instead of using control flow information to prune the
-// number of counters inserted.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-edge-profiling"
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumEdgesInserted, "The # of edges inserted.");
-
-namespace {
- class EdgeProfiler : public ModulePass {
- bool runOnModule(Module &M);
- public:
- static char ID; // Pass identification, replacement for typeid
- EdgeProfiler() : ModulePass(ID) {
- initializeEdgeProfilerPass(*PassRegistry::getPassRegistry());
- }
-
- virtual const char *getPassName() const {
- return "Edge Profiler";
- }
- };
-}
-
-char EdgeProfiler::ID = 0;
-INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling",
- "Insert instrumentation for edge profiling", false, false)
-
-ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
-
-bool EdgeProfiler::runOnModule(Module &M) {
- Function *Main = M.getFunction("main");
- if (Main == 0) {
- errs() << "WARNING: cannot insert edge profiling into a module"
- << " with no main function!\n";
- return false; // No main, no instrumentation!
- }
-
- std::set<BasicBlock*> BlocksToInstrument;
- unsigned NumEdges = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- // Reserve space for (0,entry) edge.
- ++NumEdges;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- // Keep track of which blocks need to be instrumented. We don't want to
- // instrument blocks that are added as the result of breaking critical
- // edges!
- BlocksToInstrument.insert(BB);
- NumEdges += BB->getTerminator()->getNumSuccessors();
- }
- }
-
- Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges);
- GlobalVariable *Counters =
- new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
- Constant::getNullValue(ATy), "EdgeProfCounters");
- NumEdgesInserted = NumEdges;
-
- // Instrument all of the edges...
- unsigned i = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- // Create counter for (0,entry) edge.
- IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters);
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- if (BlocksToInstrument.count(BB)) { // Don't instrument inserted blocks
- // Okay, we have to add a counter of each outgoing edge. If the
- // outgoing edge is not critical don't split it, just insert the counter
- // in the source or destination of the edge.
- TerminatorInst *TI = BB->getTerminator();
- for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
- // If the edge is critical, split it.
- SplitCriticalEdge(TI, s, this);
-
- // Okay, we are guaranteed that the edge is no longer critical. If we
- // only have a single successor, insert the counter in this block,
- // otherwise insert it in the successor block.
- if (TI->getNumSuccessors() == 1) {
- // Insert counter at the start of the block
- IncrementCounterInBlock(BB, i++, Counters, false);
- } else {
- // Insert counter at the start of the block
- IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters);
- }
- }
- }
- }
-
- // Add the initialization call to main.
- InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters);
- return true;
-}
-
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 4c2681f..206bffb 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -17,7 +17,6 @@
#define DEBUG_TYPE "insert-gcov-profiling"
#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
@@ -103,6 +102,7 @@ namespace {
Constant *getIncrementIndirectCounterFunc();
Constant *getEmitFunctionFunc();
Constant *getEmitArcsFunc();
+ Constant *getSummaryInfoFunc();
Constant *getDeleteWriteoutFunctionListFunc();
Constant *getDeleteFlushFunctionListFunc();
Constant *getEndFileFunc();
@@ -519,15 +519,15 @@ bool GCOVProfiler::emitProfileArcs() {
TerminatorInst *TI = BB->getTerminator();
int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
if (Successors) {
- IRBuilder<> Builder(TI);
-
if (Successors == 1) {
+ IRBuilder<> Builder(BB->getFirstInsertionPt());
Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
Edge);
Value *Count = Builder.CreateLoad(Counter);
Count = Builder.CreateAdd(Count, Builder.getInt64(1));
Builder.CreateStore(Count, Counter);
} else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ IRBuilder<> Builder(BI);
Value *Sel = Builder.CreateSelect(BI->getCondition(),
Builder.getInt64(Edge),
Builder.getInt64(Edge + 1));
@@ -543,6 +543,7 @@ bool GCOVProfiler::emitProfileArcs() {
for (int i = 0; i != Successors; ++i)
ComplexEdgeSuccs.insert(TI->getSuccessor(i));
}
+
Edge += Successors;
}
}
@@ -554,14 +555,13 @@ bool GCOVProfiler::emitProfileArcs() {
GlobalVariable *EdgeState = getEdgeStateValue();
for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
- IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+ IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt());
Builder.CreateStore(Builder.getInt32(i), EdgeState);
}
+
for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
- // call runtime to perform increment
- BasicBlock::iterator InsertPt =
- ComplexEdgeSuccs[i+1]->getFirstInsertionPt();
- IRBuilder<> Builder(InsertPt);
+ // Call runtime to perform increment.
+ IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt());
Value *CounterPtrArray =
Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
i * ComplexEdgePreds.size());
@@ -599,7 +599,7 @@ bool GCOVProfiler::emitProfileArcs() {
};
FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
- // Inialize the environment and register the local writeout and flush
+ // Initialize the environment and register the local writeout and flush
// functions.
Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
@@ -701,6 +701,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
}
+Constant *GCOVProfiler::getSummaryInfoFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
@@ -747,6 +752,7 @@ Function *GCOVProfiler::insertCounterWriteout(
Constant *StartFile = getStartFileFunc();
Constant *EmitFunction = getEmitFunctionFunc();
Constant *EmitArcs = getEmitArcsFunc();
+ Constant *SummaryInfo = getSummaryInfoFunc();
Constant *EndFile = getEndFileFunc();
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
@@ -773,6 +779,7 @@ Function *GCOVProfiler::insertCounterWriteout(
Builder.getInt32(Arcs),
Builder.CreateConstGEP2_64(GV, 0, 0));
}
+ Builder.CreateCall(SummaryInfo);
Builder.CreateCall(EndFile);
}
}
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 9f35396..b1bea38 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -24,12 +24,10 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializeAddressSanitizerPass(Registry);
initializeAddressSanitizerModulePass(Registry);
initializeBoundsCheckingPass(Registry);
- initializeEdgeProfilerPass(Registry);
initializeGCOVProfilerPass(Registry);
- initializeOptimalEdgeProfilerPass(Registry);
- initializePathProfilerPass(Registry);
initializeMemorySanitizerPass(Registry);
initializeThreadSanitizerPass(Registry);
+ initializeDataFlowSanitizerPass(Registry);
}
/// LLVMInitializeInstrumentation - C binding for
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 0251f16..d547adc 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -66,6 +66,31 @@
/// avoids storing origin to memory when a fully initialized value is stored.
 /// This way it avoids needlessly overwriting the origin of the 4-byte region on
/// a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of an application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, atomically
+/// storing to two disjoint locations cannot be done without a severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. The current
+/// implementation ignores the load aspect of CAS/RMW and always returns a
+/// clean value. It implements the store part as a simple atomic store of a
+/// clean shadow.
+
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "msan"
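The ordering argument in the atomic-handling comment above can be modelled with plain std::atomic code; this is a minimal sketch of the claimed invariant, not the instrumentation MSan actually emits:

  #include <atomic>
  #include <cassert>
  #include <thread>

  std::atomic<int> App{0};
  std::atomic<int> AppShadow{1};  // 1 = poisoned, 0 = clean; only ever moves toward clean

  void writer() {
    AppShadow.store(0, std::memory_order_relaxed);  // clean shadow stored *before*...
    App.store(42, std::memory_order_release);       // ...the release store of the value
  }

  void reader() {
    int V = App.load(std::memory_order_acquire);        // application value first
    int S = AppShadow.load(std::memory_order_relaxed);  // its shadow afterwards
    if (V == 42)
      assert(S == 0);  // the happens-before arc makes the clean shadow visible
  }

  int main() {
    std::thread T1(writer), T2(reader);
    T1.join();
    T2.join();
    return 0;
  }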
@@ -157,6 +182,18 @@ static cl::opt<std::string> ClBlacklistFile("msan-blacklist",
cl::desc("File containing the list of functions where MemorySanitizer "
"should not report bugs"), cl::Hidden);
+// Experimental. Wraps all indirect calls in the instrumented code with
+// a call to the given function. This is needed to assist the dynamic
+// helper tool (MSanDR) to regain control on transition between instrumented and
+// non-instrumented code.
+static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls",
+ cl::desc("Wrap indirect calls with a given function"),
+ cl::Hidden);
+
+static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast",
+ cl::desc("Do not wrap indirect calls with target in the same module"),
+ cl::Hidden, cl::init(true));
+
namespace {
/// \brief An instrumentation pass implementing detection of uninitialized
@@ -168,12 +205,12 @@ class MemorySanitizer : public FunctionPass {
public:
MemorySanitizer(bool TrackOrigins = false,
StringRef BlacklistFile = StringRef())
- : FunctionPass(ID),
- TrackOrigins(TrackOrigins || ClTrackOrigins),
- TD(0),
- WarningFn(0),
- BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
- : BlacklistFile) { }
+ : FunctionPass(ID),
+ TrackOrigins(TrackOrigins || ClTrackOrigins),
+ TD(0),
+ WarningFn(0),
+ BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile),
+ WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {}
const char *getPassName() const { return "MemorySanitizer"; }
bool runOnFunction(Function &F);
bool doInitialization(Module &M);
@@ -207,13 +244,16 @@ class MemorySanitizer : public FunctionPass {
/// function.
GlobalVariable *OriginTLS;
+ GlobalVariable *MsandrModuleStart;
+ GlobalVariable *MsandrModuleEnd;
+
/// \brief The run-time callback to print a warning.
Value *WarningFn;
/// \brief Run-time helper that copies origin info for a memory range.
Value *MsanCopyOriginFn;
/// \brief Run-time helper that generates a new origin value for a stack
/// allocation.
- Value *MsanSetAllocaOriginFn;
+ Value *MsanSetAllocaOrigin4Fn;
/// \brief Run-time helper that poisons stack on function entry.
Value *MsanPoisonStackFn;
/// \brief MSan runtime replacements for memmove, memcpy and memset.
@@ -236,6 +276,12 @@ class MemorySanitizer : public FunctionPass {
/// \brief An empty volatile inline asm that prevents callback merge.
InlineAsm *EmptyAsm;
+ bool WrapIndirectCalls;
+ /// \brief Run-time wrapper for indirect calls.
+ Value *IndirectCallWrapperFn;
+ // Argument and return type of IndirectCallWrapperFn: void (*f)(void).
+ Type *AnyFunctionPtrTy;
+
friend struct MemorySanitizerVisitor;
friend struct VarArgAMD64Helper;
};
@@ -281,9 +327,9 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
MsanCopyOriginFn = M.getOrInsertFunction(
"__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), IntptrTy, NULL);
- MsanSetAllocaOriginFn = M.getOrInsertFunction(
- "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
- IRB.getInt8PtrTy(), NULL);
+ MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+ "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+ IRB.getInt8PtrTy(), IntptrTy, NULL);
MsanPoisonStackFn = M.getOrInsertFunction(
"__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
MemmoveFn = M.getOrInsertFunction(
@@ -329,6 +375,24 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
/*hasSideEffects=*/true);
+
+ if (WrapIndirectCalls) {
+ AnyFunctionPtrTy =
+ PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false));
+ IndirectCallWrapperFn = M.getOrInsertFunction(
+ ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL);
+ }
+
+ if (ClWrapIndirectCallsFast) {
+ MsandrModuleStart = new GlobalVariable(
+ M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
+ 0, "__executable_start");
+ MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility);
+ MsandrModuleEnd = new GlobalVariable(
+ M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
+ 0, "_end");
+ MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility);
+ }
}
/// \brief Module-level initialization.
@@ -338,7 +402,7 @@ bool MemorySanitizer::doInitialization(Module &M) {
TD = getAnalysisIfAvailable<DataLayout>();
if (!TD)
return false;
- BL.reset(new SpecialCaseList(BlacklistFile));
+ BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
C = &(M.getContext());
unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0);
switch (PtrSize) {
@@ -423,22 +487,27 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
MemorySanitizer &MS;
SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
ValueMap<Value*, Value*> ShadowMap, OriginMap;
+ OwningPtr<VarArgHelper> VAHelper;
+
+ // The following flags disable parts of MSan instrumentation based on
+ // blacklist contents and command-line options.
bool InsertChecks;
bool LoadShadow;
bool PoisonStack;
bool PoisonUndef;
- OwningPtr<VarArgHelper> VAHelper;
+ bool CheckReturnValue;
struct ShadowOriginAndInsertPoint {
- Instruction *Shadow;
- Instruction *Origin;
+ Value *Shadow;
+ Value *Origin;
Instruction *OrigIns;
- ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I)
+ ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
: Shadow(S), Origin(O), OrigIns(I) { }
ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { }
};
SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
SmallVector<Instruction*, 16> StoreList;
+ SmallVector<CallSite, 16> IndirectCallList;
MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
: F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
@@ -449,6 +518,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
LoadShadow = SanitizeFunction;
PoisonStack = SanitizeFunction && ClPoisonStack;
PoisonUndef = SanitizeFunction && ClPoisonUndef;
+ // FIXME: Consider using SpecialCaseList to specify a list of functions that
+ // must always return fully initialized values. For now, we hardcode "main".
+ CheckReturnValue = SanitizeFunction && (F.getName() == "main");
DEBUG(if (!InsertChecks)
dbgs() << "MemorySanitizer is not inserting checks into '"
@@ -462,7 +534,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Val = I.getValueOperand();
Value *Addr = I.getPointerOperand();
- Value *Shadow = getShadow(Val);
+ Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val);
Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
StoreInst *NewSI =
@@ -471,7 +543,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
(void)NewSI;
if (ClCheckAccessAddress)
- insertCheck(Addr, &I);
+ insertShadowCheck(Addr, &I);
+
+ if (I.isAtomic())
+ I.setOrdering(addReleaseOrdering(I.getOrdering()));
if (MS.TrackOrigins) {
unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
@@ -481,11 +556,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
} else {
Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
- Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow);
// TODO(eugenis): handle non-zero constant shadow by inserting an
// unconditional check (can not simply fail compilation as this could
// be in the dead code).
- if (Cst)
+ if (isa<Constant>(ConvertedShadow))
continue;
Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
@@ -503,12 +577,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void materializeChecks() {
for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
- Instruction *Shadow = InstrumentationList[i].Shadow;
+ Value *Shadow = InstrumentationList[i].Shadow;
Instruction *OrigIns = InstrumentationList[i].OrigIns;
IRBuilder<> IRB(OrigIns);
DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+ // See the comment in materializeStores().
+ if (isa<Constant>(ConvertedShadow))
+ continue;
Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
getCleanShadow(ConvertedShadow), "_mscmp");
Instruction *CheckTerm =
@@ -518,7 +595,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRB.SetInsertPoint(CheckTerm);
if (MS.TrackOrigins) {
- Instruction *Origin = InstrumentationList[i].Origin;
+ Value *Origin = InstrumentationList[i].Origin;
IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0),
MS.OriginTLS);
}
@@ -530,6 +607,48 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
DEBUG(dbgs() << "DONE:\n" << F);
}
+ void materializeIndirectCalls() {
+ for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) {
+ CallSite CS = IndirectCallList[i];
+ Instruction *I = CS.getInstruction();
+ BasicBlock *B = I->getParent();
+ IRBuilder<> IRB(I);
+ Value *Fn0 = CS.getCalledValue();
+ Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy);
+
+ if (ClWrapIndirectCallsFast) {
+ // Check that the call target is inside this module's limits.
+ Value *Start =
+ IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy);
+ Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy);
+
+ Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start),
+ IRB.CreateICmpUGE(Fn, End));
+
+ PHINode *NewFnPhi =
+ IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target");
+
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ cast<Instruction>(NotInThisModule),
+ /* Unreachable */ false, MS.ColdCallWeights);
+
+ IRB.SetInsertPoint(CheckTerm);
+ // Slow path: call wrapper function to possibly transform the call
+ // target.
+ Value *NewFn = IRB.CreateBitCast(
+ IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
+
+ NewFnPhi->addIncoming(Fn0, B);
+ NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent());
+ CS.setCalledFunction(NewFnPhi);
+ } else {
+ Value *NewFn = IRB.CreateBitCast(
+ IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
+ CS.setCalledFunction(NewFn);
+ }
+ }
+ }
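The fast path above only diverts calls whose target lies outside the current module's [__executable_start, _end) range. As a rough standalone illustration (not the pass itself), the same range test can be written directly in C++; this assumes a Linux/ELF link where the linker provides those two symbols, and the wrapper below is a hypothetical stand-in for the msandr-style redirection routine:

#include <cstdint>
#include <cstdio>

extern char __executable_start; // provided by the linker on ELF targets
extern char _end;               // provided by the linker on ELF targets

typedef void (*AnyFn)();

static AnyFn wrap_indirect_call(AnyFn Target) {
  // Hypothetical slow path: a real runtime could map Target to an
  // instrumented replacement here. This sketch just passes it through.
  return Target;
}

static void call_indirect(AnyFn Target) {
  uintptr_t P = reinterpret_cast<uintptr_t>(Target);
  uintptr_t Start = reinterpret_cast<uintptr_t>(&__executable_start);
  uintptr_t End = reinterpret_cast<uintptr_t>(&_end);
  bool InThisModule = P >= Start && P < End;
  // Out-of-module targets go through the wrapper; local ones are called as-is.
  AnyFn Callee = InThisModule ? Target : wrap_indirect_call(Target);
  Callee();
}

static void hello() { std::puts("hello from an in-module target"); }

int main() {
  call_indirect(&hello);
  return 0;
}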
+
/// \brief Add MemorySanitizer instrumentation to a function.
bool runOnFunction() {
MS.initializeCallbacks(*F.getParent());
@@ -572,6 +691,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Insert shadow value checks.
materializeChecks();
+ // Wrap indirect calls.
+ materializeIndirectCalls();
+
return true;
}
@@ -835,20 +957,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// \brief Remember the place where a shadow check should be inserted.
///
/// This location will be later instrumented with a check that will print a
- /// UMR warning in runtime if the value is not fully defined.
- void insertCheck(Value *Val, Instruction *OrigIns) {
- assert(Val);
+ /// UMR warning at runtime if the shadow value is not 0.
+ void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+ assert(Shadow);
if (!InsertChecks) return;
- Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
- if (!Shadow) return;
#ifndef NDEBUG
Type *ShadowTy = Shadow->getType();
assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
"Can only insert checks for integer and vector shadow types");
#endif
- Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
InstrumentationList.push_back(
- ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+ ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+ }
+
+ /// \brief Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will be later instrumented with a check that will print a
+ /// UMR warning at runtime if the value is not fully defined.
+ void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+ assert(Val);
+ Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+ if (!Shadow) return;
+ Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+ insertShadowCheck(Shadow, Origin, OrigIns);
+ }
+
+ AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+ switch (a) {
+ case NotAtomic:
+ return NotAtomic;
+ case Unordered:
+ case Monotonic:
+ case Release:
+ return Release;
+ case Acquire:
+ case AcquireRelease:
+ return AcquireRelease;
+ case SequentiallyConsistent:
+ return SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
+
+ AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+ switch (a) {
+ case NotAtomic:
+ return NotAtomic;
+ case Unordered:
+ case Monotonic:
+ case Acquire:
+ return Acquire;
+ case Release:
+ case AcquireRelease:
+ return AcquireRelease;
+ case SequentiallyConsistent:
+ return SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
}
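The two helpers above only ever strengthen an ordering: stores end up at least release and loads at least acquire, so the extra shadow accesses MSan emits around an atomic stay correctly ordered with it. A minimal sketch of the same mapping, using std::memory_order as a stand-in for llvm::AtomicOrdering (an assumption made purely for illustration; memory_order has no NotAtomic/Unordered states):

#include <atomic>
#include <cassert>

// Strengthen an ordering so it is at least a release operation.
static std::memory_order addRelease(std::memory_order O) {
  switch (O) {
  case std::memory_order_relaxed:
  case std::memory_order_release:
    return std::memory_order_release;
  case std::memory_order_consume:
  case std::memory_order_acquire:
  case std::memory_order_acq_rel:
    return std::memory_order_acq_rel;
  default:
    return std::memory_order_seq_cst;
  }
}

// Strengthen an ordering so it is at least an acquire operation.
static std::memory_order addAcquire(std::memory_order O) {
  switch (O) {
  case std::memory_order_relaxed:
  case std::memory_order_consume:
  case std::memory_order_acquire:
    return std::memory_order_acquire;
  case std::memory_order_release:
  case std::memory_order_acq_rel:
    return std::memory_order_acq_rel;
  default:
    return std::memory_order_seq_cst;
  }
}

int main() {
  assert(addRelease(std::memory_order_relaxed) == std::memory_order_release);
  assert(addAcquire(std::memory_order_release) == std::memory_order_acq_rel);
  return 0;
}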
// ------------------- Visitors.
@@ -859,7 +1024,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// Optionally, checks that the load address is fully defined.
void visitLoadInst(LoadInst &I) {
assert(I.getType()->isSized() && "Load type must have size");
- IRBuilder<> IRB(&I);
+ IRBuilder<> IRB(I.getNextNode());
Type *ShadowTy = getShadowTy(&I);
Value *Addr = I.getPointerOperand();
if (LoadShadow) {
@@ -871,7 +1036,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
if (ClCheckAccessAddress)
- insertCheck(I.getPointerOperand(), &I);
+ insertShadowCheck(I.getPointerOperand(), &I);
+
+ if (I.isAtomic())
+ I.setOrdering(addAcquireOrdering(I.getOrdering()));
if (MS.TrackOrigins) {
if (LoadShadow) {
@@ -892,9 +1060,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
StoreList.push_back(&I);
}
+ void handleCASOrRMW(Instruction &I) {
+ assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getOperand(0);
+ Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB);
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // Only check the compare argument of the cmpxchg instruction. The other
+ // argument can potentially be uninitialized, but we cannot detect this
+ // situation reliably without risking false positives.
+ if (isa<AtomicCmpXchgInst>(I))
+ insertShadowCheck(I.getOperand(1), &I);
+
+ IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
+
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ void visitAtomicRMWInst(AtomicRMWInst &I) {
+ handleCASOrRMW(I);
+ I.setOrdering(addReleaseOrdering(I.getOrdering()));
+ }
+
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ handleCASOrRMW(I);
+ I.setOrdering(addReleaseOrdering(I.getOrdering()));
+ }
+
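handleCASOrRMW encodes a deliberately conservative shadow rule for atomics. A plain-C++ model of the cmpxchg case, with illustrative names only (this is a model of the rule, not the pass code):

#include <cstdint>
#include <cstdio>

struct ShadowedWord {
  uint64_t Value;
  uint64_t Shadow; // 0 = fully initialized
};

static uint64_t model_cmpxchg(ShadowedWord &Mem, const ShadowedWord &Cmp,
                              const ShadowedWord &New) {
  // The compare operand is checked: an uninitialized compare value silently
  // changes control-dependent behavior.
  if (Cmp.Shadow != 0)
    std::fprintf(stderr, "use-of-uninitialized-value in cmpxchg compare\n");
  // New.Shadow is intentionally neither checked nor propagated: checking it
  // would produce false positives.
  uint64_t Old = Mem.Value;
  if (Mem.Value == Cmp.Value)
    Mem.Value = New.Value;
  Mem.Shadow = 0; // the target memory's shadow becomes clean
  return Old;     // the result is treated as fully initialized as well
}

int main() {
  ShadowedWord M{1, 0}, Cmp{1, 0}, New{2, 0xff};
  model_cmpxchg(M, Cmp, New);
  std::printf("mem=%llu shadow=%llu\n", (unsigned long long)M.Value,
              (unsigned long long)M.Shadow);
  return 0;
}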
// Vector manipulation.
void visitExtractElementInst(ExtractElementInst &I) {
- insertCheck(I.getOperand(1), &I);
+ insertShadowCheck(I.getOperand(1), &I);
IRBuilder<> IRB(&I);
setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
"_msprop"));
@@ -902,7 +1101,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitInsertElementInst(InsertElementInst &I) {
- insertCheck(I.getOperand(2), &I);
+ insertShadowCheck(I.getOperand(2), &I);
IRBuilder<> IRB(&I);
setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
I.getOperand(2), "_msprop"));
@@ -910,7 +1109,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitShuffleVectorInst(ShuffleVectorInst &I) {
- insertCheck(I.getOperand(2), &I);
+ insertShadowCheck(I.getOperand(2), &I);
IRBuilder<> IRB(&I);
setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
I.getOperand(2), "_msprop"));
@@ -1109,18 +1308,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// \brief Cast between two shadow types, extending or truncating as
/// necessary.
- Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+ Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+ bool Signed = false) {
Type *srcTy = V->getType();
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
- return IRB.CreateIntCast(V, dstTy, false);
+ return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
- return IRB.CreateIntCast(V, dstTy, false);
+ return IRB.CreateIntCast(V, dstTy, Signed);
size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
- IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+ IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
return IRB.CreateBitCast(V2, dstTy);
// TODO: handle struct types.
}
@@ -1145,7 +1345,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void handleDiv(Instruction &I) {
IRBuilder<> IRB(&I);
// Strict on the second argument.
- insertCheck(I.getOperand(1), &I);
+ insertShadowCheck(I.getOperand(1), &I);
setShadow(&I, getShadow(&I, 0));
setOrigin(&I, getOrigin(&I, 0));
}
@@ -1428,7 +1628,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
if (ClCheckAccessAddress)
- insertCheck(Addr, &I);
+ insertShadowCheck(Addr, &I);
// FIXME: use ClStoreCleanOrigin
// FIXME: factor out common code from materializeStores
@@ -1455,9 +1655,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setShadow(&I, getCleanShadow(&I));
}
-
if (ClCheckAccessAddress)
- insertCheck(Addr, &I);
+ insertShadowCheck(Addr, &I);
if (MS.TrackOrigins) {
if (LoadShadow)
@@ -1554,11 +1753,119 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(Op));
}
+ // \brief Instrument vector convert intrinsic.
+ //
+ // This function instruments intrinsics like cvtsi2ss:
+ // %Out = int_xxx_cvtyyy(%ConvertOp)
+ // or
+ // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+ // The intrinsic converts \p NumUsedElements elements of \p ConvertOp to the
+ // same number of \p Out elements, and (if it has 2 arguments) copies the
+ // rest of the elements from \p CopyOp.
+ // In most cases the conversion involves a floating-point value, which may
+ // trigger a hardware exception when not fully initialized. For this reason
+ // we require \p ConvertOp[0:NumUsedElements] to be fully initialized and
+ // trap otherwise.
+ // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+ // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+ // return a fully initialized value.
+ void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+ IRBuilder<> IRB(&I);
+ Value *CopyOp, *ConvertOp;
+
+ switch (I.getNumArgOperands()) {
+ case 2:
+ CopyOp = I.getArgOperand(0);
+ ConvertOp = I.getArgOperand(1);
+ break;
+ case 1:
+ ConvertOp = I.getArgOperand(0);
+ CopyOp = NULL;
+ break;
+ default:
+ llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+ }
+
+ // The first *NumUsedElements* elements of ConvertOp are converted to the
+ // same number of output elements. The rest of the output is copied from
+ // CopyOp, or (if not available) filled with zeroes.
+ // Combine shadow for elements of ConvertOp that are used in this operation,
+ // and insert a check.
+ // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+ // int->any conversion.
+ Value *ConvertShadow = getShadow(ConvertOp);
+ Value *AggShadow = 0;
+ if (ConvertOp->getType()->isVectorTy()) {
+ AggShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1; i < NumUsedElements; ++i) {
+ Value *MoreShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+ }
+ } else {
+ AggShadow = ConvertShadow;
+ }
+ assert(AggShadow->getType()->isIntegerTy());
+ insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+ // Build result shadow by zero-filling parts of CopyOp shadow that come from
+ // ConvertOp.
+ if (CopyOp) {
+ assert(CopyOp->getType() == I.getType());
+ assert(CopyOp->getType()->isVectorTy());
+ Value *ResultShadow = getShadow(CopyOp);
+ Type *EltTy = ResultShadow->getType()->getVectorElementType();
+ for (int i = 0; i < NumUsedElements; ++i) {
+ ResultShadow = IRB.CreateInsertElement(
+ ResultShadow, ConstantInt::getNullValue(EltTy),
+ ConstantInt::get(IRB.getInt32Ty(), i));
+ }
+ setShadow(&I, ResultShadow);
+ setOrigin(&I, getOrigin(CopyOp));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+ }
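Outside of IR, the shadow rule implemented by handleVectorConvertIntrinsic can be modelled on plain arrays: OR together the shadows of the lanes that feed the conversion and require them to be clean, then build the result shadow from CopyOp's shadow with the converted lanes cleared (or all zeros when there is no CopyOp). A small sketch with illustrative names:

#include <cstdint>
#include <cstdio>
#include <vector>

struct VecShadow {
  std::vector<uint32_t> Lanes; // 0 = fully initialized, nonzero = poisoned bits
};

static VecShadow convertShadow(const VecShadow &ConvertOp,
                               const VecShadow *CopyOp, int NumUsedElements) {
  // Combine the shadows of the lanes actually used by the conversion.
  uint32_t Agg = 0;
  for (int i = 0; i < NumUsedElements; ++i)
    Agg |= ConvertOp.Lanes[i];
  if (Agg != 0)
    std::fprintf(stderr, "use-of-uninitialized-value in convert operand\n");

  VecShadow Res;
  if (CopyOp) {
    Res = *CopyOp;
    for (int i = 0; i < NumUsedElements; ++i)
      Res.Lanes[i] = 0; // converted lanes become fully initialized
  } else {
    Res.Lanes.assign(ConvertOp.Lanes.size(), 0); // no CopyOp: clean result
  }
  return Res;
}

int main() {
  VecShadow Conv{{0, 0xffffffffu, 0, 0}}; // lane 1 poisoned, but unused below
  VecShadow Copy{{0xff, 0, 0, 0}};        // lane 0 of CopyOp poisoned
  VecShadow R = convertShadow(Conv, &Copy, /*NumUsedElements=*/1);
  std::printf("result lane 0 shadow: %u\n", R.Lanes[0]); // cleared to 0
  return 0;
}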
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case llvm::Intrinsic::bswap:
handleBswap(I);
break;
+ case llvm::Intrinsic::x86_avx512_cvtsd2usi64:
+ case llvm::Intrinsic::x86_avx512_cvtsd2usi:
+ case llvm::Intrinsic::x86_avx512_cvtss2usi64:
+ case llvm::Intrinsic::x86_avx512_cvtss2usi:
+ case llvm::Intrinsic::x86_avx512_cvttss2usi64:
+ case llvm::Intrinsic::x86_avx512_cvttss2usi:
+ case llvm::Intrinsic::x86_avx512_cvttsd2usi64:
+ case llvm::Intrinsic::x86_avx512_cvttsd2usi:
+ case llvm::Intrinsic::x86_avx512_cvtusi2sd:
+ case llvm::Intrinsic::x86_avx512_cvtusi2ss:
+ case llvm::Intrinsic::x86_avx512_cvtusi642sd:
+ case llvm::Intrinsic::x86_avx512_cvtusi642ss:
+ case llvm::Intrinsic::x86_sse2_cvtsd2si64:
+ case llvm::Intrinsic::x86_sse2_cvtsd2si:
+ case llvm::Intrinsic::x86_sse2_cvtsd2ss:
+ case llvm::Intrinsic::x86_sse2_cvtsi2sd:
+ case llvm::Intrinsic::x86_sse2_cvtsi642sd:
+ case llvm::Intrinsic::x86_sse2_cvtss2sd:
+ case llvm::Intrinsic::x86_sse2_cvttsd2si64:
+ case llvm::Intrinsic::x86_sse2_cvttsd2si:
+ case llvm::Intrinsic::x86_sse_cvtsi2ss:
+ case llvm::Intrinsic::x86_sse_cvtsi642ss:
+ case llvm::Intrinsic::x86_sse_cvtss2si64:
+ case llvm::Intrinsic::x86_sse_cvtss2si:
+ case llvm::Intrinsic::x86_sse_cvttss2si64:
+ case llvm::Intrinsic::x86_sse_cvttss2si:
+ handleVectorConvertIntrinsic(I, 1);
+ break;
+ case llvm::Intrinsic::x86_sse2_cvtdq2pd:
+ case llvm::Intrinsic::x86_sse2_cvtps2pd:
+ case llvm::Intrinsic::x86_sse_cvtps2pi:
+ case llvm::Intrinsic::x86_sse_cvttps2pi:
+ handleVectorConvertIntrinsic(I, 2);
+ break;
default:
if (!handleUnknownIntrinsic(I))
visitInstruction(I);
@@ -1604,6 +1911,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
IRBuilder<> IRB(&I);
+
+ if (MS.WrapIndirectCalls && !CS.getCalledFunction())
+ IndirectCallList.push_back(CS);
+
unsigned ArgOffset = 0;
DEBUG(dbgs() << " CallSite: " << I << "\n");
for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
@@ -1647,7 +1958,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
DEBUG(dbgs() << " done with call args\n");
FunctionType *FT =
- cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0));
+ cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
if (FT->isVarArg()) {
VAHelper->visitCallSite(CS, IRB);
}
@@ -1686,12 +1997,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void visitReturnInst(ReturnInst &I) {
IRBuilder<> IRB(&I);
- if (Value *RetVal = I.getReturnValue()) {
- // Set the shadow for the RetVal.
+ Value *RetVal = I.getReturnValue();
+ if (!RetVal) return;
+ Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+ if (CheckReturnValue) {
+ insertShadowCheck(RetVal, &I);
+ Value *Shadow = getCleanShadow(RetVal);
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ } else {
Value *Shadow = getShadow(RetVal);
- Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
- DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n");
IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ // FIXME: make it conditional if ClStoreCleanOrigin==0
if (MS.TrackOrigins)
IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
}
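The net effect of the CheckReturnValue split above can be modelled without IR: a checked function ("main" for now) reports at the return site and always publishes a clean return-value shadow, while other functions forward their shadow to the caller through the thread-local retval slot. The slot name below is only an illustrative stand-in for the runtime's TLS variable:

#include <cstdint>
#include <cstdio>

static thread_local uint64_t retval_shadow_tls = 0; // illustrative TLS slot

static uint64_t return_unchecked(uint64_t Value, uint64_t Shadow) {
  retval_shadow_tls = Shadow; // propagate the shadow to the caller
  return Value;
}

static uint64_t return_checked(uint64_t Value, uint64_t Shadow) {
  if (Shadow != 0)
    std::fprintf(stderr, "use-of-uninitialized-value in return value\n");
  retval_shadow_tls = 0; // the caller always sees a fully initialized result
  return Value;
}

int main() {
  return_unchecked(42, 0xff);
  std::printf("propagated shadow: %llx\n",
              (unsigned long long)retval_shadow_tls);
  return_checked(0, 0xff); // reports, then publishes clean shadow
  std::printf("checked shadow: %llx\n", (unsigned long long)retval_shadow_tls);
  return 0;
}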
@@ -1734,18 +2050,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Descr =
createPrivateNonConstGlobalForString(*F.getParent(),
StackDescription.str());
- IRB.CreateCall3(MS.MsanSetAllocaOriginFn,
+
+ IRB.CreateCall4(MS.MsanSetAllocaOrigin4Fn,
IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
ConstantInt::get(MS.IntptrTy, Size),
- IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()));
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(&F, MS.IntptrTy));
}
}
void visitSelectInst(SelectInst& I) {
IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateSelect(I.getCondition(),
- getShadow(I.getTrueValue()), getShadow(I.getFalseValue()),
- "_msprop"));
+ // a = select b, c, d
+ Value *S = IRB.CreateSelect(I.getCondition(), getShadow(I.getTrueValue()),
+ getShadow(I.getFalseValue()));
+ if (I.getType()->isAggregateType()) {
+ // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
+ // an extra "select". This results in much more compact IR.
+ // Sa = select Sb, poisoned, (select b, Sc, Sd)
+ S = IRB.CreateSelect(getShadow(I.getCondition()),
+ getPoisonedShadow(getShadowTy(I.getType())), S,
+ "_msprop_select_agg");
+ } else {
+ // Sa = (sext Sb) | (select b, Sc, Sd)
+ S = IRB.CreateOr(S, CreateShadowCast(IRB, getShadow(I.getCondition()),
+ S->getType(), true),
+ "_msprop_select");
+ }
+ setShadow(&I, S);
if (MS.TrackOrigins) {
// Origins are always i32, so any vector conditions must be flattened.
// FIXME: consider tracking vector origins for app vectors?
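The scalar case of the new select rule, Sa = (sext Sb) | (select b, Sc, Sd), is easy to check with plain integers; the sketch below is a model of the rule rather than the pass code:

#include <cassert>
#include <cstdint>

// b/Sb are the condition and its 1-bit shadow; Sc/Sd are the operand shadows.
static uint32_t selectShadow(bool b, bool Sb, uint32_t Sc, uint32_t Sd) {
  uint32_t CondShadow = Sb ? 0xffffffffu : 0u; // sext i1 -> i32
  return CondShadow | (b ? Sc : Sd);
}

int main() {
  assert(selectShadow(true, false, 0x0f, 0x00) == 0x0f);        // picks Sc
  assert(selectShadow(false, false, 0x0f, 0x00) == 0x00);       // picks Sd
  assert(selectShadow(true, true, 0x00, 0x00) == 0xffffffffu);  // poisoned cond
  return 0;
}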
@@ -1780,7 +2112,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
setShadow(&I, ResShadow);
- setOrigin(&I, getCleanOrigin());
+ setOriginForNaryOp(I);
}
void visitInsertValueInst(InsertValueInst &I) {
@@ -1793,7 +2125,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
DEBUG(dbgs() << " Res: " << *Res << "\n");
setShadow(&I, Res);
- setOrigin(&I, getCleanOrigin());
+ setOriginForNaryOp(I);
}
void dumpInst(Instruction &I) {
@@ -1816,7 +2148,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
dumpInst(I);
DEBUG(dbgs() << "DEFAULT: " << I << "\n");
for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
- insertCheck(I.getOperand(i), &I);
+ insertShadowCheck(I.getOperand(i), &I);
setShadow(&I, getCleanShadow(&I));
setOrigin(&I, getCleanOrigin());
}
@@ -1970,8 +2302,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
Value *OverflowArgAreaShadowPtr =
MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
- Value *SrcPtr =
- getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(VAArgTLSCopy, AMD64FpEndOffset);
IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
}
}
diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
deleted file mode 100644
index b45aef6..0000000
--- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments the specified program with counters for edge profiling.
-// Edge profiling can give a reasonable approximation of the hot paths through a
-// program, and is used for a wide variety of program transformations.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-optimal-edge-profiling"
-#include "llvm/Transforms/Instrumentation.h"
-#include "MaximumSpanningTree.h"
-#include "ProfilingUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-STATISTIC(NumEdgesInserted, "The # of edges inserted.");
-
-namespace {
- class OptimalEdgeProfiler : public ModulePass {
- bool runOnModule(Module &M);
- public:
- static char ID; // Pass identification, replacement for typeid
- OptimalEdgeProfiler() : ModulePass(ID) {
- initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequiredID(ProfileEstimatorPassID);
- AU.addRequired<ProfileInfo>();
- }
-
- virtual const char *getPassName() const {
- return "Optimal Edge Profiler";
- }
- };
-}
-
-char OptimalEdgeProfiler::ID = 0;
-INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
- "Insert optimal instrumentation for edge profiling",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
- "Insert optimal instrumentation for edge profiling",
- false, false)
-
-ModulePass *llvm::createOptimalEdgeProfilerPass() {
- return new OptimalEdgeProfiler();
-}
-
-inline static void printEdgeCounter(ProfileInfo::Edge e,
- BasicBlock* b,
- unsigned i) {
- DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \
- << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n");
-}
-
-bool OptimalEdgeProfiler::runOnModule(Module &M) {
- Function *Main = M.getFunction("main");
- if (Main == 0) {
- errs() << "WARNING: cannot insert edge profiling into a module"
- << " with no main function!\n";
- return false; // No main, no instrumentation!
- }
-
- // NumEdges counts all the edges that may be instrumented. Later on it is
- // decided which edges to actually instrument, to achieve optimal profiling.
- // For the entry block a virtual edge (0,entry) is reserved, for each block
- // with no successors an edge (BB,0) is reserved. These edges are necessary
- // to calculate a truly optimal maximum spanning tree and thus an optimal
- // instrumentation.
- unsigned NumEdges = 0;
-
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- // Reserve space for (0,entry) edge.
- ++NumEdges;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- // Keep track of which blocks need to be instrumented. We don't want to
- // instrument blocks that are added as the result of breaking critical
- // edges!
- if (BB->getTerminator()->getNumSuccessors() == 0) {
- // Reserve space for (BB,0) edge.
- ++NumEdges;
- } else {
- NumEdges += BB->getTerminator()->getNumSuccessors();
- }
- }
- }
-
- // In the profiling output a counter for each edge is reserved, but only a
- // few are used. This is done to be able to read back in the profile without
- // calculating the maximum spanning tree again; instead each edge counter that
- // is not used is initialised with -1 to signal that this edge counter has to
- // be calculated from other edge counters on reading the profile info back
- // in.
-
- Type *Int32 = Type::getInt32Ty(M.getContext());
- ArrayType *ATy = ArrayType::get(Int32, NumEdges);
- GlobalVariable *Counters =
- new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
- Constant::getNullValue(ATy), "OptEdgeProfCounters");
- NumEdgesInserted = 0;
-
- std::vector<Constant*> Initializer(NumEdges);
- Constant *Zero = ConstantInt::get(Int32, 0);
- Constant *Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted);
-
- // Instrument all of the edges not in MST...
- unsigned i = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-
- // Calculate a Maximum Spanning Tree with the edge weights determined by
- // ProfileEstimator. ProfileEstimator also assigns weights to the virtual
- // edges (0,entry) and (BB,0) (for blocks with no successors), and these
- // edges also participate in the maximum spanning tree calculation.
- // The third parameter of MaximumSpanningTree() has the effect that not the
- // actual MST is returned but the edges _not_ in the MST.
-
- ProfileInfo::EdgeWeights ECs =
- getAnalysis<ProfileInfo>(*F).getEdgeWeights(F);
- std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end());
- MaximumSpanningTree<BasicBlock> MST(EdgeVector);
- std::stable_sort(MST.begin(), MST.end());
-
- // Check if (0,entry) not in the MST. If not, instrument edge
- // (IncrementCounterInBlock()) and set the counter initially to zero, if
- // the edge is in the MST the counter is initialised to -1.
-
- BasicBlock *entry = &(F->getEntryBlock());
- ProfileInfo::Edge edge = ProfileInfo::getEdge(0, entry);
- if (!std::binary_search(MST.begin(), MST.end(), edge)) {
- printEdgeCounter(edge, entry, i);
- IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted;
- Initializer[i++] = (Zero);
- } else{
- Initializer[i++] = (Uncounted);
- }
-
- // InsertedBlocks contains all blocks that were inserted for splitting an
- // edge; these blocks do not have to be instrumented.
- DenseSet<BasicBlock*> InsertedBlocks;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- // Check if block was not inserted and thus does not have to be
- // instrumented.
- if (InsertedBlocks.count(BB)) continue;
-
- // Okay, we have to add a counter of each outgoing edge not in MST. If
- // the outgoing edge is not critical don't split it, just insert the
- // counter in the source or destination of the edge. Also, if the block
- // has no successors, the virtual edge (BB,0) is processed.
- TerminatorInst *TI = BB->getTerminator();
- if (TI->getNumSuccessors() == 0) {
- ProfileInfo::Edge edge = ProfileInfo::getEdge(BB, 0);
- if (!std::binary_search(MST.begin(), MST.end(), edge)) {
- printEdgeCounter(edge, BB, i);
- IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
- Initializer[i++] = (Zero);
- } else{
- Initializer[i++] = (Uncounted);
- }
- }
- for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
- BasicBlock *Succ = TI->getSuccessor(s);
- ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ);
- if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-
- // If the edge is critical, split it.
- bool wasInserted = SplitCriticalEdge(TI, s, this);
- Succ = TI->getSuccessor(s);
- if (wasInserted)
- InsertedBlocks.insert(Succ);
-
- // Okay, we are guaranteed that the edge is no longer critical. If
- // we only have a single successor, insert the counter in this block,
- // otherwise insert it in the successor block.
- if (TI->getNumSuccessors() == 1) {
- // Insert counter at the start of the block
- printEdgeCounter(edge, BB, i);
- IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
- } else {
- // Insert counter at the start of the block
- printEdgeCounter(edge, Succ, i);
- IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted;
- }
- Initializer[i++] = (Zero);
- } else {
- Initializer[i++] = (Uncounted);
- }
- }
- }
- }
-
- // Check if the number of edges counted at first was the number of edges we
- // considered for instrumentation.
- assert(i == NumEdges && "the number of edges in counting array is wrong");
-
- // Assign the now completely defined initialiser to the array.
- Constant *init = ConstantArray::get(ATy, Initializer);
- Counters->setInitializer(init);
-
- // Add the initialization call to main.
- InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters);
- return true;
-}
-
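For reference, the central idea the deleted pass relied on, instrumenting only the edges outside a maximum-weight spanning tree and reconstructing the rest from flow conservation, can be sketched standalone. The sketch uses Kruskal's algorithm with a union-find over a toy graph; the weights merely stand in for ProfileEstimator's estimates and are not taken from LLVM:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

struct Edge { int U, V; long W; };

struct UnionFind {
  std::vector<int> Parent;
  explicit UnionFind(int N) : Parent(N) {
    std::iota(Parent.begin(), Parent.end(), 0);
  }
  int find(int X) {
    if (Parent[X] != X) Parent[X] = find(Parent[X]);
    return Parent[X];
  }
  bool unite(int A, int B) {
    A = find(A); B = find(B);
    if (A == B) return false;
    Parent[A] = B;
    return true;
  }
};

int main() {
  // Toy CFG: 0 = entry, 3 = exit; heavier edges are the likelier ones.
  std::vector<Edge> Edges = {
      {0, 1, 100}, {1, 2, 90}, {1, 3, 10}, {2, 3, 90}, {2, 1, 5}};
  std::vector<Edge> Sorted = Edges;
  std::sort(Sorted.begin(), Sorted.end(),
            [](const Edge &A, const Edge &B) { return A.W > B.W; });

  UnionFind UF(4);
  std::vector<Edge> Chords; // edges that would get a counter
  for (const Edge &E : Sorted)
    if (!UF.unite(E.U, E.V)) // already connected: not in the spanning tree
      Chords.push_back(E);

  for (const Edge &E : Chords)
    std::printf("instrument edge %d->%d (weight %ld)\n", E.U, E.V, E.W);
  return 0;
}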
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
deleted file mode 100644
index 7de7326..0000000
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ /dev/null
@@ -1,1424 +0,0 @@
-//===- PathProfiling.cpp - Inserts counters for path profiling ------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments functions for Ball-Larus path profiling. Ball-Larus
-// profiling converts the CFG into a DAG by replacing backedges with edges
-// from entry to the start block and from the end block to exit. The paths
- // along the new DAG are enumerated, i.e. each path is given a path number.
-// Edges are instrumented to increment the path number register, such that the
-// path number register will equal the path number of the path taken at the
-// exit.
-//
-// This file defines classes for building a CFG for use with different stages
-// in the Ball-Larus path profiling instrumentation [Ball96]. The
-// requirements are formatting the llvm CFG into the Ball-Larus DAG, path
-// numbering, finding a spanning tree, moving increments from the spanning
-// tree to chords.
-//
-// Terms:
-// DAG - Directed Acyclic Graph.
-// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
-// removed in the following manner. For every backedge
-// v->w, insert edge ENTRY->w and edge v->EXIT.
-// Path Number - The number corresponding to a specific path through a
-// Ball-Larus DAG.
-// Spanning Tree - A subgraph, S, is a spanning tree if S covers all
-// vertices and is a tree.
-// Chord - An edge not in the spanning tree.
-//
-// [Ball96]
-// T. Ball and J. R. Larus. "Efficient Path Profiling."
-// International Symposium on Microarchitecture, pages 46-57, 1996.
-// http://portal.acm.org/citation.cfm?id=243857
-//
-// [Ball94]
-// Thomas Ball. "Efficiently Counting Program Events with Support for
-// On-line queries."
- // ACM Transactions on Programming Languages and Systems, Vol 16, No 5,
-// September 1994, Pages 1399-1410.
-//===----------------------------------------------------------------------===//
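For reference, the path-numbering step described in the header above can be sketched standalone: NumPaths(exit) = 1, NumPaths(v) is the sum over v's successors, and the increment on v's i-th outgoing edge is the number of exit paths through the earlier successors, so the increments along any entry-to-exit path sum to a unique number in [0, NumPaths(entry)). The toy DAG below is hard-coded and processed in reverse topological order; none of this reuses the deleted pass's data structures:

#include <cstdio>
#include <vector>

int main() {
  // Node 0 = entry, node 3 = exit; adjacency lists of successors.
  std::vector<std::vector<int>> Succ = {{1, 2}, {3}, {1, 3}, {}};
  const int N = static_cast<int>(Succ.size());

  std::vector<long> NumPaths(N, 0);
  NumPaths[3] = 1; // exit node
  // Reverse topological order for this DAG: 3 (exit), then 1, 2, 0.
  for (int V : {1, 2, 0})
    for (int S : Succ[V])
      NumPaths[V] += NumPaths[S];

  // Edge increments: Val(v -> succ_i) = sum of NumPaths(succ_j) for j < i.
  for (int V = 0; V < N; ++V) {
    long Prefix = 0;
    for (int S : Succ[V]) {
      std::printf("edge %d->%d increment %ld\n", V, S, Prefix);
      Prefix += NumPaths[S];
    }
  }
  std::printf("total paths from entry: %ld\n", NumPaths[0]);
  return 0;
}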
-#define DEBUG_TYPE "insert-path-profiling"
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <vector>
-
-#define HASH_THRESHHOLD 100000
-
-using namespace llvm;
-
-namespace {
-class BLInstrumentationNode;
-class BLInstrumentationEdge;
-class BLInstrumentationDag;
-
-// ---------------------------------------------------------------------------
- // BLInstrumentationNode extends BallLarusNode with members used by the
- // instrumentation algorithms.
-// ---------------------------------------------------------------------------
-class BLInstrumentationNode : public BallLarusNode {
-public:
- // Creates a new BLInstrumentationNode from a BasicBlock.
- BLInstrumentationNode(BasicBlock* BB);
-
- // Get/sets the Value corresponding to the pathNumber register,
- // constant or phinode. Used by the instrumentation code to remember
- // path number Values.
- Value* getStartingPathNumber();
- void setStartingPathNumber(Value* pathNumber);
-
- Value* getEndingPathNumber();
- void setEndingPathNumber(Value* pathNumber);
-
- // Get/set the PHINode Instruction for this node.
- PHINode* getPathPHI();
- void setPathPHI(PHINode* pathPHI);
-
-private:
-
- Value* _startingPathNumber; // The Value for the current pathNumber.
- Value* _endingPathNumber; // The Value for the current pathNumber.
- PHINode* _pathPHI; // The PHINode for current pathNumber.
-};
-
-// --------------------------------------------------------------------------
-// BLInstrumentationEdge extends BallLarusEdge with data about the
-// instrumentation that will end up on each edge.
-// --------------------------------------------------------------------------
-class BLInstrumentationEdge : public BallLarusEdge {
-public:
- BLInstrumentationEdge(BLInstrumentationNode* source,
- BLInstrumentationNode* target);
-
- // Sets the target node of this edge. Required to split edges.
- void setTarget(BallLarusNode* node);
-
- // Get/set whether edge is in the spanning tree.
- bool isInSpanningTree() const;
- void setIsInSpanningTree(bool isInSpanningTree);
-
- // Get/set whether this edge will be instrumented with a path number
- // initialization.
- bool isInitialization() const;
- void setIsInitialization(bool isInitialization);
-
- // Get/set whether this edge will be instrumented with a path counter
- // increment. Notice this is incrementing the path counter
- // corresponding to the path number register. The path number
- // increment is determined by getIncrement().
- bool isCounterIncrement() const;
- void setIsCounterIncrement(bool isCounterIncrement);
-
- // Get/set the path number increment that this edge will be instrumented
- // with. This is distinct from the path counter increment and the
- // weight. The counter increment counts the number of executions of
- // some path, whereas the path number keeps track of which path number
- // the program is on.
- long getIncrement() const;
- void setIncrement(long increment);
-
- // Get/set whether the edge has been instrumented.
- bool hasInstrumentation();
- void setHasInstrumentation(bool hasInstrumentation);
-
- // Returns the successor number of this edge in the source.
- unsigned getSuccessorNumber();
-
-private:
- // The increment that the code will be instrumented with.
- long long _increment;
-
- // Whether this edge is in the spanning tree.
- bool _isInSpanningTree;
-
- // Whether this edge is an initialization of the path number.
- bool _isInitialization;
-
- // Whether this edge is a path counter increment.
- bool _isCounterIncrement;
-
- // Whether this edge has been instrumented.
- bool _hasInstrumentation;
-};
-
-// ---------------------------------------------------------------------------
-// BLInstrumentationDag extends BallLarusDag with algorithms that
-// determine where instrumentation should be placed.
-// ---------------------------------------------------------------------------
-class BLInstrumentationDag : public BallLarusDag {
-public:
- BLInstrumentationDag(Function &F);
-
- // Returns the Exit->Root edge. This edge is required for creating
- // directed cycles in the algorithm for moving instrumentation off of
- // the spanning tree
- BallLarusEdge* getExitRootEdge();
-
- // Returns an array of phony edges which mark those nodes
- // with function calls
- BLEdgeVector getCallPhonyEdges();
-
- // Gets/sets the path counter array
- GlobalVariable* getCounterArray();
- void setCounterArray(GlobalVariable* c);
-
- // Calculates the increments for the chords, thereby removing
- // instrumentation from the spanning tree edges. Implementation is based
- // on the algorithm in Figure 4 of [Ball94]
- void calculateChordIncrements();
-
- // Updates the state when an edge has been split
- void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock);
-
- // Calculates a spanning tree of the DAG ignoring cycles. Whichever
- // edges are in the spanning tree will not be instrumented, but this
- // implementation does not try to minimize the instrumentation overhead
- // by trying to find hot edges.
- void calculateSpanningTree();
-
- // Pushes initialization further down in order to group the first
- // increment and initialization.
- void pushInitialization();
-
- // Pushes the path counter increments up in order to group the last path
- // number increment.
- void pushCounters();
-
- // Removes phony edges from the successor list of the source, and the
- // predecessor list of the target.
- void unlinkPhony();
-
- // Generate dot graph for the function
- void generateDotGraph();
-
-protected:
- // BLInstrumentationDag creates BLInstrumentationNode objects in this
- // method overriding the creation of BallLarusNode objects.
- //
- // Allows subclasses to determine which type of Node is created.
- // Override this method to produce subclasses of BallLarusNode if
- // necessary.
- virtual BallLarusNode* createNode(BasicBlock* BB);
-
- // BLInstrumentationDag create BLInstrumentationEdges.
- //
- // Allows subclasses to determine which type of Edge is created.
- // Override this method to produce subclasses of BallLarusEdge if
- // necessary. Parameters source and target will have been created by
- // createNode and can be cast to the subclass of BallLarusNode*
- // returned by createNode.
- virtual BallLarusEdge* createEdge(
- BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber);
-
-private:
- BLEdgeVector _treeEdges; // All edges in the spanning tree.
- BLEdgeVector _chordEdges; // All edges not in the spanning tree.
- GlobalVariable* _counterArray; // Array to store path counters
-
- // Removes the edge from the appropriate predecessor and successor lists.
- void unlinkEdge(BallLarusEdge* edge);
-
- // Makes an edge part of the spanning tree.
- void makeEdgeSpanning(BLInstrumentationEdge* edge);
-
- // Pushes initialization and calls itself recursively.
- void pushInitializationFromEdge(BLInstrumentationEdge* edge);
-
- // Pushes path counter increments up recursively.
- void pushCountersFromEdge(BLInstrumentationEdge* edge);
-
- // Depth first algorithm for determining the chord increments.
- void calculateChordIncrementsDfs(
- long weight, BallLarusNode* v, BallLarusEdge* e);
-
- // Determines the relative direction of two edges.
- int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f);
-};
-
-// ---------------------------------------------------------------------------
-// PathProfiler is a module pass which instruments path profiling instructions
-// ---------------------------------------------------------------------------
-class PathProfiler : public ModulePass {
-private:
- // Current context for multi threading support.
- LLVMContext* Context;
-
- // Which function are we currently instrumenting
- unsigned currentFunctionNumber;
-
- // The function prototype in the profiling runtime for incrementing a
- // single path counter in a hash table.
- Constant* llvmIncrementHashFunction;
- Constant* llvmDecrementHashFunction;
-
- // Instruments each function with path profiling. 'main' is instrumented
- // with code to save the profile to disk.
- bool runOnModule(Module &M);
-
- // Analyzes the function for Ball-Larus path profiling, and inserts code.
- void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M);
-
- // Creates an increment constant representing incr.
- ConstantInt* createIncrementConstant(long incr, int bitsize);
-
- // Creates an increment constant representing the value in
- // edge->getIncrement().
- ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge);
-
- // Finds the insertion point after pathNumber in block. PathNumber may
- // be NULL.
- BasicBlock::iterator getInsertionPoint(
- BasicBlock* block, Value* pathNumber);
-
- // Inserts source's pathNumber Value* into target. Target may or may not
- // have multiple predecessors, and may or may not have its phiNode
- // initialized.
- void pushValueIntoNode(
- BLInstrumentationNode* source, BLInstrumentationNode* target);
-
- // Inserts source's pathNumber Value* into the appropriate slot of
- // target's phiNode.
- void pushValueIntoPHI(
- BLInstrumentationNode* target, BLInstrumentationNode* source);
-
- // The Value* in node, oldVal, is updated with a Value* corresponding to
- // oldVal + addition.
- void insertNumberIncrement(BLInstrumentationNode* node, Value* addition,
- bool atBeginning);
-
- // Creates a counter increment in the given node. The Value* in node is
- // taken as the index into a hash table.
- void insertCounterIncrement(
- Value* incValue,
- BasicBlock::iterator insertPoint,
- BLInstrumentationDag* dag,
- bool increment = true);
-
- // A PHINode is created in the node, and its values initialized to -1U.
- void preparePHI(BLInstrumentationNode* node);
-
- // Inserts instrumentation for the given edge
- //
- // Pre: The edge's source node has pathNumber set if edge is non zero
- // path number increment.
- //
- // Post: Edge's target node has a pathNumber set to the path number Value
- // corresponding to the value of the path register after edge's
- // execution.
- void insertInstrumentationStartingAt(
- BLInstrumentationEdge* edge,
- BLInstrumentationDag* dag);
-
- // If this edge is a critical edge, then inserts a node at this edge.
- // This edge becomes the first edge, and a new BallLarusEdge is created.
- bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag);
-
- // Inserts instrumentation according to the marked edges in dag. Phony
- // edges must be unlinked from the DAG, but accessible from the
- // backedges. Dag must have initializations, path number increments, and
- // counter increments present.
- //
- // Counter storage is created here.
- void insertInstrumentation( BLInstrumentationDag& dag, Module &M);
-
-public:
- static char ID; // Pass identification, replacement for typeid
- PathProfiler() : ModulePass(ID) {
- initializePathProfilerPass(*PassRegistry::getPassRegistry());
- }
-
- virtual const char *getPassName() const {
- return "Path Profiler";
- }
-};
-} // end anonymous namespace
-
-// Should we print the dot-graphs
-static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden,
- cl::desc("Output the path profiling DAG for each function."));
-
-// Register the path profiler as a pass
-char PathProfiler::ID = 0;
-INITIALIZE_PASS(PathProfiler, "insert-path-profiling",
- "Insert instrumentation for Ball-Larus path profiling",
- false, false)
-
-ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); }
-
-namespace llvm {
- class PathProfilingFunctionTable {};
-
- // Type for global array storing references to hashes or arrays
- template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable,
- xcompile> {
- public:
- static StructType *get(LLVMContext& C) {
- return( StructType::get(
- TypeBuilder<types::i<32>, xcompile>::get(C), // type
- TypeBuilder<types::i<32>, xcompile>::get(C), // array size
- TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr
- NULL));
- }
- };
-
- typedef TypeBuilder<PathProfilingFunctionTable, true>
- ftEntryTypeBuilder;
-
- // BallLarusEdge << operator overloading
- raw_ostream& operator<<(raw_ostream& os,
- const BLInstrumentationEdge& edge)
- LLVM_ATTRIBUTE_USED;
- raw_ostream& operator<<(raw_ostream& os,
- const BLInstrumentationEdge& edge) {
- os << "[" << edge.getSource()->getName() << " -> "
- << edge.getTarget()->getName() << "] init: "
- << (edge.isInitialization() ? "yes" : "no")
- << " incr:" << edge.getIncrement() << " cinc: "
- << (edge.isCounterIncrement() ? "yes" : "no");
- return(os);
- }
-}
-
-// Creates a new BLInstrumentationNode from a BasicBlock.
-BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) :
- BallLarusNode(BB),
- _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {}
-
-// Constructor for BLInstrumentationEdge.
-BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source,
- BLInstrumentationNode* target)
- : BallLarusEdge(source, target, 0),
- _increment(0), _isInSpanningTree(false), _isInitialization(false),
- _isCounterIncrement(false), _hasInstrumentation(false) {}
-
-// Sets the target node of this edge. Required to split edges.
-void BLInstrumentationEdge::setTarget(BallLarusNode* node) {
- _target = node;
-}
-
-// Returns whether this edge is in the spanning tree.
-bool BLInstrumentationEdge::isInSpanningTree() const {
- return(_isInSpanningTree);
-}
-
-// Sets whether this edge is in the spanning tree.
-void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) {
- _isInSpanningTree = isInSpanningTree;
-}
-
-// Returns whether this edge will be instrumented with a path number
-// initialization.
-bool BLInstrumentationEdge::isInitialization() const {
- return(_isInitialization);
-}
-
-// Sets whether this edge will be instrumented with a path number
-// initialization.
-void BLInstrumentationEdge::setIsInitialization(bool isInitialization) {
- _isInitialization = isInitialization;
-}
-
-// Returns whether this edge will be instrumented with a path counter
-// increment. Notice this is incrementing the path counter
-// corresponding to the path number register. The path number
-// increment is determined by getIncrement().
-bool BLInstrumentationEdge::isCounterIncrement() const {
- return(_isCounterIncrement);
-}
-
-// Sets whether this edge will be instrumented with a path counter
-// increment.
-void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) {
- _isCounterIncrement = isCounterIncrement;
-}
-
-// Gets the path number increment that this edge will be instrumented
-// with. This is distinct from the path counter increment and the
- // weight. The counter increment counts the number of executions of
-// some path, whereas the path number keeps track of which path number
-// the program is on.
-long BLInstrumentationEdge::getIncrement() const {
- return(_increment);
-}
-
-// Set whether this edge will be instrumented with a path number
-// increment.
-void BLInstrumentationEdge::setIncrement(long increment) {
- _increment = increment;
-}
-
-// True iff the edge has already been instrumented.
-bool BLInstrumentationEdge::hasInstrumentation() {
- return(_hasInstrumentation);
-}
-
-// Set whether this edge has been instrumented.
-void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) {
- _hasInstrumentation = hasInstrumentation;
-}
-
-// Returns the successor number of this edge in the source.
-unsigned BLInstrumentationEdge::getSuccessorNumber() {
- BallLarusNode* sourceNode = getSource();
- BallLarusNode* targetNode = getTarget();
- BasicBlock* source = sourceNode->getBlock();
- BasicBlock* target = targetNode->getBlock();
-
- if(source == NULL || target == NULL)
- return(0);
-
- TerminatorInst* terminator = source->getTerminator();
-
- unsigned i;
- for(i=0; i < terminator->getNumSuccessors(); i++) {
- if(terminator->getSuccessor(i) == target)
- break;
- }
-
- return(i);
-}
-
-// BLInstrumentationDag constructor initializes a DAG for the given Function.
-BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F),
- _counterArray(0) {
-}
-
-// Returns the Exit->Root edge. This edge is required for creating
-// directed cycles in the algorithm for moving instrumentation off of
-// the spanning tree
-BallLarusEdge* BLInstrumentationDag::getExitRootEdge() {
- BLEdgeIterator erEdge = getExit()->succBegin();
- return(*erEdge);
-}
-
-BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () {
- BLEdgeVector callEdges;
-
- for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
- edge != end; edge++ ) {
- if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY )
- callEdges.push_back(*edge);
- }
-
- return callEdges;
-}
-
-// Gets the path counter array
-GlobalVariable* BLInstrumentationDag::getCounterArray() {
- return _counterArray;
-}
-
-void BLInstrumentationDag::setCounterArray(GlobalVariable* c) {
- _counterArray = c;
-}
-
-// Calculates the increment for the chords, thereby removing
-// instrumentation from the spanning tree edges. Implementation is based on
-// the algorithm in Figure 4 of [Ball94]
-void BLInstrumentationDag::calculateChordIncrements() {
- calculateChordIncrementsDfs(0, getRoot(), NULL);
-
- BLInstrumentationEdge* chord;
- for(BLEdgeIterator chordEdge = _chordEdges.begin(),
- end = _chordEdges.end(); chordEdge != end; chordEdge++) {
- chord = (BLInstrumentationEdge*) *chordEdge;
- chord->setIncrement(chord->getIncrement() + chord->getWeight());
- }
-}
-
-// Updates the state when an edge has been split
-void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge,
- BasicBlock* newBlock) {
- BallLarusNode* oldTarget = formerEdge->getTarget();
- BallLarusNode* newNode = addNode(newBlock);
- formerEdge->setTarget(newNode);
- newNode->addPredEdge(formerEdge);
-
- DEBUG(dbgs() << " Edge split: " << *formerEdge << "\n");
-
- oldTarget->removePredEdge(formerEdge);
- BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0);
-
- if( formerEdge->getType() == BallLarusEdge::BACKEDGE ||
- formerEdge->getType() == BallLarusEdge::SPLITEDGE) {
- newEdge->setType(formerEdge->getType());
- newEdge->setPhonyRoot(formerEdge->getPhonyRoot());
- newEdge->setPhonyExit(formerEdge->getPhonyExit());
- formerEdge->setType(BallLarusEdge::NORMAL);
- formerEdge->setPhonyRoot(NULL);
- formerEdge->setPhonyExit(NULL);
- }
-}
-
-// Calculates a spanning tree of the DAG ignoring cycles. Whichever
-// edges are in the spanning tree will not be instrumented, but this
-// implementation does not try to minimize the instrumentation overhead
-// by trying to find hot edges.
-void BLInstrumentationDag::calculateSpanningTree() {
- std::stack<BallLarusNode*> dfsStack;
-
- for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end();
- nodeIt != end; nodeIt++) {
- (*nodeIt)->setColor(BallLarusNode::WHITE);
- }
-
- dfsStack.push(getRoot());
- while(dfsStack.size() > 0) {
- BallLarusNode* node = dfsStack.top();
- dfsStack.pop();
-
- if(node->getColor() == BallLarusNode::WHITE)
- continue;
-
- BallLarusNode* nextNode;
- bool forward = true;
- BLEdgeIterator succEnd = node->succEnd();
-
- node->setColor(BallLarusNode::WHITE);
- // first iterate over successors then predecessors
- for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd();
- edge != predEnd; edge++) {
- if(edge == succEnd) {
- edge = node->predBegin();
- forward = false;
- }
-
- // Ignore split edges
- if ((*edge)->getType() == BallLarusEdge::SPLITEDGE)
- continue;
-
- nextNode = forward? (*edge)->getTarget(): (*edge)->getSource();
- if(nextNode->getColor() != BallLarusNode::WHITE) {
- nextNode->setColor(BallLarusNode::WHITE);
- makeEdgeSpanning((BLInstrumentationEdge*)(*edge));
- }
- }
- }
-
- for(BLEdgeIterator edge = _edges.begin(), end = _edges.end();
- edge != end; edge++) {
- BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge);
- // safe since createEdge is overridden
- if(!instEdge->isInSpanningTree() && (*edge)->getType()
- != BallLarusEdge::SPLITEDGE)
- _chordEdges.push_back(instEdge);
- }
-}
-
-// Pushes initialization further down in order to group the first
-// increment and initialization.
-void BLInstrumentationDag::pushInitialization() {
- BLInstrumentationEdge* exitRootEdge =
- (BLInstrumentationEdge*) getExitRootEdge();
- exitRootEdge->setIsInitialization(true);
- pushInitializationFromEdge(exitRootEdge);
-}
-
-// Pushes the path counter increments up in order to group the last path
-// number increment.
-void BLInstrumentationDag::pushCounters() {
- BLInstrumentationEdge* exitRootEdge =
- (BLInstrumentationEdge*) getExitRootEdge();
- exitRootEdge->setIsCounterIncrement(true);
- pushCountersFromEdge(exitRootEdge);
-}
-
-// Removes phony edges from the successor list of the source, and the
-// predecessor list of the target.
-void BLInstrumentationDag::unlinkPhony() {
- BallLarusEdge* edge;
-
- for(BLEdgeIterator next = _edges.begin(),
- end = _edges.end(); next != end; next++) {
- edge = (*next);
-
- if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
- edge->getType() == BallLarusEdge::SPLITEDGE_PHONY ||
- edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) {
- unlinkEdge(edge);
- }
- }
-}
-
-// Generate a .dot graph to represent the DAG and pathNumbers
-void BLInstrumentationDag::generateDotGraph() {
- std::string errorInfo;
- std::string functionName = getFunction().getName().str();
- std::string filename = "pathdag." + functionName + ".dot";
-
- DEBUG (dbgs() << "Writing '" << filename << "'...\n");
- raw_fd_ostream dotFile(filename.c_str(), errorInfo);
-
- if (!errorInfo.empty()) {
- errs() << "Error opening '" << filename.c_str() <<"' for writing!";
- errs() << "\n";
- return;
- }
-
- dotFile << "digraph " << functionName << " {\n";
-
- for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
- edge != end; edge++) {
- std::string sourceName = (*edge)->getSource()->getName();
- std::string targetName = (*edge)->getTarget()->getName();
-
- dotFile << "\t\"" << sourceName.c_str() << "\" -> \""
- << targetName.c_str() << "\" ";
-
- long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
-
- switch( (*edge)->getType() ) {
- case BallLarusEdge::NORMAL:
- dotFile << "[label=" << inc << "] [color=black];\n";
- break;
-
- case BallLarusEdge::BACKEDGE:
- dotFile << "[color=cyan];\n";
- break;
-
- case BallLarusEdge::BACKEDGE_PHONY:
- dotFile << "[label=" << inc
- << "] [color=blue];\n";
- break;
-
- case BallLarusEdge::SPLITEDGE:
- dotFile << "[color=violet];\n";
- break;
-
- case BallLarusEdge::SPLITEDGE_PHONY:
- dotFile << "[label=" << inc << "] [color=red];\n";
- break;
-
- case BallLarusEdge::CALLEDGE_PHONY:
- dotFile << "[label=" << inc << "] [color=green];\n";
- break;
- }
- }
-
- dotFile << "}\n";
-}
-
-// Allows subclasses to determine which type of Node is created.
-// Override this method to produce subclasses of BallLarusNode if
-// necessary. The destructor of BallLarusDag will call free on each pointer
-// created.
-BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) {
- return( new BLInstrumentationNode(BB) );
-}
-
-// Allows subclasses to determine which type of Edge is created.
-// Override this method to produce subclasses of BallLarusEdge if
-// necessary. The destructor of BallLarusDag will call free on each pointer
-// created.
-BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source,
- BallLarusNode* target, unsigned edgeNumber) {
-  // One can cast from BallLarusNode to BLInstrumentationNode since createNode
-  // is overridden to produce BLInstrumentationNode.
- return( new BLInstrumentationEdge((BLInstrumentationNode*)source,
- (BLInstrumentationNode*)target) );
-}
-
-// Gets the Value corresponding to the pathNumber register, constant,
-// or phinode. Used by the instrumentation code to remember path
-// number Values.
-Value* BLInstrumentationNode::getStartingPathNumber(){
- return(_startingPathNumber);
-}
-
-// Sets the Value of the pathNumber. Used by the instrumentation code.
-void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) {
- DEBUG(dbgs() << " SPN-" << getName() << " <-- " << (pathNumber ?
- pathNumber->getName() :
- "unused") << "\n");
- _startingPathNumber = pathNumber;
-}
-
-Value* BLInstrumentationNode::getEndingPathNumber(){
- return(_endingPathNumber);
-}
-
-void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) {
- DEBUG(dbgs() << " EPN-" << getName() << " <-- "
- << (pathNumber ? pathNumber->getName() : "unused") << "\n");
- _endingPathNumber = pathNumber;
-}
-
-// Get the PHINode Instruction for this node. Used by instrumentation
-// code.
-PHINode* BLInstrumentationNode::getPathPHI() {
- return(_pathPHI);
-}
-
-// Set the PHINode Instruction for this node. Used by instrumentation
-// code.
-void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) {
- _pathPHI = pathPHI;
-}
-
-// Removes the edge from the appropriate predecessor and successor
-// lists.
-void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) {
- if(edge == getExitRootEdge())
- DEBUG(dbgs() << " Removing exit->root edge\n");
-
- edge->getSource()->removeSuccEdge(edge);
- edge->getTarget()->removePredEdge(edge);
-}
-
-// Makes an edge part of the spanning tree.
-void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) {
- edge->setIsInSpanningTree(true);
- _treeEdges.push_back(edge);
-}
-
-// Pushes initialization and calls itself recursively.
-void BLInstrumentationDag::pushInitializationFromEdge(
- BLInstrumentationEdge* edge) {
- BallLarusNode* target;
-
- target = edge->getTarget();
- if( target->getNumberPredEdges() > 1 || target == getExit() ) {
- return;
- } else {
- for(BLEdgeIterator next = target->succBegin(),
- end = target->succEnd(); next != end; next++) {
- BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next;
-
- // Skip split edges
- if (intoEdge->getType() == BallLarusEdge::SPLITEDGE)
- continue;
-
- intoEdge->setIncrement(intoEdge->getIncrement() +
- edge->getIncrement());
- intoEdge->setIsInitialization(true);
- pushInitializationFromEdge(intoEdge);
- }
-
- edge->setIncrement(0);
- edge->setIsInitialization(false);
- }
-}
-
-// Pushes path counter increments up recursively.
-void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) {
- BallLarusNode* source;
-
- source = edge->getSource();
- if(source->getNumberSuccEdges() > 1 || source == getRoot()
- || edge->isInitialization()) {
- return;
- } else {
- for(BLEdgeIterator previous = source->predBegin(),
- end = source->predEnd(); previous != end; previous++) {
- BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous;
-
- // Skip split edges
- if (fromEdge->getType() == BallLarusEdge::SPLITEDGE)
- continue;
-
- fromEdge->setIncrement(fromEdge->getIncrement() +
- edge->getIncrement());
- fromEdge->setIsCounterIncrement(true);
- pushCountersFromEdge(fromEdge);
- }
-
- edge->setIncrement(0);
- edge->setIsCounterIncrement(false);
- }
-}
-
-// Depth first algorithm for determining the chord increments.
-void BLInstrumentationDag::calculateChordIncrementsDfs(long weight,
- BallLarusNode* v, BallLarusEdge* e) {
- BLInstrumentationEdge* f;
-
- for(BLEdgeIterator treeEdge = _treeEdges.begin(),
- end = _treeEdges.end(); treeEdge != end; treeEdge++) {
- f = (BLInstrumentationEdge*) *treeEdge;
- if(e != f && v == f->getTarget()) {
- calculateChordIncrementsDfs(
- calculateChordIncrementsDir(e,f)*(weight) +
- f->getWeight(), f->getSource(), f);
- }
- if(e != f && v == f->getSource()) {
- calculateChordIncrementsDfs(
- calculateChordIncrementsDir(e,f)*(weight) +
- f->getWeight(), f->getTarget(), f);
- }
- }
-
- for(BLEdgeIterator chordEdge = _chordEdges.begin(),
- end = _chordEdges.end(); chordEdge != end; chordEdge++) {
- f = (BLInstrumentationEdge*) *chordEdge;
- if(v == f->getSource() || v == f->getTarget()) {
- f->setIncrement(f->getIncrement() +
- calculateChordIncrementsDir(e,f)*weight);
- }
- }
-}
-
-// Determines the relative direction of two edges.
-int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e,
- BallLarusEdge* f) {
- if( e == NULL)
- return(1);
- else if(e->getSource() == f->getTarget()
- || e->getTarget() == f->getSource())
- return(1);
-
- return(-1);
-}
-
-// Creates an increment constant representing incr.
-ConstantInt* PathProfiler::createIncrementConstant(long incr,
- int bitsize) {
- return(ConstantInt::get(IntegerType::get(*Context, 32), incr));
-}
-
-// Creates an increment constant representing the value in
-// edge->getIncrement().
-ConstantInt* PathProfiler::createIncrementConstant(
- BLInstrumentationEdge* edge) {
- return(createIncrementConstant(edge->getIncrement(), 32));
-}
-
-// Finds the insertion point after pathNumber in block. PathNumber may
-// be NULL.
-BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
- pathNumber) {
- if(pathNumber == NULL || isa<ConstantInt>(pathNumber)
- || (((Instruction*)(pathNumber))->getParent()) != block) {
- return(block->getFirstInsertionPt());
- } else {
- Instruction* pathNumberInst = (Instruction*) (pathNumber);
- BasicBlock::iterator insertPoint;
- BasicBlock::iterator end = block->end();
-
- for(insertPoint = block->begin();
- insertPoint != end; insertPoint++) {
- Instruction* insertInst = &(*insertPoint);
-
- if(insertInst == pathNumberInst)
- return(++insertPoint);
- }
-
- return(insertPoint);
- }
-}
-
-// A PHINode is created in the node, and its values initialized to -1U.
-void PathProfiler::preparePHI(BLInstrumentationNode* node) {
- BasicBlock* block = node->getBlock();
- BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
- pred_iterator PB = pred_begin(node->getBlock()),
- PE = pred_end(node->getBlock());
- PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
- std::distance(PB, PE), "pathNumber",
- insertPoint );
- node->setPathPHI(phi);
- node->setStartingPathNumber(phi);
- node->setEndingPathNumber(phi);
-
- for(pred_iterator predIt = PB; predIt != PE; predIt++) {
- BasicBlock* pred = (*predIt);
-
- if(pred != NULL)
- phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
- }
-}
-
-// Inserts source's pathNumber Value* into target. Target may or may not
-// have multiple predecessors, and may or may not have its phiNode
-// initialized.
-void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
- BLInstrumentationNode* target) {
- if(target->getBlock() == NULL)
- return;
-
-
- if(target->getNumberPredEdges() <= 1) {
- assert(target->getStartingPathNumber() == NULL &&
- "Target already has path number");
- target->setStartingPathNumber(source->getEndingPathNumber());
- target->setEndingPathNumber(source->getEndingPathNumber());
- DEBUG(dbgs() << " Passing path number"
- << (source->getEndingPathNumber() ? "" : " (null)")
- << " value through.\n");
- } else {
- if(target->getPathPHI() == NULL) {
- DEBUG(dbgs() << " Initializing PHI node for block '"
- << target->getName() << "'\n");
- preparePHI(target);
- }
- pushValueIntoPHI(target, source);
- DEBUG(dbgs() << " Passing number value into PHI for block '"
- << target->getName() << "'\n");
- }
-}
-
-// Inserts source's pathNumber Value* into the appropriate slot of
-// target's phiNode.
-void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
- BLInstrumentationNode* source) {
- PHINode* phi = target->getPathPHI();
- assert(phi != NULL && " Tried to push value into node with PHI, but node"
- " actually had no PHI.");
- phi->removeIncomingValue(source->getBlock(), false);
- phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
-}
-
-// The Value* in node, oldVal, is updated with a Value* corresponding to
-// oldVal + addition.
-void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
- Value* addition, bool atBeginning) {
- BasicBlock* block = node->getBlock();
- assert(node->getStartingPathNumber() != NULL);
- assert(node->getEndingPathNumber() != NULL);
-
- BasicBlock::iterator insertPoint;
-
- if( atBeginning )
- insertPoint = block->getFirstInsertionPt();
- else
- insertPoint = block->getTerminator();
-
- DEBUG(errs() << " Creating addition instruction.\n");
- Value* newpn = BinaryOperator::Create(Instruction::Add,
- node->getStartingPathNumber(),
- addition, "pathNumber", insertPoint);
-
- node->setEndingPathNumber(newpn);
-
- if( atBeginning )
- node->setStartingPathNumber(newpn);
-}
-
-// Creates a counter increment in the given node. The Value* in node is
-// taken as the index into an array or hash table. The hash table access
-// is a call to the runtime.
-void PathProfiler::insertCounterIncrement(Value* incValue,
- BasicBlock::iterator insertPoint,
- BLInstrumentationDag* dag,
- bool increment) {
- // Counter increment for array
- if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) {
- // Get pointer to the array location
- std::vector<Value*> gepIndices(2);
- gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context));
- gepIndices[1] = incValue;
-
- GetElementPtrInst* pcPointer =
- GetElementPtrInst::Create(dag->getCounterArray(), gepIndices,
- "counterInc", insertPoint);
-
- // Load from the array - call it oldPC
- LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint);
-
- // Test to see whether adding 1 will overflow the counter
- ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc,
- createIncrementConstant(0xffffffff, 32),
- "isMax");
-
- // Select increment for the path counter based on overflow
- SelectInst* inc =
- SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32),
- createIncrementConstant(0,32),
- "pathInc", insertPoint);
-
- // newPc = oldPc + inc
- BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add,
- oldPc, inc, "newPC",
- insertPoint);
-
- // Store back in to the array
- new StoreInst(newPc, pcPointer, insertPoint);
- } else { // Counter increment for hash
- std::vector<Value*> args(2);
- args[0] = ConstantInt::get(Type::getInt32Ty(*Context),
- currentFunctionNumber);
- args[1] = incValue;
-
- CallInst::Create(
- increment ? llvmIncrementHashFunction : llvmDecrementHashFunction,
- args, "", insertPoint);
- }
-}
-
-// Inserts instrumentation for the given edge
-//
-// Pre: The edge's source node has its pathNumber set if the edge has a
-// non-zero path number increment.
-//
-// Post: Edge's target node has a pathNumber set to the path number Value
-// corresponding to the value of the path register after edge's
-// execution.
-//
-// FIXME: This should be reworked so it's not recursive.
-void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
- BLInstrumentationDag* dag) {
- // Mark the edge as instrumented
- edge->setHasInstrumentation(true);
- DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n");
-
- // create a new node for this edge's instrumentation
- splitCritical(edge, dag);
-
- BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource();
- BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget();
- BLInstrumentationNode* instrumentNode;
- BLInstrumentationNode* nextSourceNode;
-
- bool atBeginning = false;
-
- // Source node has only 1 successor so any information can be simply
- // inserted in to it without splitting
- if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) {
- DEBUG(dbgs() << " Potential instructions to be placed in: "
- << sourceNode->getName() << " (at end)\n");
- instrumentNode = sourceNode;
- nextSourceNode = targetNode; // ... since we never made any new nodes
- }
-
- // The target node only has one predecessor, so we can safely insert edge
- // instrumentation into it. If there was splitting, it must have been
- // successful.
- else if( targetNode->getNumberPredEdges() == 1 ) {
- DEBUG(dbgs() << " Potential instructions to be placed in: "
- << targetNode->getName() << " (at beginning)\n");
- pushValueIntoNode(sourceNode, targetNode);
- instrumentNode = targetNode;
- nextSourceNode = NULL; // ... otherwise we'll just keep splitting
- atBeginning = true;
- }
-
- // Somehow, splitting must have failed.
- else {
- errs() << "Instrumenting could not split a critical edge.\n";
- DEBUG(dbgs() << " Couldn't split edge " << (*edge) << ".\n");
- return;
- }
-
- // Insert instrumentation if this is a back or split edge
- if( edge->getType() == BallLarusEdge::BACKEDGE ||
- edge->getType() == BallLarusEdge::SPLITEDGE ) {
- BLInstrumentationEdge* top =
- (BLInstrumentationEdge*) edge->getPhonyRoot();
- BLInstrumentationEdge* bottom =
- (BLInstrumentationEdge*) edge->getPhonyExit();
-
- assert( top->isInitialization() && " Top phony edge did not"
- " contain a path number initialization.");
- assert( bottom->isCounterIncrement() && " Bottom phony edge"
- " did not contain a path counter increment.");
-
- // split edge has yet to be initialized
- if( !instrumentNode->getEndingPathNumber() ) {
- instrumentNode->setStartingPathNumber(createIncrementConstant(0,32));
- instrumentNode->setEndingPathNumber(createIncrementConstant(0,32));
- }
-
- BasicBlock::iterator insertPoint = atBeginning ?
- instrumentNode->getBlock()->getFirstInsertionPt() :
- instrumentNode->getBlock()->getTerminator();
-
- // add information from the bottom edge, if it exists
- if( bottom->getIncrement() ) {
- Value* newpn =
- BinaryOperator::Create(Instruction::Add,
- instrumentNode->getStartingPathNumber(),
- createIncrementConstant(bottom),
- "pathNumber", insertPoint);
- instrumentNode->setEndingPathNumber(newpn);
- }
-
- insertCounterIncrement(instrumentNode->getEndingPathNumber(),
- insertPoint, dag);
-
- if( atBeginning )
- instrumentNode->setStartingPathNumber(createIncrementConstant(top));
-
- instrumentNode->setEndingPathNumber(createIncrementConstant(top));
-
- // Check for path counter increments
- if( top->isCounterIncrement() ) {
- insertCounterIncrement(instrumentNode->getEndingPathNumber(),
- instrumentNode->getBlock()->getTerminator(),dag);
- instrumentNode->setEndingPathNumber(0);
- }
- }
-
- // Insert instrumentation if this is a normal edge
- else {
- BasicBlock::iterator insertPoint = atBeginning ?
- instrumentNode->getBlock()->getFirstInsertionPt() :
- instrumentNode->getBlock()->getTerminator();
-
- if( edge->isInitialization() ) { // initialize path number
- instrumentNode->setEndingPathNumber(createIncrementConstant(edge));
- } else if( edge->getIncrement() ) {// increment path number
- Value* newpn =
- BinaryOperator::Create(Instruction::Add,
- instrumentNode->getStartingPathNumber(),
- createIncrementConstant(edge),
- "pathNumber", insertPoint);
- instrumentNode->setEndingPathNumber(newpn);
-
- if( atBeginning )
- instrumentNode->setStartingPathNumber(newpn);
- }
-
- // Check for path counter increments
- if( edge->isCounterIncrement() ) {
- insertCounterIncrement(instrumentNode->getEndingPathNumber(),
- insertPoint, dag);
- instrumentNode->setEndingPathNumber(0);
- }
- }
-
- // Push it along
- if (nextSourceNode && instrumentNode->getEndingPathNumber())
- pushValueIntoNode(instrumentNode, nextSourceNode);
-
- // Add all the successors
- for( BLEdgeIterator next = targetNode->succBegin(),
- end = targetNode->succEnd(); next != end; next++ ) {
- // So long as it is un-instrumented, add it to the list
- if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() )
- insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag);
- else
- DEBUG(dbgs() << " Edge " << *(BLInstrumentationEdge*)(*next)
- << " already instrumented.\n");
- }
-}
-
-// Inserts instrumentation according to the marked edges in dag. Phony edges
-// must be unlinked from the DAG, but accessible from the backedges. Dag
-// must have initializations, path number increments, and counter increments
-// present.
-//
-// Counter storage is created here.
-void PathProfiler::insertInstrumentation(
- BLInstrumentationDag& dag, Module &M) {
-
- BLInstrumentationEdge* exitRootEdge =
- (BLInstrumentationEdge*) dag.getExitRootEdge();
- insertInstrumentationStartingAt(exitRootEdge, &dag);
-
- // Iterate through each call edge and apply the appropriate hash increment
- // and decrement functions
- BLEdgeVector callEdges = dag.getCallPhonyEdges();
- for( BLEdgeIterator edge = callEdges.begin(),
- end = callEdges.end(); edge != end; edge++ ) {
- BLInstrumentationNode* node =
- (BLInstrumentationNode*)(*edge)->getSource();
- BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt();
-
- // Find the first function call
- while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call )
- insertPoint++;
-
- DEBUG(dbgs() << "\nInstrumenting method call block '"
- << node->getBlock()->getName() << "'\n");
- DEBUG(dbgs() << " Path number initialized: "
- << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n");
-
- Value* newpn;
- if( node->getStartingPathNumber() ) {
- long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
- if ( inc )
- newpn = BinaryOperator::Create(Instruction::Add,
- node->getStartingPathNumber(),
- createIncrementConstant(inc,32),
- "pathNumber", insertPoint);
- else
- newpn = node->getStartingPathNumber();
- } else {
- newpn = (Value*)createIncrementConstant(
- ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32);
- }
-
- insertCounterIncrement(newpn, insertPoint, &dag);
- insertCounterIncrement(newpn, node->getBlock()->getTerminator(),
- &dag, false);
- }
-}
-
-// Instruments a single function; called once per function from runOnModule.
-void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
- Function &F, Module &M) {
- // Build DAG from CFG
- BLInstrumentationDag dag = BLInstrumentationDag(F);
- dag.init();
-
- // give each path a unique integer value
- dag.calculatePathNumbers();
-
- // modify path increments to increase the efficiency
- // of instrumentation
- dag.calculateSpanningTree();
- dag.calculateChordIncrements();
- dag.pushInitialization();
- dag.pushCounters();
- dag.unlinkPhony();
-
- // potentially generate .dot graph for the dag
- if (DotPathDag)
- dag.generateDotGraph ();
-
- // Should we store the information in an array or hash
- if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) {
- Type* t = ArrayType::get(Type::getInt32Ty(*Context),
- dag.getNumberOfPaths());
-
- dag.setCounterArray(new GlobalVariable(M, t, false,
- GlobalValue::InternalLinkage,
- Constant::getNullValue(t), ""));
- }
-
- insertInstrumentation(dag, M);
-
- // Add to global function reference table
- unsigned type;
- Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);
-
- if( dag.getNumberOfPaths() <= HASH_THRESHHOLD )
- type = ProfilingArray;
- else
- type = ProfilingHash;
-
- std::vector<Constant*> entryArray(3);
- entryArray[0] = createIncrementConstant(type,32);
- entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32);
- entryArray[2] = dag.getCounterArray() ?
- ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) :
- Constant::getNullValue(voidPtr);
-
- StructType* at = ftEntryTypeBuilder::get(*Context);
- ConstantStruct* functionEntry =
- (ConstantStruct*)ConstantStruct::get(at, entryArray);
- ftInit.push_back(functionEntry);
-}
-
-// Output the bitcode if we want to observe instrumentation changes
-#define PRINT_MODULE dbgs() << \
- "\n\n============= MODULE BEGIN ===============\n" << M << \
- "\n============== MODULE END ================\n"
-
-bool PathProfiler::runOnModule(Module &M) {
- Context = &M.getContext();
-
- DEBUG(dbgs()
- << "****************************************\n"
- << "****************************************\n"
- << "** **\n"
- << "** PATH PROFILING INSTRUMENTATION **\n"
- << "** **\n"
- << "****************************************\n"
- << "****************************************\n");
-
- // No main, no instrumentation!
- Function *Main = M.getFunction("main");
-
-  // Using Fortran? ... this kind of works
- if (!Main)
- Main = M.getFunction("MAIN__");
-
- if (!Main) {
- errs() << "WARNING: cannot insert path profiling into a module"
- << " with no main function!\n";
- return false;
- }
-
- llvmIncrementHashFunction = M.getOrInsertFunction(
- "llvm_increment_path_count",
- Type::getVoidTy(*Context), // return type
- Type::getInt32Ty(*Context), // function number
- Type::getInt32Ty(*Context), // path number
- NULL );
-
- llvmDecrementHashFunction = M.getOrInsertFunction(
- "llvm_decrement_path_count",
- Type::getVoidTy(*Context), // return type
- Type::getInt32Ty(*Context), // function number
- Type::getInt32Ty(*Context), // path number
- NULL );
-
- std::vector<Constant*> ftInit;
- unsigned functionNumber = 0;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
- if (F->isDeclaration())
- continue;
-
- DEBUG(dbgs() << "Function: " << F->getName() << "\n");
- functionNumber++;
-
- // set function number
- currentFunctionNumber = functionNumber;
- runOnFunction(ftInit, *F, M);
- }
-
- Type *t = ftEntryTypeBuilder::get(*Context);
- ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
- Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit);
-
- DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n");
-
- GlobalVariable* functionTable =
- new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage,
- ftInitConstant, "functionPathTable");
- Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
- InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable,
- PointerType::getUnqual(eltType));
-
- DEBUG(PRINT_MODULE);
-
- return true;
-}
-
-// If this edge is a critical edge, insert a node on this edge.
-// This edge becomes the first edge, and a new BallLarusEdge is created.
-// Returns true if the edge was split.
-bool PathProfiler::splitCritical(BLInstrumentationEdge* edge,
- BLInstrumentationDag* dag) {
- unsigned succNum = edge->getSuccessorNumber();
- BallLarusNode* sourceNode = edge->getSource();
- BallLarusNode* targetNode = edge->getTarget();
- BasicBlock* sourceBlock = sourceNode->getBlock();
- BasicBlock* targetBlock = targetNode->getBlock();
-
- if(sourceBlock == NULL || targetBlock == NULL
- || sourceNode->getNumberSuccEdges() <= 1
- || targetNode->getNumberPredEdges() == 1 ) {
- return(false);
- }
-
- TerminatorInst* terminator = sourceBlock->getTerminator();
-
- if( SplitCriticalEdge(terminator, succNum, this, false)) {
- BasicBlock* newBlock = terminator->getSuccessor(succNum);
- dag->splitUpdate(edge, newBlock);
- return(true);
- } else
- return(false);
-}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
deleted file mode 100644
index 4b3de6d..0000000
--- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a few helper functions which are used by profile
-// instrumentation code to instrument the code. This allows the profiler pass
-// to worry about *what* to insert, and these functions take care of *how* to do
-// it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProfilingUtils.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-
-void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
- GlobalValue *Array,
- PointerType *arrayType) {
- LLVMContext &Context = MainFn->getContext();
- Type *ArgVTy =
- PointerType::getUnqual(Type::getInt8PtrTy(Context));
- PointerType *UIntPtr = arrayType ? arrayType :
- Type::getInt32PtrTy(Context);
- Module &M = *MainFn->getParent();
- Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context),
- Type::getInt32Ty(Context),
- ArgVTy, UIntPtr,
- Type::getInt32Ty(Context),
- (Type *)0);
-
- // This could force argc and argv into programs that wouldn't otherwise have
- // them, but instead we just pass null values in.
- std::vector<Value*> Args(4);
- Args[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Args[1] = Constant::getNullValue(ArgVTy);
-
- // Skip over any allocas in the entry block.
- BasicBlock *Entry = MainFn->begin();
- BasicBlock::iterator InsertPos = Entry->begin();
- while (isa<AllocaInst>(InsertPos)) ++InsertPos;
-
- std::vector<Constant*> GEPIndices(2,
- Constant::getNullValue(Type::getInt32Ty(Context)));
- unsigned NumElements = 0;
- if (Array) {
- Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices);
- NumElements =
- cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
- } else {
- // If this profiling instrumentation doesn't have a constant array, just
- // pass null.
- Args[2] = ConstantPointerNull::get(UIntPtr);
- }
- Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements);
-
- CallInst *InitCall = CallInst::Create(InitFn, Args, "newargc", InsertPos);
-
- // If argc or argv are not available in main, just pass null values in.
- Function::arg_iterator AI;
- switch (MainFn->arg_size()) {
- default:
- case 2:
- AI = MainFn->arg_begin(); ++AI;
- if (AI->getType() != ArgVTy) {
- Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy,
- false);
- InitCall->setArgOperand(1,
- CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
- } else {
- InitCall->setArgOperand(1, AI);
- }
- /* FALL THROUGH */
-
- case 1:
- AI = MainFn->arg_begin();
- // If the program looked at argc, have it look at the return value of the
- // init call instead.
- if (!AI->getType()->isIntegerTy(32)) {
- Instruction::CastOps opcode;
- if (!AI->use_empty()) {
- opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
- AI->replaceAllUsesWith(
- CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
- }
- opcode = CastInst::getCastOpcode(AI, true,
- Type::getInt32Ty(Context), true);
- InitCall->setArgOperand(0,
- CastInst::Create(opcode, AI, Type::getInt32Ty(Context),
- "argc.cast", InitCall));
- } else {
- AI->replaceAllUsesWith(InitCall);
- InitCall->setArgOperand(0, AI);
- }
-
- case 0: break;
- }
-}
-
-void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
- GlobalValue *CounterArray, bool beginning) {
- // Insert the increment after any alloca or PHI instructions...
- BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() :
- BB->getTerminator();
- while (isa<AllocaInst>(InsertPos))
- ++InsertPos;
-
- LLVMContext &Context = BB->getContext();
-
- // Create the getelementptr constant expression
- std::vector<Constant*> Indices(2);
- Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
- Constant *ElementPtr =
- ConstantExpr::getGetElementPtr(CounterArray, Indices);
-
- // Load, increment and store the value back.
- Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
- Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
- ConstantInt::get(Type::getInt32Ty(Context), 1),
- "NewFuncCounter", InsertPos);
- new StoreInst(NewVal, ElementPtr, InsertPos);
-}
-
-void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) {
- // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those
- // types.
- Type *GlobalDtorElems[2] = {
- Type::getInt32Ty(Mod->getContext()),
- FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo()
- };
- StructType *GlobalDtorElemTy =
- StructType::get(Mod->getContext(), GlobalDtorElems, false);
-
- // Construct the new element we'll be adding.
- Constant *Elem[2] = {
- ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535),
- ConstantExpr::getBitCast(Callee, GlobalDtorElems[1])
- };
-
- // If llvm.global_dtors exists, make a copy of the things in its list and
- // delete it, to replace it with one that has a larger array type.
- std::vector<Constant *> dtors;
- if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) {
- if (ConstantArray *InitList =
- dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) {
- for (unsigned i = 0, e = InitList->getType()->getNumElements();
- i != e; ++i)
- dtors.push_back(cast<Constant>(InitList->getOperand(i)));
- }
- GlobalDtors->eraseFromParent();
- }
-
- // Build up llvm.global_dtors with our new item in it.
- GlobalVariable *GlobalDtors = new GlobalVariable(
- *Mod, ArrayType::get(GlobalDtorElemTy, 1), false,
- GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors");
-
- dtors.push_back(ConstantStruct::get(GlobalDtorElemTy, Elem));
- GlobalDtors->setInitializer(ConstantArray::get(
- cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors));
-}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h
deleted file mode 100644
index 09b2217..0000000
--- a/lib/Transforms/Instrumentation/ProfilingUtils.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a few helper functions which are used by profile
-// instrumentation code to instrument the code. This allows the profiler pass
-// to worry about *what* to insert, and these functions take care of *how* to do
-// it.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PROFILINGUTILS_H
-#define PROFILINGUTILS_H
-
-namespace llvm {
- class BasicBlock;
- class Function;
- class GlobalValue;
- class Module;
- class PointerType;
-
- void InsertProfilingInitCall(Function *MainFn, const char *FnName,
- GlobalValue *Arr = 0,
- PointerType *arrayType = 0);
- void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
- GlobalValue *CounterArray,
- bool beginning = true);
- void InsertProfilingShutdownCall(Function *Callee, Module *Mod);
-}
-
-#endif
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index cc971a3..89fb746 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -227,7 +227,7 @@ bool ThreadSanitizer::doInitialization(Module &M) {
TD = getAnalysisIfAvailable<DataLayout>();
if (!TD)
return false;
- BL.reset(new SpecialCaseList(BlacklistFile));
+ BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
// Always insert a call to __tsan_init into the module's CTORs.
IRBuilder<> IRB(M.getContext());
@@ -240,12 +240,8 @@ bool ThreadSanitizer::doInitialization(Module &M) {
}
static bool isVtableAccess(Instruction *I) {
- if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) {
- if (Tag->getNumOperands() < 1) return false;
- if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
- if (Tag1->getString() == "vtable pointer") return true;
- }
- }
+ if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+ return Tag->isTBAAVtableAccess();
return false;
}
@@ -362,7 +358,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
// (e.g. variables that do not escape, etc).
// Instrument memory accesses.
- if (ClInstrumentMemoryAccesses)
+ if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread))
for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
}
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 6f94a7c..2976df6 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -176,91 +176,6 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
return 0;
}
-/// \brief Test whether the given retainable object pointer escapes.
-///
-/// This differs from regular escape analysis in that a use as an
-/// argument to a call is not considered an escape.
-///
-static bool DoesRetainableObjPtrEscape(const User *Ptr) {
- DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n");
-
- // Walk the def-use chains.
- SmallVector<const Value *, 4> Worklist;
- Worklist.push_back(Ptr);
- // If Ptr has any operands add them as well.
- for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
- ++I) {
- Worklist.push_back(*I);
- }
-
- // Ensure we do not visit any value twice.
- SmallPtrSet<const Value *, 8> VisitedSet;
-
- do {
- const Value *V = Worklist.pop_back_val();
-
- DEBUG(dbgs() << "Visiting: " << *V << "\n");
-
- for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
- UI != UE; ++UI) {
- const User *UUser = *UI;
-
- DEBUG(dbgs() << "User: " << *UUser << "\n");
-
- // Special - Use by a call (callee or argument) is not considered
- // to be an escape.
- switch (GetBasicInstructionClass(UUser)) {
- case IC_StoreWeak:
- case IC_InitWeak:
- case IC_StoreStrong:
- case IC_Autorelease:
- case IC_AutoreleaseRV: {
- DEBUG(dbgs() << "User copies pointer arguments. Pointer Escapes!\n");
- // These special functions make copies of their pointer arguments.
- return true;
- }
- case IC_IntrinsicUser:
- // Use by the use intrinsic is not an escape.
- continue;
- case IC_User:
- case IC_None:
- // Use by an instruction which copies the value is an escape if the
- // result is an escape.
- if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
- isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
-
- if (VisitedSet.insert(UUser)) {
- DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes."
- " Adding to list.\n");
- Worklist.push_back(UUser);
- } else {
- DEBUG(dbgs() << "Already visited node.\n");
- }
- continue;
- }
- // Use by a load is not an escape.
- if (isa<LoadInst>(UUser))
- continue;
- // Use by a store is not an escape if the use is the address.
- if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
- if (V != SI->getValueOperand())
- continue;
- break;
- default:
- // Regular calls and other stuff are not considered escapes.
- continue;
- }
- // Otherwise, conservatively assume an escape.
- DEBUG(dbgs() << "Assuming ptr escapes.\n");
- return true;
- }
- } while (!Worklist.empty());
-
- // No escapes found.
- DEBUG(dbgs() << "Ptr does not escape.\n");
- return false;
-}
-
/// This is a wrapper around getUnderlyingObjCPtr along the lines of
/// GetUnderlyingObjects except that it returns early when it sees the first
/// alloca.
@@ -517,7 +432,7 @@ namespace {
bool Partial;
/// The current position in the sequence.
- Sequence Seq : 8;
+ unsigned char Seq : 8;
/// Unidirectional information about the current sequence.
RRInfo RRI;
@@ -583,7 +498,7 @@ namespace {
}
Sequence GetSeq() const {
- return Seq;
+ return static_cast<Sequence>(Seq);
}
void ClearSequenceProgress() {
@@ -623,7 +538,8 @@ namespace {
void
PtrState::Merge(const PtrState &Other, bool TopDown) {
- Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+ Seq = MergeSeqs(static_cast<Sequence>(Seq), static_cast<Sequence>(Other.Seq),
+ TopDown);
KnownPositiveRefCount &= Other.KnownPositiveRefCount;
// If we're not in a sequence (anymore), drop all associated state.
@@ -674,7 +590,9 @@ namespace {
SmallVector<BasicBlock *, 2> Succs;
public:
- BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+ static const unsigned OverflowOccurredValue;
+
+ BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
typedef MapTy::iterator ptr_iterator;
typedef MapTy::const_iterator ptr_const_iterator;
@@ -745,27 +663,31 @@ namespace {
   /// Returns true if overflow occurred. Returns false if overflow did not
/// occur.
bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
- assert(TopDownPathCount != 0);
- assert(BottomUpPathCount != 0);
+ if (TopDownPathCount == OverflowOccurredValue ||
+ BottomUpPathCount == OverflowOccurredValue)
+ return true;
unsigned long long Product =
(unsigned long long)TopDownPathCount*BottomUpPathCount;
- PathCount = Product;
- // Overflow occured if any of the upper bits of Product are set.
- return Product >> 32;
+    // Overflow occurred if any of the upper bits of Product are set or if all
+    // the lower bits of Product are set.
+ return (Product >> 32) ||
+ ((PathCount = Product) == OverflowOccurredValue);
}
// Specialized CFG utilities.
typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
- edge_iterator pred_begin() { return Preds.begin(); }
- edge_iterator pred_end() { return Preds.end(); }
- edge_iterator succ_begin() { return Succs.begin(); }
- edge_iterator succ_end() { return Succs.end(); }
+ edge_iterator pred_begin() const { return Preds.begin(); }
+ edge_iterator pred_end() const { return Preds.end(); }
+ edge_iterator succ_begin() const { return Succs.begin(); }
+ edge_iterator succ_end() const { return Succs.end(); }
void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
bool isExit() const { return Succs.empty(); }
};
+
+ const unsigned BBState::OverflowOccurredValue = 0xffffffff;
}
void BBState::InitFromPred(const BBState &Other) {
@@ -781,13 +703,25 @@ void BBState::InitFromSucc(const BBState &Other) {
/// The top-down traversal uses this to merge information about predecessors to
/// form the initial state for a new block.
void BBState::MergePred(const BBState &Other) {
+ if (TopDownPathCount == OverflowOccurredValue)
+ return;
+
// Other.TopDownPathCount can be 0, in which case it is either dead or a
// loop backedge. Loop backedges are special.
TopDownPathCount += Other.TopDownPathCount;
+  // In order to be consistent, we clear the top-down pointers when adding
+  // Other.TopDownPathCount causes TopDownPathCount to become
+  // OverflowOccurredValue, even though "true" overflow has not occurred.
+ if (TopDownPathCount == OverflowOccurredValue) {
+ clearTopDownPointers();
+ return;
+ }
+
// Check for overflow. If we have overflow, fall back to conservative
// behavior.
if (TopDownPathCount < Other.TopDownPathCount) {
+ TopDownPathCount = OverflowOccurredValue;
clearTopDownPointers();
return;
}
@@ -813,13 +747,25 @@ void BBState::MergePred(const BBState &Other) {
/// The bottom-up traversal uses this to merge information about successors to
/// form the initial state for a new block.
void BBState::MergeSucc(const BBState &Other) {
+ if (BottomUpPathCount == OverflowOccurredValue)
+ return;
+
// Other.BottomUpPathCount can be 0, in which case it is either dead or a
// loop backedge. Loop backedges are special.
BottomUpPathCount += Other.BottomUpPathCount;
+  // In order to be consistent, we clear the bottom-up pointers when adding
+  // Other.BottomUpPathCount causes BottomUpPathCount to become
+  // OverflowOccurredValue, even though "true" overflow has not occurred.
+ if (BottomUpPathCount == OverflowOccurredValue) {
+ clearBottomUpPointers();
+ return;
+ }
+
// Check for overflow. If we have overflow, fall back to conservative
// behavior.
if (BottomUpPathCount < Other.BottomUpPathCount) {
+ BottomUpPathCount = OverflowOccurredValue;
clearBottomUpPointers();
return;
}
@@ -1158,13 +1104,9 @@ namespace {
unsigned ARCAnnotationProvenanceSourceMDKind;
 #endif // ARC_ANNOTATIONS
- bool IsRetainBlockOptimizable(const Instruction *Inst);
-
bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
InstructionClass &Class);
- bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
- InstructionClass &Class);
void OptimizeIndividualCalls(Function &F);
void CheckForCFGHazards(const BasicBlock *BB,
@@ -1253,22 +1195,6 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
}
-bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
- // Without the magic metadata tag, we have to assume this might be an
- // objc_retainBlock call inserted to convert a block pointer to an id,
- // in which case it really is needed.
- if (!Inst->getMetadata(CopyOnEscapeMDKind))
- return false;
-
- // If the pointer "escapes" (not including being used in a call),
- // the copy may be needed.
- if (DoesRetainableObjPtrEscape(Inst))
- return false;
-
- // Otherwise, it's not needed.
- return true;
-}
-
/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
/// not a return value. Or, if it can be paired with an
/// objc_autoreleaseReturnValue, delete the pair and return true.
@@ -1369,41 +1295,6 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
}
-// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
-// calls.
-//
-// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and
-// does not escape (following the rules of block escaping), strength reduce the
-// objc_retainBlock to an objc_retain.
-//
-// TODO: If an objc_retainBlock call is dominated by a previous
-// objc_retainBlock call, strength reduce the objc_retainBlock to an
-// objc_retain.
-bool
-ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
- InstructionClass &Class) {
- assert(GetBasicInstructionClass(Inst) == Class);
- assert(IC_RetainBlock == Class);
-
- // If we can not optimize Inst, return false.
- if (!IsRetainBlockOptimizable(Inst))
- return false;
-
- Changed = true;
- ++NumPeeps;
-
- DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n");
- DEBUG(dbgs() << "Old: " << *Inst << "\n");
- CallInst *RetainBlock = cast<CallInst>(Inst);
- Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
- RetainBlock->setCalledFunction(NewDecl);
- // Remove copy_on_escape metadata.
- RetainBlock->setMetadata(CopyOnEscapeMDKind, 0);
- Class = IC_Retain;
- DEBUG(dbgs() << "New: " << *Inst << "\n");
- return true;
-}
-
/// Visit each call, one at a time, and make simplifications without doing any
/// additional analysis.
void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
@@ -1480,11 +1371,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
}
break;
}
- case IC_RetainBlock:
- // If we strength reduce an objc_retainBlock to an objc_retain, continue
- // onto the objc_retain peephole optimizations. Otherwise break.
- OptimizeRetainBlockCall(F, Inst, Class);
- break;
case IC_RetainRV:
if (OptimizeRetainRVCall(F, Inst))
continue;
@@ -2520,15 +2406,26 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
if (Jt == Releases.end())
return false;
const RRInfo &NewRetainReleaseRRI = Jt->second;
- assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+
+ // If the release does not have a reference to the retain as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewRetainReleaseRRI.Calls.count(NewRetain))
+ return false;
+
if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
// If we overflow when we compute the path count, don't remove/move
// anything.
const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
- unsigned PathCount;
+ unsigned PathCount = BBState::OverflowOccurredValue;
if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
OldDelta -= PathCount;
// Merge the ReleaseMetadata and IsTailCallRelease values.
@@ -2558,8 +2455,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
// If we overflow when we compute the path count, don't
// remove/move anything.
const BBState &RIPBBState = BBStates[RIP->getParent()];
+ PathCount = BBState::OverflowOccurredValue;
if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
NewDelta -= PathCount;
}
}
@@ -2589,15 +2490,25 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
if (Jt == Retains.end())
return false;
const RRInfo &NewReleaseRetainRRI = Jt->second;
- assert(NewReleaseRetainRRI.Calls.count(NewRelease));
- if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+ // If the retain does not have a reference to the release as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewReleaseRetainRRI.Calls.count(NewRelease))
+ return false;
+
+ if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
// If we overflow when we compute the path count, don't remove/move
// anything.
const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
- unsigned PathCount;
+ unsigned PathCount = BBState::OverflowOccurredValue;
if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
OldDelta += PathCount;
OldCount += PathCount;
@@ -2612,8 +2523,13 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
// If we overflow when we compute the path count, don't
// remove/move anything.
const BBState &RIPBBState = BBStates[RIP->getParent()];
+
+ PathCount = BBState::OverflowOccurredValue;
if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
NewDelta += PathCount;
NewCount += PathCount;
}
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
deleted file mode 100644
index e755008..0000000
--- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a very simple profile guided basic block placement
-// algorithm. The idea is to put frequently executed blocks together at the
-// start of the function, and hopefully increase the number of fall-through
-// conditional branches. If there is no profile information for a particular
-// function, this pass basically orders blocks in depth-first order
-//
-// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
-// Positioning" by Pettis and Hansen, except that it uses basic block counts
-// instead of edge counts. This should be improved in many ways, but is very
-// simple for now.
-//
-// Basically we "place" the entry block, then loop over all successors in a DFO,
-// placing the most frequently executed successor until we run out of blocks. I
-// told you this was _extremely_ simplistic. :) This is also much slower than it
-// could be. When it becomes important, this pass will be rewritten to use a
-// better algorithm, and then we can worry about efficiency.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "block-placement"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumMoved, "Number of basic blocks moved");
-
-namespace {
- struct BlockPlacement : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BlockPlacement() : FunctionPass(ID) {
- initializeBlockPlacementPass(*PassRegistry::getPassRegistry());
- }
-
- virtual bool runOnFunction(Function &F);
-
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<ProfileInfo>();
- //AU.addPreserved<ProfileInfo>(); // Does this work?
- }
- private:
- /// PI - The profile information that is guiding us.
- ///
- ProfileInfo *PI;
-
- /// NumMovedBlocks - Every time we move a block, increment this counter.
- ///
- unsigned NumMovedBlocks;
-
- /// PlacedBlocks - Every time we place a block, remember it so we don't get
- /// into infinite loops.
- std::set<BasicBlock*> PlacedBlocks;
-
- /// InsertPos - This an iterator to the next place we want to insert a
- /// block.
- Function::iterator InsertPos;
-
- /// PlaceBlocks - Recursively place the specified blocks and any unplaced
- /// successors.
- void PlaceBlocks(BasicBlock *BB);
- };
-}
-
-char BlockPlacement::ID = 0;
-INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement",
- "Profile Guided Basic Block Placement", false, false)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(BlockPlacement, "block-placement",
- "Profile Guided Basic Block Placement", false, false)
-
-FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
-
-bool BlockPlacement::runOnFunction(Function &F) {
- PI = &getAnalysis<ProfileInfo>();
-
- NumMovedBlocks = 0;
- InsertPos = F.begin();
-
- // Recursively place all blocks.
- PlaceBlocks(F.begin());
-
- PlacedBlocks.clear();
- NumMoved += NumMovedBlocks;
- return NumMovedBlocks != 0;
-}
-
-
-/// PlaceBlocks - Recursively place the specified blocks and any unplaced
-/// successors.
-void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
- assert(!PlacedBlocks.count(BB) && "Already placed this block!");
- PlacedBlocks.insert(BB);
-
- // Place the specified block.
- if (&*InsertPos != BB) {
- // Use splice to move the block into the right place. This avoids having to
-    // remove the block from the function then re-add it, which causes a bunch of
- // symbol table traffic that is entirely pointless.
- Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
- Blocks.splice(InsertPos, Blocks, BB);
-
- ++NumMovedBlocks;
- } else {
- // This block is already in the right place, we don't have to do anything.
- ++InsertPos;
- }
-
- // Keep placing successors until we run out of ones to place. Note that this
- // loop is very inefficient (N^2) for blocks with many successors, like switch
- // statements. FIXME!
- while (1) {
- // Okay, now place any unplaced successors.
- succ_iterator SI = succ_begin(BB), E = succ_end(BB);
-
- // Scan for the first unplaced successor.
- for (; SI != E && PlacedBlocks.count(*SI); ++SI)
- /*empty*/;
- if (SI == E) return; // No more successors to place.
-
- double MaxExecutionCount = PI->getExecutionCount(*SI);
- BasicBlock *MaxSuccessor = *SI;
-
- // Scan for more frequently executed successors
- for (; SI != E; ++SI)
- if (!PlacedBlocks.count(*SI)) {
- double Count = PI->getExecutionCount(*SI);
- if (Count > MaxExecutionCount ||
- // Prefer to not disturb the code.
- (Count == MaxExecutionCount && *SI == &*InsertPos)) {
- MaxExecutionCount = Count;
- MaxSuccessor = *SI;
- }
- }
-
- // Now that we picked the maximally executed successor, place it.
- PlaceBlocks(MaxSuccessor);
- }
-}
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index f5d1db1..626c810 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,6 +1,5 @@
add_llvm_library(LLVMScalarOpts
ADCE.cpp
- BasicBlockPlacement.cpp
CodeGenPrepare.cpp
ConstantProp.cpp
CorrelatedValuePropagation.cpp
@@ -17,12 +16,15 @@ add_llvm_library(LLVMScalarOpts
LoopInstSimplify.cpp
LoopRotation.cpp
LoopStrengthReduce.cpp
+ LoopRerollPass.cpp
LoopUnrollPass.cpp
LoopUnswitch.cpp
LowerAtomic.cpp
MemCpyOptimizer.cpp
+ PartiallyInlineLibCalls.cpp
Reassociate.cpp
Reg2Mem.cpp
+ SampleProfile.cpp
SCCP.cpp
SROA.cpp
Scalar.cpp
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 44804a2..007e9b7 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -22,7 +22,6 @@
#include "llvm/Analysis/DominatorInternals.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -80,7 +79,6 @@ namespace {
const TargetLowering *TLI;
const TargetLibraryInfo *TLInfo;
DominatorTree *DT;
- ProfileInfo *PFI;
/// CurInstIterator - As we scan instructions optimizing them, this is the
/// next instruction to optimize. Xforms that can invalidate this should
@@ -111,7 +109,6 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<DominatorTree>();
- AU.addPreserved<ProfileInfo>();
AU.addRequired<TargetLibraryInfo>();
}
@@ -151,7 +148,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (TM) TLI = TM->getTargetLowering();
TLInfo = &getAnalysis<TargetLibraryInfo>();
DT = getAnalysisIfAvailable<DominatorTree>();
- PFI = getAnalysisIfAvailable<ProfileInfo>();
OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::OptimizeForSize);
@@ -442,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
DT->changeImmediateDominator(DestBB, NewIDom);
DT->eraseNode(BB);
}
- if (PFI) {
- PFI->replaceAllUses(BB, DestBB);
- PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB));
- }
BB->eraseFromParent();
++NumBlocksElim;
@@ -840,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
}
};
+#ifndef NDEBUG
static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
AM.print(OS);
return OS;
}
+#endif
void ExtAddrMode::print(raw_ostream &OS) const {
bool NeedPlus = false;
@@ -1035,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
case Instruction::IntToPtr:
// This inttoptr is a no-op if the integer type is pointer sized.
if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
- TLI.getPointerTy())
+ TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
return MatchAddr(AddrInst->getOperand(0), Depth);
return false;
case Instruction::BitCast:
@@ -1418,8 +1412,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
Value *Address = User->getOperand(OpNo);
if (!Address->getType()->isPointerTy())
return false;
- Type *AddressAccessTy =
- cast<PointerType>(Address->getType())->getElementType();
+ Type *AddressAccessTy = Address->getType()->getPointerElementType();
// Do a match against the root of this address, ignoring profitability. This
// will tell us if the addressing mode for the memory operation will
@@ -1573,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
} else {
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst);
- Type *IntPtrTy =
- TLI->getDataLayout()->getIntPtrType(AccessTy->getContext());
-
+ Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
Value *Result = 0;
// Start with the base register. Do this first so that subsequent address
@@ -1894,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
// It is possible for very late stage optimizations (such as SimplifyCFG)
// to introduce PHI nodes too late to be cleaned up. If we detect such a
// trivial PHI, go ahead and zap it here.
- if (Value *V = SimplifyInstruction(P)) {
+ if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0,
+ TLInfo, DT)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
++NumPHIsElim;
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 3c08634..5266894 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -72,11 +72,6 @@ namespace {
}
namespace llvm {
-// SimpleValue is POD.
-template<> struct isPodLike<SimpleValue> {
- static const bool value = true;
-};
-
template<> struct DenseMapInfo<SimpleValue> {
static inline SimpleValue getEmptyKey() {
return DenseMapInfo<Instruction*>::getEmptyKey();
@@ -220,11 +215,6 @@ namespace {
}
namespace llvm {
- // CallValue is POD.
- template<> struct isPodLike<CallValue> {
- static const bool value = true;
- };
-
template<> struct DenseMapInfo<CallValue> {
static inline CallValue getEmptyKey() {
return DenseMapInfo<Instruction*>::getEmptyKey();
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index bc418af..6af269d 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
@@ -507,7 +508,9 @@ namespace {
enum ValType {
SimpleVal, // A simple offsetted value that is accessed.
LoadVal, // A value produced by a load.
- MemIntrin // A memory intrinsic which is loaded from.
+ MemIntrin, // A memory intrinsic which is loaded from.
+    UndefVal     // An UndefValue representing a value from a dead block (which
+ // is not yet physically removed from the CFG).
};
/// V - The value that is live out of the block.
@@ -545,10 +548,20 @@ namespace {
Res.Offset = Offset;
return Res;
}
-
+
+ static AvailableValueInBlock getUndef(BasicBlock *BB) {
+ AvailableValueInBlock Res;
+ Res.BB = BB;
+ Res.Val.setPointer(0);
+ Res.Val.setInt(UndefVal);
+ Res.Offset = 0;
+ return Res;
+ }
+
bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+ bool isUndefValue() const { return Val.getInt() == UndefVal; }
Value *getSimpleValue() const {
assert(isSimpleValue() && "Wrong accessor");
@@ -576,6 +589,7 @@ namespace {
DominatorTree *DT;
const DataLayout *TD;
const TargetLibraryInfo *TLI;
+ SetVector<BasicBlock *> DeadBlocks;
ValueTable VN;
@@ -698,6 +712,9 @@ namespace {
unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
const BasicBlockEdge &Root);
bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root);
+ bool processFoldableCondBr(BranchInst *BI);
+ void addDeadBlock(BasicBlock *BB);
+ void assignValNumForDeadCode();
};
char GVN::ID = 0;
@@ -1071,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
if (Offset == -1)
return Offset;
+ unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
Src = ConstantExpr::getBitCast(Src,
- llvm::Type::getInt8PtrTy(Src->getContext()));
+ Type::getInt8PtrTy(Src->getContext(), AS));
Constant *OffsetCst =
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
if (ConstantFoldLoadFromConstPtr(Src, &TD))
return Offset;
return -1;
@@ -1155,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
Type *DestPTy =
IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
DestPTy = PointerType::get(DestPTy,
- cast<PointerType>(PtrVal->getType())->getAddressSpace());
+ PtrVal->getType()->getPointerAddressSpace());
Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
@@ -1230,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
// Otherwise, this is a memcpy/memmove from a constant global.
MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
Constant *Src = cast<Constant>(MTI->getSource());
+ unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
Src = ConstantExpr::getBitCast(Src,
- llvm::Type::getInt8PtrTy(Src->getContext()));
+ Type::getInt8PtrTy(Src->getContext(), AS));
Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
return ConstantFoldLoadFromConstPtr(Src, &TD);
}
@@ -1253,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
// just use the dominating value directly.
if (ValuesPerBlock.size() == 1 &&
gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
- LI->getParent()))
+ LI->getParent())) {
+ assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominates this block");
return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+ }
// Otherwise, we have to construct SSA form.
SmallVector<PHINode*, 8> NewPHIs;
@@ -1324,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
<< *getCoercedLoadValue() << '\n'
<< *Res << '\n' << "\n\n\n");
}
- } else {
+ } else if (isMemIntrinValue()) {
const DataLayout *TD = gvn.getDataLayout();
assert(TD && "Need target data to handle type mismatch case");
Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
@@ -1332,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
<< *Res << '\n' << "\n\n\n");
+ } else {
+ assert(isUndefValue() && "Should be UndefVal");
+ DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ return UndefValue::get(LoadTy);
}
return Res;
}
@@ -1355,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
BasicBlock *DepBB = Deps[i].getBB();
MemDepResult DepInfo = Deps[i].getResult();
+ if (DeadBlocks.count(DepBB)) {
+      // A dead dependent mem-op disguises itself as a load evaluating the same
+      // value as the load in question.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+
if (!DepInfo.isDef() && !DepInfo.isClobber()) {
UnavailableBlocks.push_back(DepBB);
continue;
@@ -2191,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) {
// For conditional branches, we can perform simple conditional propagation on
// the condition value itself.
if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ if (!BI->isConditional())
return false;
- Value *BranchCond = BI->getCondition();
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+ Value *BranchCond = BI->getCondition();
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
// Avoid multiple edges early.
@@ -2312,6 +2346,9 @@ bool GVN::runOnFunction(Function& F) {
}
if (EnablePRE) {
+ // Fabricate val-num for dead-code in order to suppress assertion in
+ // performPRE().
+ assignValNumForDeadCode();
bool PREChanged = true;
while (PREChanged) {
PREChanged = performPRE(F);
@@ -2325,6 +2362,9 @@ bool GVN::runOnFunction(Function& F) {
// Actually, when this happens, we should just fully integrate PRE into GVN.
cleanupGlobalSets();
+  // Do not clean up DeadBlocks in cleanupGlobalSets() as it's called for each
+ // iteration.
+ DeadBlocks.clear();
return Changed;
}
@@ -2335,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) {
// (and incrementing BI before processing an instruction).
assert(InstrsToErase.empty() &&
"We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB))
+ return false;
+
bool ChangedFunction = false;
for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
@@ -2628,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
}
}
}
+
+// BB is declared dead, which implies that other blocks become dead as well.
+// This function adds all such blocks to "DeadBlocks". For the dead blocks'
+// live successors, it updates their phi nodes by replacing the operands
+// corresponding to dead blocks with UndefVal.
+//
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(),
+ E = Dom.end(); I != E; I++) {
+ BasicBlock *B = *I;
+ for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) {
+ BasicBlock *S = *SI;
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++)
+ if (!DeadBlocks.count(*PI)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+        // Although S is not dominated by D, it is dead by now. This can
+        // happen if S already had a dead predecessor before D was declared
+        // dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B))
+ continue;
+
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+ for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(),
+ PE = Preds.end(); PI != PE; PI++) {
+ BasicBlock *P = *PI;
+
+ if (!DeadBlocks.count(P))
+ continue;
+
+ if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S);
+ }
+
+ for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+ PHINode &Phi = cast<PHINode>(*II);
+ Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+ UndefValue::get(Phi.getType()));
+ }
+ }
+ }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), it performs the following analyses and
+// transformation.
+//  1) If the dead outgoing edge is a critical edge, split it. Let
+//     R be the target of the dead outgoing edge.
+//  2) Identify the set of dead blocks implied by the branch's dead outgoing
+//     edge. The result of this step will be {X | X is dominated by R}.
+//  3) Identify those blocks which have at least one dead predecessor. The
+//     result of this step will be dominance-frontier(R).
+//  4) Update the PHIs in DF(R) by replacing the operands corresponding to
+//     dead blocks with "UndefVal" in the hope that these PHIs will be
+//     optimized away.
+//
+// Return true iff *NEW* dead code is found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot = Cond->getZExtValue() ?
+ BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot))
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor())
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
+
+// performPRE() will trigger an assertion if it comes across an instruction
+// without an associated val-num. As there are normally far more live
+// instructions than dead instructions, it makes more sense just to "fabricate"
+// a val-number for the dead code than to check whether each instruction
+// involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+ for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(),
+ E = DeadBlocks.end(); I != E; I++) {
+ BasicBlock *BB = *I;
+ for (BasicBlock::iterator II = BB->begin(), EE = BB->end();
+ II != EE; II++) {
+ Instruction *Inst = &*II;
+ unsigned ValNum = VN.lookup_or_add(Inst);
+ addToLeaderTable(ValNum, Inst, BB);
+ }
+ }
+}
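As a reading aid for GVN::addDeadBlock() above, here is a minimal standalone sketch of the same worklist idea on a made-up CFG: starting from the dead root, a live successor becomes dead only once all of its predecessors are dead, and successors that keep a live predecessor are where the real pass rewrites PHI operands to undef. Plain C++ with hypothetical block names; it deliberately omits the dominator-tree step (DT->getDescendants) and the critical-edge splitting done by the pass.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Toy CFG: block -> successors. "deadroot" plays the role of the target of
  // the dead edge of a foldable conditional branch.
  std::map<std::string, std::vector<std::string>> Succ = {
      {"entry", {"live", "deadroot"}},
      {"deadroot", {"a"}},
      {"a", {"join"}},
      {"live", {"join"}},
      {"join", {}}};

  // Derive predecessor lists from the successor lists.
  std::map<std::string, std::vector<std::string>> Pred;
  for (const auto &P : Succ)
    for (const auto &S : P.second)
      Pred[S].push_back(P.first);

  std::set<std::string> Dead;
  std::vector<std::string> Work{"deadroot"};
  while (!Work.empty()) {
    std::string B = Work.back();
    Work.pop_back();
    if (!Dead.insert(B).second)
      continue;
    for (const auto &S : Succ[B]) {
      bool AllPredDead = true;
      for (const auto &P : Pred[S])
        if (!Dead.count(P)) {
          AllPredDead = false;
          break;
        }
      if (AllPredDead)
        Work.push_back(S); // S is unreachable once every predecessor is dead
      // else: S stays live; the real pass instead rewrites S's PHI operands
      // that come from dead predecessors to undef.
    }
  }

  for (const auto &B : Dead)
    std::cout << B << " is dead\n"; // prints: a, deadroot
  return 0;
}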
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index d51e034..235aaaa 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
// and varies predictably *inside* the loop. Evaluate the value it
// contains when the loop exits, if possible.
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
- if (!SE->isLoopInvariant(ExitValue, L) || !isSafeToExpand(ExitValue))
+ if (!SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
continue;
// Computing the value outside of the loop brings no benefit if :
@@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
if (IndVar->getType()->isPointerTy()
&& !IVCount->getType()->isPointerTy()) {
+ // IVOffset will be the new GEP offset that is interpreted by GEP as a
+ // signed value. IVCount on the other hand represents the loop trip count,
+ // which is an unsigned value. FindLoopCounter only allows induction
+ // variables that have a positive unit stride of one. This means we don't
+ // have to handle the case of negative offsets (yet) and just need to zero
+ // extend IVCount.
Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
- const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy);
+ const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
// Expand the code for the iteration count.
assert(SE->isLoopInvariant(IVOffset, L) &&
@@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
// We could handle pointer IVs other than i8*, but we need to compensate for
// gep index scaling. See canExpandBackedgeTakenCount comments.
- assert(SE->getSizeOfExpr(
+ assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
&& "unit stride pointer IV must be i8*");
@@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
// BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
//
// Valid Cases: (1) both integers is most common; (2) both may be pointers
- // for simple memset-style loops; (3) IVInit is an integer and IVCount is a
- // pointer may occur when enable-iv-rewrite generates a canonical IV on top
- // of case #2.
+ // for simple memset-style loops.
+ //
+ // IVInit integer and IVCount pointer would only occur if a canonical IV
+ // were generated on top of case #2, which is not expected.
const SCEV *IVLimit = 0;
// For unit stride, IVCount = Start + BECount with 2's complement overflow.
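The switch from getTruncateOrSignExtend to getTruncateOrZeroExtend above is easiest to see with a deliberately narrow width. A minimal sketch (plain C++, hypothetical numbers, two's complement assumed; not part of the patch): the trip count is an unsigned quantity, so sign-extending a large count would turn it into a negative GEP offset, while zero-extending preserves it.

#include <cstdint>
#include <iostream>

int main() {
  uint8_t IVCount = 200;                   // unsigned trip count, fits in 8 bits
  int64_t SExt = (int64_t)(int8_t)IVCount; // sign-extend: -56, wrong as a GEP offset
  int64_t ZExt = (int64_t)IVCount;         // zero-extend: 200, the intended offset
  std::cout << "sext: " << SExt << ", zext: " << ZExt << "\n";
  return 0;
}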
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 0b8906d..b3ec2fc 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -827,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
return false;
}
-
/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
/// load instruction, eliminate it by replacing it with a PHI node. This is an
/// important optimization that encourages jump threading, and needs to be run
@@ -842,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (LoadBB->getSinglePredecessor())
return false;
+ // If the load is defined in a landing pad, it can't be partially redundant,
+ // because the edges between the invoke and the landing pad cannot have other
+ // instructions between them.
+ if (LoadBB->isLandingPad())
+ return false;
+
Value *LoadedPtr = LI->getOperand(0);
// If the loaded operand is defined in the LoadBB, it can't be available.
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 07d991b..952b76b 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() {
if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
return false;
- // Counting population are usually conducted by few arithmetic instrutions.
+  // Counting population is usually conducted by a few arithmetic instructions.
// Such instructions can be easilly "absorbed" by vacant slots in a
// non-compact loop. Therefore, recognizing popcount idiom only makes sense
// in a compact loop.
@@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = 0;
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+
// If we're allowed to form a memset, and the stored value would be acceptable
// for memset, use it.
if (SplatValue && TLI->has(LibFunc::memset) &&
@@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
CurLoop->isLoopInvariant(SplatValue)) {
// Keep and use SplatValue.
PatternValue = 0;
- } else if (TLI->has(LibFunc::memset_pattern16) &&
+ } else if (DestAS == 0 &&
+ TLI->has(LibFunc::memset_pattern16) &&
(PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
+    // Don't create memset_pattern16s for pointers in non-default address spaces.
// It looks like we can use PatternValue!
SplatValue = 0;
} else {
@@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, "loop-idiom");
+ Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
// this into a memset in the loop preheader now if we want. However, this
// would be unsafe to do if there is anything else in the loop that may read
// or write to the aliased location. Check for any overlap by generating the
// base pointer and checking the region.
- unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
Value *BasePtr =
- Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
+ Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy,
Preheader->getTerminator());
-
if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
CurLoop, BECount,
- StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+ StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
deleteIfDeadInstruction(BasePtr, *SE, TLI);
@@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
// pointer size if it isn't already.
- Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
+ Type *IntPtr = Builder.getIntPtrTy(TD, DestAS);
BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
SCEV::FlagNUW);
- if (StoreSize != 1)
+ if (StoreSize != 1) {
NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
SCEV::FlagNUW);
+ }
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
CallInst *NewCall;
- if (SplatValue)
- NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment);
- else {
+ if (SplatValue) {
+ NewCall = Builder.CreateMemSet(BasePtr,
+ SplatValue,
+ NumBytes,
+ StoreAlignment);
+ } else {
+    // Everything is emitted in the default address space.
+ Type *Int8PtrTy = DestInt8PtrTy;
+
Module *M = TheStore->getParent()->getParent()->getParent();
Value *MSP = M->getOrInsertFunction("memset_pattern16",
Builder.getVoidTy(),
- Builder.getInt8PtrTy(),
- Builder.getInt8PtrTy(), IntPtr,
+ Int8PtrTy,
+ Int8PtrTy,
+ IntPtr,
(void*)0);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
@@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(true); // Ok to merge these.
GV->setAlignment(16);
- Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy());
+ Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
}
@@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
// pointer size if it isn't already.
- Type *IntPtr = TD->getIntPtrType(SI->getContext());
- BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+ Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace());
+ BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
- const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+ const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1),
SCEV::FlagNUW);
if (StoreSize != 1)
- NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
SCEV::FlagNUW);
Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+ Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
CallInst *NewCall =
Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
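Both hunks above expand the stored byte count as (BECount + 1) * StoreSize in the pointer-sized integer type of the destination's address space. A small worked sketch with hypothetical values (plain C++, illustration only; not part of the patch): a loop whose backedge-taken count is 99 runs its body 100 times, so storing 4 bytes per iteration yields a 400-byte memset/memcpy length.

#include <cstdint>
#include <iostream>

int main() {
  uint64_t BECount = 99;   // backedge-taken count; the body runs BECount + 1 times
  uint64_t StoreSize = 4;  // bytes stored per iteration
  uint64_t NumBytes = (BECount + 1) * StoreSize;
  std::cout << "memset/memcpy length: " << NumBytes << " bytes\n"; // 400
  return 0;
}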
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 0000000..335af81
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1184 @@
+//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reroll"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+ cl::desc("The maximum increment for loop rerolling"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+// for (int i = 0; i < 500; i += 3) {
+// foo(i);
+// foo(i+1);
+// foo(i+2);
+// }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i)
+// foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i) {
+// x[3*i] = foo(0);
+// x[3*i+1] = foo(0);
+// x[3*i+2] = foo(0);
+// }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 1500; ++i)
+// x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+ class LoopReroll : public LoopPass {
+ public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopReroll() : LoopPass(ID) {
+ initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<LoopInfo>();
+ AU.addPreserved<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<DominatorTree>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<TargetLibraryInfo>();
+ }
+
+protected:
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DataLayout *DL;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+
+ typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+ typedef SmallSet<Instruction *, 16> SmallInstructionSet;
+
+  // A chain of isomorphic instructions, identified by a single-use PHI,
+ // representing a reduction. Only the last value may be used outside the
+ // loop.
+ struct SimpleLoopReduction {
+ SimpleLoopReduction(Instruction *P, Loop *L)
+ : Valid(false), Instructions(1, P) {
+ assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+ add(L);
+ }
+
+ bool valid() const {
+ return Valid;
+ }
+
+ Instruction *getPHI() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.front();
+ }
+
+ Instruction *getReducedValue() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.back();
+ }
+
+ Instruction *get(size_t i) const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions[i+1];
+ }
+
+ Instruction *operator [] (size_t i) const { return get(i); }
+
+ // The size, ignoring the initial PHI.
+ size_t size() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.size()-1;
+ }
+
+ typedef SmallInstructionVector::iterator iterator;
+ typedef SmallInstructionVector::const_iterator const_iterator;
+
+ iterator begin() {
+ assert(Valid && "Using invalid reduction");
+ return llvm::next(Instructions.begin());
+ }
+
+ const_iterator begin() const {
+ assert(Valid && "Using invalid reduction");
+ return llvm::next(Instructions.begin());
+ }
+
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ protected:
+ bool Valid;
+ SmallInstructionVector Instructions;
+
+ void add(Loop *L);
+ };
+
+ // The set of all reductions, and state tracking of possible reductions
+ // during loop instruction processing.
+ struct ReductionTracker {
+ typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+
+ // Add a new possible reduction.
+ void addSLR(SimpleLoopReduction &SLR) {
+ PossibleReds.push_back(SLR);
+ }
+
+    // Set up to track possible reductions corresponding to the provided
+    // rerolling scale. Only reductions with a number of non-PHI instructions
+    // that is divisible by the scale are considered. Three instruction sets
+ // are filled in:
+ // - A set of all possible instructions in eligible reductions.
+ // - A set of all PHIs in eligible reductions
+ // - A set of all reduced values (last instructions) in eligible reductions.
+ void restrictToScale(uint64_t Scale,
+ SmallInstructionSet &PossibleRedSet,
+ SmallInstructionSet &PossibleRedPHISet,
+ SmallInstructionSet &PossibleRedLastSet) {
+ PossibleRedIdx.clear();
+ PossibleRedIter.clear();
+ Reds.clear();
+
+ for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+ if (PossibleReds[i].size() % Scale == 0) {
+ PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+ PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+ PossibleRedSet.insert(PossibleReds[i].getPHI());
+ PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+ for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
+ JE = PossibleReds[i].end(); J != JE; ++J) {
+ PossibleRedSet.insert(*J);
+ PossibleRedIdx[*J] = i;
+ }
+ }
+ }
+
+ // The functions below are used while processing the loop instructions.
+
+ // Are the two instructions both from reductions, and furthermore, from
+ // the same reduction?
+ bool isPairInSame(Instruction *J1, Instruction *J2) {
+ DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+ if (J1I != PossibleRedIdx.end()) {
+ DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+ if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+ return true;
+ }
+
+ return false;
+ }
+
+ // The two provided instructions, the first from the base iteration, and
+ // the second from iteration i, form a matched pair. If these are part of
+ // a reduction, record that fact.
+ void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+ if (PossibleRedIdx.count(J1)) {
+ assert(PossibleRedIdx.count(J2) &&
+ "Recording reduction vs. non-reduction instruction?");
+
+ PossibleRedIter[J1] = 0;
+ PossibleRedIter[J2] = i;
+
+ int Idx = PossibleRedIdx[J1];
+ assert(Idx == PossibleRedIdx[J2] &&
+ "Recording pair from different reductions?");
+ Reds.insert(Idx);
+ }
+ }
+
+ // The functions below can be called after we've finished processing all
+ // instructions in the loop, and we know which reductions were selected.
+
+ // Is the provided instruction the PHI of a reduction selected for
+ // rerolling?
+ bool isSelectedPHI(Instruction *J) {
+ if (!isa<PHINode>(J))
+ return false;
+
+ for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+ RI != RIE; ++RI) {
+ int i = *RI;
+ if (cast<Instruction>(J) == PossibleReds[i].getPHI())
+ return true;
+ }
+
+ return false;
+ }
+
+ bool validateSelected();
+ void replaceSelected();
+
+ protected:
+ // The vector of all possible reductions (for any scale).
+ SmallReductionVector PossibleReds;
+
+ DenseMap<Instruction *, int> PossibleRedIdx;
+ DenseMap<Instruction *, int> PossibleRedIter;
+ DenseSet<int> Reds;
+ };
+
+ void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+ void collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions);
+ void collectInLoopUserSet(Loop *L,
+ const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ void collectInLoopUserSet(Loop *L,
+ Instruction * Root,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+ Instruction *&IV,
+ SmallInstructionVector &LoopIncs);
+ bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV,
+ SmallVector<SmallInstructionVector, 32> &Roots,
+ SmallInstructionSet &AllRoots,
+ SmallInstructionVector &LoopIncs);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+ ReductionTracker &Reductions);
+ };
+}
+
+char LoopReroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+Pass *llvm::createLoopRerollPass() {
+ return new LoopReroll;
+}
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+ for (Value::use_iterator UI = I->use_begin(),
+ UIE = I->use_end(); UI != UIE; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (!L->contains(User))
+ return true;
+ }
+
+ return false;
+}
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+ SmallInstructionVector &PossibleIVs) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isIntegerTy())
+ continue;
+
+ if (const SCEVAddRecExpr *PHISCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) {
+ if (PHISCEV->getLoop() != L)
+ continue;
+ if (!PHISCEV->isAffine())
+ continue;
+ if (const SCEVConstant *IncSCEV =
+ dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
+ if (!IncSCEV->getValue()->getValue().isStrictlyPositive())
+ continue;
+ if (IncSCEV->getValue()->uge(MaxInc))
+ continue;
+
+ DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " <<
+ *PHISCEV << "\n");
+ PossibleIVs.push_back(I);
+ }
+ }
+ }
+}
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+ assert(!Valid && "Cannot add to an already-valid chain");
+
+ // The reduction variable must be a chain of single-use instructions
+ // (including the PHI), except for the last value (which is used by the PHI
+ // and also outside the loop).
+ Instruction *C = Instructions.front();
+
+ do {
+ C = cast<Instruction>(*C->use_begin());
+ if (C->hasOneUse()) {
+ if (!C->isBinaryOp())
+ return;
+
+ if (!(isa<PHINode>(Instructions.back()) ||
+ C->isSameOperationAs(Instructions.back())))
+ return;
+
+ Instructions.push_back(C);
+ }
+ } while (C->hasOneUse());
+
+ if (Instructions.size() < 2 ||
+ !C->isSameOperationAs(Instructions.back()) ||
+ C->use_begin() == C->use_end())
+ return;
+
+ // C is now the (potential) last instruction in the reduction chain.
+ for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end();
+ UI != UIE; ++UI) {
+ // The only in-loop user can be the initial PHI.
+ if (L->contains(cast<Instruction>(*UI)))
+      if (cast<Instruction>(*UI) != Instructions.front())
+ return;
+ }
+
+ Instructions.push_back(C);
+ Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isSingleValueType())
+ continue;
+
+ SimpleLoopReduction SLR(I, L);
+ if (!SLR.valid())
+ continue;
+
+ DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+ SLR.size() << " chained instructions)\n");
+ Reductions.addSLR(SLR);
+ }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+// 1. Instructions in the set of excluded instructions are never added to the
+//    use set (even if they are users). This is used, for example, to keep
+//    root increments out of the use set of the primary IV.
+//
+// 2. Instructions in the set of final instructions are added to the use set
+// if they are users, but their users are not added. This is used, for
+// example, to prevent a reduction update from forcing all later reduction
+// updates into the use set.
+void LoopReroll::collectInLoopUserSet(Loop *L,
+ Instruction *Root, const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ SmallInstructionVector Queue(1, Root);
+ while (!Queue.empty()) {
+ Instruction *I = Queue.pop_back_val();
+ if (!Users.insert(I).second)
+ continue;
+
+ if (!Final.count(I))
+ for (Value::use_iterator UI = I->use_begin(),
+ UIE = I->use_end(); UI != UIE; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ // Ignore "wrap-around" uses to PHIs of this loop's header.
+ if (PN->getIncomingBlock(UI) == L->getHeader())
+ continue;
+ }
+
+ if (L->contains(User) && !Exclude.count(User)) {
+ Queue.push_back(User);
+ }
+ }
+
+ // We also want to collect single-user "feeder" values.
+ for (User::op_iterator OI = I->op_begin(),
+ OIE = I->op_end(); OI != OIE; ++OI) {
+ if (Instruction *Op = dyn_cast<Instruction>(*OI))
+ if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+ !Final.count(Op))
+ Queue.push_back(Op);
+ }
+ }
+}
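A minimal standalone sketch of the collection rule documented above, on a made-up def-use graph (plain C++; value names are hypothetical and this is not the LLVM use-list API, nor part of the patch): walk users transitively from a root, never enter Exclude nodes, and add Final nodes without following their users. The real function additionally pulls in single-use operand "feeders", which this sketch omits.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Toy def-use graph: value name -> names of its users.
  std::map<std::string, std::vector<std::string>> Users = {
      {"root", {"a", "excluded"}},
      {"a", {"b", "final"}},
      {"b", {}},
      {"final", {"past_final"}},
      {"excluded", {"c"}},
      {"past_final", {}},
      {"c", {}}};
  std::set<std::string> Exclude = {"excluded"};
  std::set<std::string> Final = {"final"};

  std::set<std::string> Collected;
  std::vector<std::string> Queue = {"root"};
  while (!Queue.empty()) {
    std::string V = Queue.back();
    Queue.pop_back();
    if (!Collected.insert(V).second)
      continue; // already visited
    if (Final.count(V))
      continue; // added, but its users are not followed
    for (const auto &U : Users[V])
      if (!Exclude.count(U))
        Queue.push_back(U);
  }

  for (const auto &V : Collected)
    std::cout << V << "\n"; // prints: a, b, final, root
  return 0;
}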
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::collectInLoopUserSet(Loop *L,
+ const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ for (SmallInstructionVector::const_iterator I = Roots.begin(),
+ IE = Roots.end(); I != IE; ++I)
+ collectInLoopUserSet(L, *I, Exclude, Final, Users);
+}
+
+static bool isSimpleLoadStore(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return false;
+}
+
+// Recognize loops that are set up like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
+bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+ Instruction *&IV,
+ SmallInstructionVector &LoopIncs) {
+ // This is a special case: here we're looking for all uses (except for
+ // the increment) to be multiplied by a common factor. The increment must
+ // be by one. This is to capture loops like:
+ // for (int i = 0; i < 500; ++i) {
+ // foo(3*i); foo(3*i+1); foo(3*i+2);
+ // }
+ if (RealIV->getNumUses() != 2)
+ return false;
+ const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
+ Instruction *User1 = cast<Instruction>(*RealIV->use_begin()),
+ *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin()));
+ if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
+ return false;
+ const SCEVAddRecExpr *User1SCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)),
+ *User2SCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2));
+ if (!User1SCEV || !User1SCEV->isAffine() ||
+ !User2SCEV || !User2SCEV->isAffine())
+ return false;
+
+ // We assume below that User1 is the scale multiply and User2 is the
+ // increment. If this can't be true, then swap them.
+ if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
+ std::swap(User1, User2);
+ std::swap(User1SCEV, User2SCEV);
+ }
+
+ if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
+ return false;
+ assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
+ "Invalid non-unit step for multiplicative scaling");
+ LoopIncs.push_back(User2);
+
+ if (const SCEVConstant *MulScale =
+ dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) {
+ // Make sure that both the start and step have the same multiplier.
+ if (RealIVSCEV->getStart()->getType() != MulScale->getType())
+ return false;
+ if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) !=
+ User1SCEV->getStart())
+ return false;
+
+ ConstantInt *MulScaleCI = MulScale->getValue();
+ if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc))
+ return false;
+ Scale = MulScaleCI->getZExtValue();
+ IV = User1;
+ } else
+ return false;
+
+ DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n");
+ return true;
+}
+
+// Collect all root increments with respect to the provided induction variable
+// (normally the PHI, but sometimes a multiply). A root increment is an
+// instruction, normally an add, with a positive constant less than Scale. In a
+// rerollable loop, each of these increments is the root of an instruction
+// graph isomorphic to the others. Also, we collect the final induction
+// increment (the increment equal to the Scale), and its users in LoopIncs.
+bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
+ Instruction *IV,
+ SmallVector<SmallInstructionVector, 32> &Roots,
+ SmallInstructionSet &AllRoots,
+ SmallInstructionVector &LoopIncs) {
+ for (Value::use_iterator UI = IV->use_begin(),
+ UIE = IV->use_end(); UI != UIE; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (!SE->isSCEVable(User->getType()))
+ continue;
+ if (User->getType() != IV->getType())
+ continue;
+ if (!L->contains(User))
+ continue;
+ if (hasUsesOutsideLoop(User, L))
+ continue;
+
+ if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(
+ SE->getSCEV(User), SE->getSCEV(IV)))) {
+ uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
+ if (Idx > 0 && Idx < Scale) {
+ Roots[Idx-1].push_back(User);
+ AllRoots.insert(User);
+ } else if (Idx == Scale && Inc > 1) {
+ LoopIncs.push_back(User);
+ }
+ }
+ }
+
+ if (Roots[0].empty())
+ return false;
+ bool AllSame = true;
+ for (unsigned i = 1; i < Scale-1; ++i)
+ if (Roots[i].size() != Roots[0].size()) {
+ AllSame = false;
+ break;
+ }
+
+ if (!AllSame)
+ return false;
+
+ return true;
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+ // For a non-associative reduction, the chain entries must appear in order.
+ for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+ RI != RIE; ++RI) {
+ int i = *RI;
+ int PrevIter = 0, BaseCount = 0, Count = 0;
+ for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
+ JE = PossibleReds[i].end(); J != JE; ++J) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[*J];
+ if (Iter != PrevIter && Iter != PrevIter + 1 &&
+ !PossibleReds[i].getReducedValue()->isAssociative()) {
+ DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+ *J << "\n");
+ return false;
+ }
+
+ if (Iter != PrevIter) {
+ if (Count != BaseCount) {
+ DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+ " reduction use count " << Count <<
+ " is not equal to the base use count " <<
+ BaseCount << "\n");
+ return false;
+ }
+
+ Count = 0;
+ }
+
+ ++Count;
+ if (Iter == 0)
+ ++BaseCount;
+
+ PrevIter = Iter;
+ }
+ }
+
+ return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+ // Fixup reductions to refer to the last instruction associated with the
+ // first iteration (not the last).
+ for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+ RI != RIE; ++RI) {
+ int i = *RI;
+ int j = 0;
+ for (int e = PossibleReds[i].size(); j != e; ++j)
+ if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+ --j;
+ break;
+ }
+
+ // Replace users with the new end-of-chain value.
+ SmallInstructionVector Users;
+ for (Value::use_iterator UI =
+ PossibleReds[i].getReducedValue()->use_begin(),
+ UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI)
+ Users.push_back(cast<Instruction>(*UI));
+
+ for (SmallInstructionVector::iterator J = Users.begin(),
+ JE = Users.end(); J != JE; ++J)
+ (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ PossibleReds[i][j]);
+ }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+//   speculatively executed), and so do the matched instructions, then we
+//   cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, simple latch code, parts of validated reductions, part of
+// f(%iv), or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *IterCount,
+ ReductionTracker &Reductions) {
+ const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
+ uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
+ getValue()->getZExtValue();
+ // The collection of loop increment instructions.
+ SmallInstructionVector LoopIncs;
+ uint64_t Scale = Inc;
+
+ // The effective induction variable, IV, is normally also the real induction
+ // variable. When we're dealing with a loop like:
+ // for (int i = 0; i < 500; ++i)
+ // x[3*i] = ...;
+ // x[3*i+1] = ...;
+ // x[3*i+2] = ...;
+ // then the real IV is still i, but the effective IV is (3*i).
+ Instruction *RealIV = IV;
+ if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
+ return false;
+
+ assert(Scale <= MaxInc && "Scale is too large");
+ assert(Scale > 1 && "Scale must be at least 2");
+
+ // The set of increment instructions for each increment value.
+ SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
+ SmallInstructionSet AllRoots;
+ if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
+ return false;
+
+ DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+ *RealIV << "\n");
+
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
+ PossibleRedLastSet);
+
+ // We now need to check for equivalence of the use graph of each root with
+ // that of the primary induction variable (excluding the roots). Our goal
+ // here is not to solve the full graph isomorphism problem, but rather to
+ // catch common cases without a lot of work. As a result, we will assume
+ // that the relative order of the instructions in each unrolled iteration
+ // is the same (although we will not make an assumption about how the
+ // different iterations are intermixed). Note that while the order must be
+ // the same, the instructions may not be in the same basic block.
+ SmallInstructionSet Exclude(AllRoots);
+ Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+ DenseSet<Instruction *> BaseUseSet;
+ collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
+
+ DenseSet<Instruction *> AllRootUses;
+ std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+
+ bool MatchFailed = false;
+ for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
+ DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
+ collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
+ PossibleRedSet, RootUseSet);
+
+ DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
+ " vs. iteration increment " << (i+1) <<
+ " use set size: " << RootUseSet.size() << "\n");
+
+ if (BaseUseSet.size() != RootUseSet.size()) {
+ MatchFailed = true;
+ break;
+ }
+
+ // In addition to regular aliasing information, we need to look for
+ // instructions from later (future) iterations that have side effects
+ // preventing us from reordering them past other instructions with side
+ // effects.
+ bool FutureSideEffects = false;
+ AliasSetTracker AST(*AA);
+
+ // The map between instructions in f(%iv.(i+1)) and f(%iv).
+ DenseMap<Value *, Value *> BaseMap;
+
+ assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
+ for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
+ JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
+ if (cast<Instruction>(J1) == RealIV)
+ continue;
+ if (cast<Instruction>(J1) == IV)
+ continue;
+ if (!BaseUseSet.count(J1))
+ continue;
+ if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
+ continue;
+
+ while (J2 != JE && (!RootUseSet.count(J2) ||
+ std::find(Roots[i].begin(), Roots[i].end(), J2) !=
+ Roots[i].end())) {
+ // As we iterate through the instructions, instructions that don't
+ // belong to previous iterations (or the base case), must belong to
+ // future iterations. We want to track the alias set of writes from
+ // previous iterations.
+ if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
+ !AllRootUses.count(J2)) {
+ if (J2->mayWriteToMemory())
+ AST.add(J2);
+
+ // Note: This is specifically guarded by a check on isa<PHINode>,
+ // which while a valid (somewhat arbitrary) micro-optimization, is
+ // needed because otherwise isSafeToSpeculativelyExecute returns
+ // false on PHI nodes.
+ if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
+ FutureSideEffects = true;
+ }
+
+ ++J2;
+ }
+
+ if (!J1->isSameOperationAs(J2)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+ " vs. " << *J2 << "\n");
+ MatchFailed = true;
+ break;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some previous root instruction.
+ if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+ " vs. " << *J2 << " (prev. case overlap)\n");
+ MatchFailed = true;
+ break;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (J2->mayReadFromMemory()) {
+ for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
+ K != KE && !MatchFailed; ++K) {
+ if (K->aliasesUnknownInst(J2, *AA)) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+ " vs. " << *J2 << " (depends on future store)\n");
+ MatchFailed = true;
+ break;
+ }
+ }
+ }
+
+      // If we've passed an instruction from a future iteration that may have
+ // side effects, and this instruction might also, then we can't reorder
+ // them, and this matching fails. As an exception, we allow the alias
+ // set tracker to handle regular (simple) load/store dependencies.
+ if (FutureSideEffects &&
+ ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
+ (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+ " vs. " << *J2 <<
+ " (side effects prevent reordering)\n");
+ MatchFailed = true;
+ break;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(J1, J2);
+
+ if (!(InReduction && J1->isAssociative())) {
+        bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
+ Value *Op2 = J2->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+          // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(J2, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end())
+ Op2 = BMI->second;
+ else if (std::find(Roots[i].begin(), Roots[i].end(),
+ (Instruction*) Op2) != Roots[i].end())
+ Op2 = IV;
+
+ if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
+ if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
+ J1->getOperand(!j) == Op2) {
+ Swapped = true;
+ } else {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+ " vs. " << *J2 << " (operand " << j << ")\n");
+ MatchFailed = true;
+ break;
+ }
+ }
+
+ SomeOpMatched = true;
+ }
+ }
+
+ if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
+ (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
+ DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+ " vs. " << *J2 << " (uses outside loop)\n");
+ MatchFailed = true;
+ break;
+ }
+
+ if (!MatchFailed)
+ BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
+
+ AllRootUses.insert(J2);
+ Reductions.recordPair(J1, J2, i+1);
+
+ ++J2;
+ }
+ }
+
+ if (MatchFailed)
+ return false;
+
+ DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+ *RealIV << "\n");
+
+ DenseSet<Instruction *> LoopIncUseSet;
+ collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
+ SmallInstructionSet(), LoopIncUseSet);
+ DEBUG(dbgs() << "LRR: Loop increment set size: " <<
+ LoopIncUseSet.size() << "\n");
+
+ // Make sure that all instructions in the loop have been included in some
+ // use set.
+ for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
+ J != JE; ++J) {
+ if (isa<DbgInfoIntrinsic>(J))
+ continue;
+ if (cast<Instruction>(J) == RealIV)
+ continue;
+ if (cast<Instruction>(J) == IV)
+ continue;
+ if (BaseUseSet.count(J) || AllRootUses.count(J) ||
+ (LoopIncUseSet.count(J) && (J->isTerminator() ||
+ isSafeToSpeculativelyExecute(J, DL))))
+ continue;
+
+ if (AllRoots.count(J))
+ continue;
+
+ if (Reductions.isSelectedPHI(J))
+ continue;
+
+ DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
+ " unprocessed instruction found: " << *J << "\n");
+ MatchFailed = true;
+ break;
+ }
+
+ if (MatchFailed)
+ return false;
+
+ DEBUG(dbgs() << "LRR: all instructions processed from " <<
+ *RealIV << "\n");
+
+ if (!Reductions.validateSelected())
+ return false;
+
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+
+ // Remove instructions associated with non-base iterations.
+ for (BasicBlock::reverse_iterator J = Header->rbegin();
+ J != Header->rend();) {
+ if (AllRootUses.count(&*J)) {
+ Instruction *D = &*J;
+ DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
+ D->eraseFromParent();
+ continue;
+ }
+
+ ++J;
+ }
+
+ // Insert the new induction variable.
+ const SCEV *Start = RealIVSCEV->getStart();
+ if (Inc == 1)
+ Start = SE->getMulExpr(Start,
+ SE->getConstant(Start->getType(), Scale));
+ const SCEVAddRecExpr *H =
+ cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start,
+ SE->getConstant(RealIVSCEV->getType(), 1),
+ L, SCEV::FlagAnyWrap));
+ { // Limit the lifetime of SCEVExpander.
+ SCEVExpander Expander(*SE, "reroll");
+ PHINode *NewIV =
+ cast<PHINode>(Expander.expandCodeFor(H, IV->getType(),
+ Header->begin()));
+ for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
+ JE = BaseUseSet.end(); J != JE; ++J)
+ (*J)->replaceUsesOfWith(IV, NewIV);
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+ if (LoopIncUseSet.count(BI)) {
+ const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+ if (Inc == 1)
+ ICSCEV =
+ SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
+ Value *IC;
+ if (isa<SCEVConstant>(ICSCEV)) {
+ IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI);
+ } else {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ Preheader = InsertPreheaderForLoop(L, this);
+
+ IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(),
+ Preheader->getTerminator());
+ }
+
+ Value *NewIVNext = NewIV->getIncomingValueForBlock(Header);
+ Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC,
+ "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
+ }
+ }
+
+ SimplifyInstructionsInBlock(Header, DL, TLI);
+ DeleteDeadPHIs(Header, TLI);
+ ++NumRerolledLoops;
+ return true;
+}
+
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+ AA = &getAnalysis<AliasAnalysis>();
+ LI = &getAnalysis<LoopInfo>();
+ SE = &getAnalysis<ScalarEvolution>();
+ TLI = &getAnalysis<TargetLibraryInfo>();
+ DL = getAnalysisIfAvailable<DataLayout>();
+ DT = &getAnalysis<DominatorTree>();
+
+ BasicBlock *Header = L->getHeader();
+ DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+ "] Loop %" << Header->getName() << " (" <<
+ L->getNumBlocks() << " block(s))\n");
+
+ bool Changed = false;
+
+ // For now, we'll handle only single BB loops.
+ if (L->getNumBlocks() > 1)
+ return Changed;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return Changed;
+
+ const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
+ const SCEV *IterCount =
+ SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1));
+ DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+ // First, we need to find the induction variable with respect to which we can
+ // reroll (there may be several possible options).
+ SmallInstructionVector PossibleIVs;
+ collectPossibleIVs(L, PossibleIVs);
+
+ if (PossibleIVs.empty()) {
+ DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ return Changed;
+ }
+
+ ReductionTracker Reductions;
+ collectPossibleReductions(L, Reductions);
+
+ // For each possible IV, collect the associated possible set of 'root' nodes
+ // (i+1, i+2, etc.).
+ for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
+ IE = PossibleIVs.end(); I != IE; ++I)
+ if (reroll(*I, L, Header, IterCount, Reductions)) {
+ Changed = true;
+ break;
+ }
+
+ return Changed;
+}
+
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 14cb979..eff5268 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1170,6 +1170,13 @@ public:
/// may be used.
bool AllFixupsOutsideLoop;
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula;
+
/// WidestFixupType - This records the widest use type for any fixup using
/// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
/// max fixup widths to be equivalent, because the narrower one may be relying
@@ -1188,6 +1195,7 @@ public:
MinOffset(INT64_MAX),
MaxOffset(INT64_MIN),
AllFixupsOutsideLoop(true),
+ RigidFormula(false),
WidestFixupType(0) {}
bool HasFormulaWithSameRegs(const Formula &F) const;
@@ -1214,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
bool LSRUse::InsertFormula(const Formula &F) {
+ if (!Formulae.empty() && RigidFormula)
+ return false;
+
SmallVector<const SCEV *, 4> Key = F.BaseRegs;
if (F.ScaledReg) Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for uniquifying.
@@ -1433,7 +1444,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
}
case LSRUse::ICmpZero:
// ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg.
- // Therefore, return 0 in case F.Scale == -1.
+ // Therefore, return 0 in case F.Scale == -1.
return F.Scale != -1;
case LSRUse::Basic:
@@ -2943,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// x == y --> x - y == 0
const SCEV *N = SE.getSCEV(NV);
- if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) {
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
N = TransformForPostIncUse(Normalize, N, CI, 0,
@@ -2986,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
/// and loop-computable portions.
void
LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+ // Mark uses whose expressions cannot be expanded.
+ if (!isSafeToExpand(S, SE))
+ LU.RigidFormula = true;
+
Formula F;
F.InitialMatch(S, L, SE);
bool Inserted = InsertFormula(LU, LUIdx, F);
@@ -4353,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const {
const LSRUse &LU = Uses[LF.LUIdx];
+ if (LU.RigidFormula)
+ return LF.OperandValToReplace;
// Determine an input position which will be dominated by the operands and
// which will dominate the result.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 80d060b..08ac38d 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -49,12 +49,17 @@ namespace {
class LoopUnroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopUnroll(int T = -1, int C = -1, int P = -1) : LoopPass(ID) {
+ LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
+ CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;
UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+ UserAllowPartial = (P != -1) ||
+ (UnrollAllowPartial.getNumOccurrences() > 0);
+ UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0);
+ UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0);
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
@@ -75,7 +80,11 @@ namespace {
unsigned CurrentCount;
unsigned CurrentThreshold;
bool CurrentAllowPartial;
+ bool CurrentRuntime;
+ bool UserCount; // CurrentCount is user-specified.
bool UserThreshold; // CurrentThreshold is user-specified.
+ bool UserAllowPartial; // CurrentAllowPartial is user-specified.
+ bool UserRuntime; // CurrentRuntime is user-specified.
bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
- return new LoopUnroll(Threshold, Count, AllowPartial);
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
+ int Runtime) {
+ return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);
}
/// ApproximateLoopSize - Approximate the size of the loop.
@@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
<< "] Loop %" << Header->getName() << "\n");
(void)Header;
+ TargetTransformInfo::UnrollingPreferences UP;
+ UP.Threshold = CurrentThreshold;
+ UP.OptSizeThreshold = OptSizeUnrollThreshold;
+ UP.Count = CurrentCount;
+ UP.Partial = CurrentAllowPartial;
+ UP.Runtime = CurrentRuntime;
+ TTI.getUnrollingPreferences(L, UP);
+
// Determine the current unrolling threshold. While this is normally set
// from UnrollThreshold, it is overridden to a smaller value if the current
// function is marked as optimize-for-size, and the unroll threshold was
// not user specified.
- unsigned Threshold = CurrentThreshold;
+ unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
if (!UserThreshold &&
Header->getParent()->getAttributes().
hasAttribute(AttributeSet::FunctionIndex,
Attribute::OptimizeForSize))
- Threshold = OptSizeUnrollThreshold;
+ Threshold = UP.OptSizeThreshold;
// Find trip count and trip multiple if count is not available
unsigned TripCount = 0;
@@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
}
+
+ bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime;
+
// Use a default unroll-count if the user doesn't specify a value
// and the trip count is a run-time value. The default is different
// for run-time or compile-time trip count loops.
- unsigned Count = CurrentCount;
- if (UnrollRuntime && CurrentCount == 0 && TripCount == 0)
+ unsigned Count = UserCount ? CurrentCount : UP.Count;
+ if (Runtime && Count == 0 && TripCount == 0)
Count = UnrollRuntimeCount;
if (Count == 0) {
@@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
if (TripCount != 1 && Size > Threshold) {
DEBUG(dbgs() << " Too large to fully unroll with count: " << Count
<< " because size: " << Size << ">" << Threshold << "\n");
- if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) {
+ bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
+ if (!AllowPartial && !(Runtime && TripCount == 0)) {
DEBUG(dbgs() << " will not try to unroll partially because "
<< "-unroll-allow-partial not given\n");
return false;
@@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
while (Count != 0 && TripCount%Count != 0)
Count--;
}
- else if (UnrollRuntime) {
+ else if (Runtime) {
// Reduce unroll count to be a lower power-of-two value
while (Count != 0 && Size > Threshold) {
Count >>= 1;
@@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
}
// Unroll the loop.
- if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM))
+ if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM))
return false;
return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 59aff31..c4ebfd5 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -212,8 +212,6 @@ namespace {
Instruction *InsertPt);
void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
- void RemoveBlockIfDead(BasicBlock *BB,
- std::vector<Instruction*> &Worklist, Loop *l);
void RemoveLoopFromHierarchy(Loop *L);
bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
BasicBlock **LoopExit = 0);
@@ -946,114 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
++NumSimplify;
}
-/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
-/// information, and remove any dead successors it has.
-///
-void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
- std::vector<Instruction*> &Worklist,
- Loop *L) {
- if (pred_begin(BB) != pred_end(BB)) {
- // This block isn't dead, since an edge to BB was just removed, see if there
- // are any easy simplifications we can do now.
- if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- // If it has one pred, fold phi nodes in BB.
- while (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
- ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
-
- // If this is the header of a loop and the only pred is the latch, we now
- // have an unreachable loop.
- if (Loop *L = LI->getLoopFor(BB))
- if (loopHeader == BB && L->contains(Pred)) {
- // Remove the branch from the latch to the header block, this makes
- // the header dead, which will make the latch dead (because the header
- // dominates the latch).
- LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L);
- Pred->getTerminator()->eraseFromParent();
- new UnreachableInst(BB->getContext(), Pred);
-
- // The loop is now broken, remove it from LI.
- RemoveLoopFromHierarchy(L);
-
- // Reprocess the header, which now IS dead.
- RemoveBlockIfDead(BB, Worklist, L);
- return;
- }
-
- // If pred ends in a uncond branch, add uncond branch to worklist so that
- // the two blocks will get merged.
- if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
- if (BI->isUnconditional())
- Worklist.push_back(BI);
- }
- return;
- }
-
- DEBUG(dbgs() << "Nuking dead block: " << *BB);
-
- // Remove the instructions in the basic block from the worklist.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
- RemoveFromWorklist(I, Worklist);
-
- // Anything that uses the instructions in this basic block should have their
- // uses replaced with undefs.
- // If I is not void type then replaceAllUsesWith undef.
- // This allows ValueHandlers and custom metadata to adjust itself.
- if (!I->getType()->isVoidTy())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- }
-
- // If this is the edge to the header block for a loop, remove the loop and
- // promote all subloops.
- if (Loop *BBLoop = LI->getLoopFor(BB)) {
- if (BBLoop->getLoopLatch() == BB) {
- RemoveLoopFromHierarchy(BBLoop);
- if (currentLoop == BBLoop) {
- currentLoop = 0;
- redoLoop = false;
- }
- }
- }
-
- // Remove the block from the loop info, which removes it from any loops it
- // was in.
- LI->removeBlock(BB);
-
- // Remove phi node entries in successors for this block.
- TerminatorInst *TI = BB->getTerminator();
- SmallVector<BasicBlock*, 4> Succs;
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
- Succs.push_back(TI->getSuccessor(i));
- TI->getSuccessor(i)->removePredecessor(BB);
- }
-
- // Unique the successors, remove anything with multiple uses.
- array_pod_sort(Succs.begin(), Succs.end());
- Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
-
- // Remove the basic block, including all of the instructions contained in it.
- LPM->deleteSimpleAnalysisValue(BB, L);
- BB->eraseFromParent();
- // Remove successor blocks here that are not dead, so that we know we only
- // have dead blocks in this list. Nondead blocks have a way of becoming dead,
- // then getting removed before we revisit them, which is badness.
- //
- for (unsigned i = 0; i != Succs.size(); ++i)
- if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
- // One exception is loop headers. If this block was the preheader for a
- // loop, then we DO want to visit the loop so the loop gets deleted.
- // We know that if the successor is a loop header, that this loop had to
- // be the preheader: the case where this was the latch block was handled
- // above and headers can only have two predecessors.
- if (!LI->isLoopHeader(Succs[i])) {
- Succs.erase(Succs.begin()+i);
- --i;
- }
- }
-
- for (unsigned i = 0, e = Succs.size(); i != e; ++i)
- RemoveBlockIfDead(Succs[i], Worklist, L);
-}
-
/// RemoveLoopFromHierarchy - We have discovered that the specified loop has
/// become unwrapped, either because the backedge was deleted, or because the
/// edge into the header was removed. If the edge into the header from the
@@ -1262,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
continue;
}
- if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
- // Conditional branch. Turn it into an unconditional branch, then
- // remove dead blocks.
- continue; // FIXME: Enable.
-
- DEBUG(dbgs() << "Folded branch: " << *BI);
- BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
- BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
- DeadSucc->removePredecessor(BI->getParent(), true);
- Worklist.push_back(BranchInst::Create(LiveSucc, BI));
- LPM->deleteSimpleAnalysisValue(BI, L);
- BI->eraseFromParent();
- RemoveFromWorklist(BI, Worklist);
- ++NumSimplify;
-
- RemoveBlockIfDead(DeadSucc, Worklist, L);
- }
continue;
}
}
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 8f61ffd..9912d3d 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
// pessimize the llvm optimizer.
//
// Since we don't have perfect knowledge here, make some assumptions: assume
- // the maximum GPR width is the same size as the pointer size and assume that
- // this width can be stored. If so, check to see whether we will end up
- // actually reducing the number of stores used.
+ // the maximum GPR width is the same size as the largest legal integer
+ // size. If so, check to see whether we will end up actually reducing the
+ // number of stores used.
unsigned Bytes = unsigned(End-Start);
- unsigned NumPointerStores = Bytes/TD.getPointerSize();
+ unsigned MaxIntSize = TD.getLargestLegalIntTypeSize();
+ if (MaxIntSize == 0)
+ MaxIntSize = 1;
+ unsigned NumPointerStores = Bytes / MaxIntSize;
// Assume the remaining bytes if any are done a byte at a time.
- unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
+ unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize;
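+ // Worked example (illustrative only, not part of the heuristic itself):
+ // with an 8-byte largest legal integer type, a 10-byte range is modeled as
+ // one wide store plus two byte-wide stores.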
// If we will reduce the # stores (according to this heuristic), do the
// transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
diff --git a/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index de3f09c..15cee44 100644
--- a/lib/Target/Mips/MipsOptimizeMathLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -1,4 +1,4 @@
-//===---- MipsOptimizeMathLibCalls.cpp - Optimize math lib calls. ----===//
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,76 +7,60 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass does an IR transformation which enables the backend to emit native
-// math instructions.
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
//
//===----------------------------------------------------------------------===//
-#include "MipsTargetMachine.h"
+#define DEBUG_TYPE "partially-inline-libcalls"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
-static cl::opt<bool> DisableOpt("disable-mips-math-optimization",
- cl::init(false),
- cl::desc("MIPS: Disable math lib call "
- "optimization."), cl::Hidden);
-
namespace {
- class MipsOptimizeMathLibCalls : public FunctionPass {
+ class PartiallyInlineLibCalls : public FunctionPass {
public:
static char ID;
- MipsOptimizeMathLibCalls(MipsTargetMachine &TM_) :
- FunctionPass(ID), TM(TM_) {}
-
- virtual const char *getPassName() const {
- return "MIPS: Optimize calls to math library functions.";
+ PartiallyInlineLibCalls() :
+ FunctionPass(ID) {
+ initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
}
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-
virtual bool runOnFunction(Function &F);
private:
/// Optimize calls to sqrt.
bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
- BasicBlock &CurrBB,
- Function::iterator &BB);
-
- const TargetMachine &TM;
+ BasicBlock &CurrBB, Function::iterator &BB);
};
- char MipsOptimizeMathLibCalls::ID = 0;
+ char PartiallyInlineLibCalls::ID = 0;
}
-FunctionPass *llvm::createMipsOptimizeMathLibCalls(MipsTargetMachine &TM) {
- return new MipsOptimizeMathLibCalls(TM);
-}
+INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
-void MipsOptimizeMathLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetLibraryInfo>();
+ AU.addRequired<TargetTransformInfo>();
FunctionPass::getAnalysisUsage(AU);
}
-bool MipsOptimizeMathLibCalls::runOnFunction(Function &F) {
- if (DisableOpt)
- return false;
-
- const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
-
- if (Subtarget.inMips16Mode())
- return false;
-
+bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
bool Changed = false;
Function::iterator CurrBB;
- const TargetLibraryInfo *LibInfo = &getAnalysis<TargetLibraryInfo>();
-
+ TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+ const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
CurrBB = BB++;
@@ -88,25 +72,18 @@ bool MipsOptimizeMathLibCalls::runOnFunction(Function &F) {
if (!Call || !(CalledFunc = Call->getCalledFunction()))
continue;
- LibFunc::Func LibFunc;
- Attribute A = CalledFunc->getAttributes()
- .getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
-
- // Skip if function has "use-soft-float" attribute.
- if ((A.isStringAttribute() && (A.getValueAsString() == "true")) ||
- TM.Options.UseSoftFloat)
- continue;
-
// Skip if function either has local linkage or is not a known library
// function.
+ LibFunc::Func LibFunc;
if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !LibInfo->getLibFunc(CalledFunc->getName(), LibFunc))
+ !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
continue;
switch (LibFunc) {
case LibFunc::sqrtf:
case LibFunc::sqrt:
- if (optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
break;
continue;
default:
@@ -121,10 +98,10 @@ bool MipsOptimizeMathLibCalls::runOnFunction(Function &F) {
return Changed;
}
-bool MipsOptimizeMathLibCalls::optimizeSQRT(CallInst *Call,
- Function *CalledFunc,
- BasicBlock &CurrBB,
- Function::iterator &BB) {
+bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
+ Function *CalledFunc,
+ BasicBlock &CurrBB,
+ Function::iterator &BB) {
// There is no need to change the IR, since backend will emit sqrt
// instruction if the call has already been marked read-only.
if (Call->onlyReadsMemory())
@@ -173,3 +150,7 @@ bool MipsOptimizeMathLibCalls::optimizeSQRT(CallInst *Call,
BB = JoinBB;
return true;
}
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+ return new PartiallyInlineLibCalls();
+}
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 5c55143..9f3fc83 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -733,9 +733,9 @@ class AllocaPromoter : public LoadAndStorePromoter {
SmallVector<DbgValueInst *, 4> DVIs;
public:
- AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+ AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S,
AllocaInst &AI, DIBuilder &DIB)
- : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+ : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
void run(const SmallVectorImpl<Instruction*> &Insts) {
// Retain the debug information attached to the alloca for use when
@@ -762,9 +762,30 @@ public:
virtual bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction*> &Insts) const {
+ Value *Ptr;
if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getOperand(0) == &AI;
- return cast<StoreInst>(I)->getPointerOperand() == &AI;
+ Ptr = LI->getOperand(0);
+ else
+ Ptr = cast<StoreInst>(I)->getPointerOperand();
+
+ // Only used to detect cycles, which will be rare and quickly found as
+ // we're walking up a chain of defs rather than down through uses.
+ SmallPtrSet<Value *, 4> Visited;
+
+ do {
+ if (Ptr == &AI)
+ return true;
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr))
+ Ptr = BCI->getOperand(0);
+ else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
+ Ptr = GEPI->getPointerOperand();
+ else
+ return false;
+
+ } while (Visited.insert(Ptr));
+
+ return false;
}
virtual void updateDebugInfo(Instruction *Inst) const {
@@ -917,6 +938,7 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
AllocaSlices::const_iterator E,
uint64_t EndOffset) {
Type *Ty = 0;
+ bool IgnoreNonIntegralTypes = false;
for (AllocaSlices::const_iterator I = B; I != E; ++I) {
Use *U = I->getUse();
if (isa<IntrinsicInst>(*U->getUser()))
@@ -925,29 +947,37 @@ static Type *findCommonType(AllocaSlices::const_iterator B,
continue;
Type *UserTy = 0;
- if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
UserTy = LI->getType();
- else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
UserTy = SI->getValueOperand()->getType();
- else
- return 0; // Bail if we have weird uses.
+ } else {
+ IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
+ continue;
+ }
if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
// If the type is larger than the partition, skip it. We only encounter
// this for split integer operations where we want to use the type of the
- // entity causing the split.
- if (ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+ // entity causing the split. Also skip if the type is not a byte width
+ // multiple.
+ if (ITy->getBitWidth() % 8 != 0 ||
+ ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
continue;
// If we have found an integer type use covering the alloca, use that
- // regardless of the other types, as integers are often used for a
- // "bucket
- // of bits" type.
+ // regardless of the other types, as integers are often used for
+ // a "bucket of bits" type.
+ //
+ // NB: This *must* be the only return from inside the loop so that the
+ // order of slices doesn't impact the computed type.
return ITy;
+ } else if (IgnoreNonIntegralTypes) {
+ continue;
}
if (Ty && Ty != UserTy)
- return 0;
+ IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
Ty = UserTy;
}
@@ -1431,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
return false;
+ // We can convert pointers to integers and vice-versa. Same for vectors
+ // of pointers and integers.
+ OldTy = OldTy->getScalarType();
+ NewTy = NewTy->getScalarType();
if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
if (NewTy->isPointerTy() && OldTy->isPointerTy())
return true;
@@ -1449,21 +1483,53 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
/// two types for viability with this routine.
static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
- Type *Ty) {
- assert(canConvertValue(DL, V->getType(), Ty) &&
- "Value not convertable to type");
- if (V->getType() == Ty)
+ Type *NewTy) {
+ Type *OldTy = V->getType();
+ assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+ if (OldTy == NewTy)
return V;
- if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
- if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+
+ if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+ if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
if (NewITy->getBitWidth() > OldITy->getBitWidth())
return IRB.CreateZExt(V, NewITy);
- if (V->getType()->isIntegerTy() && Ty->isPointerTy())
- return IRB.CreateIntToPtr(V, Ty);
- if (V->getType()->isPointerTy() && Ty->isIntegerTy())
- return IRB.CreatePtrToInt(V, Ty);
- return IRB.CreateBitCast(V, Ty);
+ // See if we need inttoptr for this type pair. A cast involving both scalars
+ // and vectors requires an additional bitcast.
+ if (OldTy->getScalarType()->isIntegerTy() &&
+ NewTy->getScalarType()->isPointerTy()) {
+ // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+ if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+
+ // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+ if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+
+ return IRB.CreateIntToPtr(V, NewTy);
+ }
+
+ // See if we need ptrtoint for this type pair. A cast involving both scalars
+ // and vectors requires an additional bitcast.
+ if (OldTy->getScalarType()->isPointerTy() &&
+ NewTy->getScalarType()->isIntegerTy()) {
+ // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+ if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+
+ // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+ if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+
+ return IRB.CreatePtrToInt(V, NewTy);
+ }
+
+ return IRB.CreateBitCast(V, NewTy);
}
/// \brief Test whether the given slice use can be promoted to a vector.
@@ -3364,12 +3430,12 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
}
static void enqueueUsersInWorklist(Instruction &I,
- SmallVectorImpl<Use *> &UseWorklist,
- SmallPtrSet<Use *, 8> &VisitedUses) {
+ SmallVectorImpl<Instruction *> &Worklist,
+ SmallPtrSet<Instruction *, 8> &Visited) {
for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
++UI)
- if (VisitedUses.insert(&UI.getUse()))
- UseWorklist.push_back(&UI.getUse());
+ if (Visited.insert(cast<Instruction>(*UI)))
+ Worklist.push_back(cast<Instruction>(*UI));
}
/// \brief Promote the allocas, using the best available technique.
@@ -3388,7 +3454,7 @@ bool SROA::promoteAllocas(Function &F) {
if (DT && !ForceSSAUpdater) {
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, DL);
+ PromoteMemToReg(PromotableAllocas, *DT);
PromotableAllocas.clear();
return true;
}
@@ -3396,29 +3462,29 @@ bool SROA::promoteAllocas(Function &F) {
DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
SSAUpdater SSA;
DIBuilder DIB(*F.getParent());
- SmallVector<Instruction*, 64> Insts;
+ SmallVector<Instruction *, 64> Insts;
// We need a worklist to walk the uses of each alloca.
- SmallVector<Use *, 8> UseWorklist;
- SmallPtrSet<Use *, 8> VisitedUses;
+ SmallVector<Instruction *, 8> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
SmallVector<Instruction *, 32> DeadInsts;
for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
AllocaInst *AI = PromotableAllocas[Idx];
- UseWorklist.clear();
- VisitedUses.clear();
+ Insts.clear();
+ Worklist.clear();
+ Visited.clear();
- enqueueUsersInWorklist(*AI, UseWorklist, VisitedUses);
+ enqueueUsersInWorklist(*AI, Worklist, Visited);
- while (!UseWorklist.empty()) {
- Use *U = UseWorklist.pop_back_val();
- Instruction &I = *cast<Instruction>(U->getUser());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
// FIXME: Currently the SSAUpdater infrastructure doesn't reason about
// lifetime intrinsics and so we strip them (and the bitcasts+GEPs
// leading to them) here. Eventually it should use them to optimize the
// scalar values produced.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
II->getIntrinsicID() == Intrinsic::lifetime_end);
II->eraseFromParent();
@@ -3428,12 +3494,12 @@ bool SROA::promoteAllocas(Function &F) {
// Push the loads and stores we find onto the list. SROA will already
// have validated that all loads and stores are viable candidates for
// promotion.
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
assert(LI->getType() == AI->getAllocatedType());
Insts.push_back(LI);
continue;
}
- if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
assert(SI->getValueOperand()->getType() == AI->getAllocatedType());
Insts.push_back(SI);
continue;
@@ -3442,11 +3508,10 @@ bool SROA::promoteAllocas(Function &F) {
// For everything else, we know that only no-op bitcasts and GEPs will
// make it this far, just recurse through them and recall them for later
// removal.
- DeadInsts.push_back(&I);
- enqueueUsersInWorklist(I, UseWorklist, VisitedUses);
+ DeadInsts.push_back(I);
+ enqueueUsersInWorklist(*I, Worklist, Visited);
}
AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
- Insts.clear();
while (!DeadInsts.empty())
DeadInsts.pop_back_val()->eraseFromParent();
AI->eraseFromParent();
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
new file mode 100644
index 0000000..9bcd702
--- /dev/null
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -0,0 +1,479 @@
+//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleProfileLoader transformation. This pass
+// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
+// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
+// profile information in the given profile.
+//
+// This pass generates branch weight annotations on the IR:
+//
+// - prof: Represents branch weights. This annotation is added to branches
+// to indicate the weights of each edge coming out of the branch.
+// The weight of each edge is the weight of the target block for
+// that edge. The weight of a block B is computed as the maximum
+// number of samples found in B.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sample-profile"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+// Command line option to specify the file to read samples from. This is
+// mainly used for debugging.
+static cl::opt<std::string> SampleProfileFile(
+ "sample-profile-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+
+namespace {
+/// \brief Sample-based profile reader.
+///
+/// Each profile contains sample counts for all the functions
+/// executed. Inside each function, statements are annotated with the
+/// collected samples on all the instructions associated with that
+/// statement.
+///
+/// For this to produce meaningful data, the program needs to be
+/// compiled with some debug information (at minimum, line numbers:
+/// -gline-tables-only). Otherwise, it will be impossible to match IR
+/// instructions to the line numbers collected by the profiler.
+///
+/// From the profile file, we are interested in collecting the
+/// following information:
+///
+/// * A list of functions included in the profile (mangled names).
+///
+/// * For each function F:
+/// 1. The total number of samples collected in F.
+///
+/// 2. The samples collected at each line in F. To provide some
+/// protection against source code shuffling, line numbers should
+/// be relative to the start of the function.
+class SampleProfile {
+public:
+ SampleProfile(StringRef F) : Profiles(0), Filename(F) {}
+
+ void dump();
+ void loadText();
+ void loadNative() { llvm_unreachable("not implemented"); }
+ bool emitAnnotations(Function &F);
+ void printFunctionProfile(raw_ostream &OS, StringRef FName);
+ void dumpFunctionProfile(StringRef FName);
+
+protected:
+ typedef DenseMap<uint32_t, uint32_t> BodySampleMap;
+ typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap;
+
+ /// \brief Representation of the runtime profile for a function.
+ ///
+ /// This data structure contains the runtime profile for a given
+ /// function. It contains the total number of samples collected
+ /// in the function and a map of samples collected in every statement.
+ struct FunctionProfile {
+ /// \brief Total number of samples collected inside this function.
+ ///
+ /// Samples are cumulative, they include all the samples collected
+ /// inside this function and all its inlined callees.
+ unsigned TotalSamples;
+
+ /// \brief Total number of samples collected at the head of the function.
+ unsigned TotalHeadSamples;
+
+ /// \brief Map line offsets to collected samples.
+ ///
+ /// Each entry in this map contains the number of samples
+ /// collected at the corresponding line offset. All line locations
+ /// are an offset from the start of the function.
+ BodySampleMap BodySamples;
+
+ /// \brief Map basic blocks to their computed weights.
+ ///
+ /// The weight of a basic block is defined to be the maximum
+ /// of all the instruction weights in that block.
+ BlockWeightMap BlockWeights;
+ };
+
+ uint32_t getInstWeight(Instruction &I, unsigned FirstLineno,
+ BodySampleMap &BodySamples);
+ uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+ BodySampleMap &BodySamples);
+
+ /// \brief Map every function to its associated profile.
+ ///
+ /// The profile of every function executed at runtime is collected
+ /// in the structure FunctionProfile. This maps function objects
+ /// to their corresponding profiles.
+ StringMap<FunctionProfile> Profiles;
+
+ /// \brief Path name to the file holding the profile data.
+ ///
+ /// The format of this file is defined by each profiler
+ /// independently. If possible, the profiler should have a text
+ /// version of the profile format to be used in constructing test
+ /// cases and debugging.
+ StringRef Filename;
+};
+
+/// \brief Loader class for text-based profiles.
+///
+/// This class defines a simple interface to read text files containing
+/// profiles. It keeps track of line number information and location of
+/// the file pointer. Users of this class are responsible for actually
+/// parsing the lines returned by the readLine function.
+///
+/// TODO - This does not really belong here. It is a generic text file
+/// reader. It should be moved to the Support library and made more general.
+class ExternalProfileTextLoader {
+public:
+ ExternalProfileTextLoader(StringRef F) : Filename(F) {
+ error_code EC;
+ EC = MemoryBuffer::getFile(Filename, Buffer);
+ if (EC)
+ report_fatal_error("Could not open profile file " + Filename + ": " +
+ EC.message());
+ FP = Buffer->getBufferStart();
+ Lineno = 0;
+ }
+
+ /// \brief Read a line from the mapped file.
+ StringRef readLine() {
+ size_t Length = 0;
+ const char *start = FP;
+ while (FP != Buffer->getBufferEnd() && *FP != '\n') {
+ Length++;
+ FP++;
+ }
+ if (FP != Buffer->getBufferEnd())
+ FP++;
+ Lineno++;
+ return StringRef(start, Length);
+ }
+
+ /// \brief Return true if we've reached EOF.
+ bool atEOF() const { return FP == Buffer->getBufferEnd(); }
+
+ /// \brief Report a parse error message and stop compilation.
+ void reportParseError(Twine Msg) const {
+ report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n");
+ }
+
+private:
+ /// \brief Memory buffer holding the text file.
+ OwningPtr<MemoryBuffer> Buffer;
+
+ /// \brief Current position into the memory buffer.
+ const char *FP;
+
+ /// \brief Current line number.
+ int64_t Lineno;
+
+ /// \brief Path name of the profile file.
+ StringRef Filename;
+};
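+
+// Illustrative use of the loader (this simply mirrors how loadText() below
+// drives it; it is not additional functionality):
+//
+//   ExternalProfileTextLoader Loader("profile.txt");
+//   while (!Loader.atEOF()) {
+//     StringRef Line = Loader.readLine();
+//     // ... parse Line ...
+//   }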
+
+/// \brief Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
+public:
+ // Class identification, replacement for typeinfo
+ static char ID;
+
+ SampleProfileLoader(StringRef Name = SampleProfileFile)
+ : FunctionPass(ID), Profiler(0), Filename(Name) {
+ initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual bool doInitialization(Module &M);
+
+ void dump() { Profiler->dump(); }
+
+ virtual const char *getPassName() const { return "Sample profile pass"; }
+
+ virtual bool runOnFunction(Function &F);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ }
+
+protected:
+ /// \brief Profile reader object.
+ OwningPtr<SampleProfile> Profiler;
+
+ /// \brief Name of the profile file to load.
+ StringRef Filename;
+};
+}
+
+/// \brief Print the function profile for \p FName on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param FName Name of the function to print.
+void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) {
+ FunctionProfile FProfile = Profiles[FName];
+ OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", "
+ << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size()
+ << " sampled lines\n";
+ for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(),
+ SE = FProfile.BodySamples.end();
+ SI != SE; ++SI)
+ OS << "\tline offset: " << SI->first
+ << ", number of samples: " << SI->second << "\n";
+ OS << "\n";
+}
+
+/// \brief Dump the function profile for \p FName.
+///
+/// \param FName Name of the function to print.
+void SampleProfile::dumpFunctionProfile(StringRef FName) {
+ printFunctionProfile(dbgs(), FName);
+}
+
+/// \brief Dump all the function profiles found.
+void SampleProfile::dump() {
+ for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(),
+ E = Profiles.end();
+ I != E; ++I)
+ dumpFunctionProfile(I->getKey());
+}
+
+/// \brief Load samples from a text file.
+///
+/// The file is divided in two segments:
+///
+/// Symbol table (represented with the string "symbol table")
+/// Number of symbols in the table
+/// symbol 1
+/// symbol 2
+/// ...
+/// symbol N
+///
+/// Function body profiles
+/// function1:total_samples:total_head_samples:number_of_locations
+/// location_offset_1: number_of_samples
+/// location_offset_2: number_of_samples
+/// ...
+/// location_offset_N: number_of_samples
+///
+/// Function names must be mangled in order for the profile loader to
+/// match them in the current translation unit.
+///
+/// Since this is a flat profile, a function that shows up more than
+/// once gets all its samples aggregated across all its instances.
+/// TODO - flat profiles are too imprecise to provide good optimization
+/// opportunities. Convert them to a context-sensitive profile.
+///
+/// This textual representation is useful to generate unit tests and
+/// for debugging purposes, but it should not be used to generate
+/// profiles for large programs, as the representation is extremely
+/// inefficient.
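+///
+/// As an illustration only (a made-up example, not something shipped with
+/// this pass), a profile for one function with the hypothetical mangled name
+/// _Z3foov, 1200 total samples, 10 head samples and two sampled lines could
+/// look like:
+///
+///   symbol table
+///   1
+///   _Z3foov
+///   _Z3foov:1200:10:2
+///   1: 800
+///   5: 400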
+void SampleProfile::loadText() {
+ ExternalProfileTextLoader Loader(Filename);
+
+ // Read the symbol table.
+ StringRef Line = Loader.readLine();
+ if (Line != "symbol table")
+ Loader.reportParseError("Expected 'symbol table', found " + Line);
+ int NumSymbols;
+ Line = Loader.readLine();
+ if (Line.getAsInteger(10, NumSymbols))
+ Loader.reportParseError("Expected a number, found " + Line);
+ for (int I = 0; I < NumSymbols; I++) {
+ StringRef FName = Loader.readLine();
+ FunctionProfile &FProfile = Profiles[FName];
+ FProfile.BodySamples.clear();
+ FProfile.TotalSamples = 0;
+ FProfile.TotalHeadSamples = 0;
+ }
+
+ // Read the profile of each function. Since each function may be
+ // mentioned more than once, and we are collecting flat profiles,
+ // accumulate samples as we parse them.
+ Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$");
+ Regex LineSample("^([0-9]+): ([0-9]+)$");
+ while (!Loader.atEOF()) {
+ SmallVector<StringRef, 4> Matches;
+ Line = Loader.readLine();
+ if (!HeadRE.match(Line, &Matches))
+ Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " +
+ Line);
+ assert(Matches.size() == 5);
+ StringRef FName = Matches[1];
+ unsigned NumSamples, NumHeadSamples, NumSampledLines;
+ Matches[2].getAsInteger(10, NumSamples);
+ Matches[3].getAsInteger(10, NumHeadSamples);
+ Matches[4].getAsInteger(10, NumSampledLines);
+ FunctionProfile &FProfile = Profiles[FName];
+ FProfile.TotalSamples += NumSamples;
+ FProfile.TotalHeadSamples += NumHeadSamples;
+ BodySampleMap &SampleMap = FProfile.BodySamples;
+ unsigned I;
+ for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) {
+ Line = Loader.readLine();
+ if (!LineSample.match(Line, &Matches))
+ Loader.reportParseError("Expected 'NUM: NUM', found " + Line);
+ assert(Matches.size() == 3);
+ unsigned LineOffset, NumSamples;
+ Matches[1].getAsInteger(10, LineOffset);
+ Matches[2].getAsInteger(10, NumSamples);
+ SampleMap[LineOffset] += NumSamples;
+ }
+
+ if (I < NumSampledLines)
+ Loader.reportParseError("Unexpected end of file");
+ }
+}
+
+/// \brief Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use \p FirstLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using \p BodySamples.
+///
+/// \param Inst Instruction to query.
+/// \param FirstLineno Line number of the first instruction in the function.
+/// \param BodySamples Map of relative source line locations to samples.
+///
+/// \returns The profiled weight of \p Inst.
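+///
+/// For example (illustrative numbers only): if the function's first
+/// instruction is at source line 10 and \p Inst is at source line 12, the
+/// lookup key is 12 - 10 + 1 = 3, and the weight returned is the sample
+/// count recorded for offset 3 (or 0 if none was recorded).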
+uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno,
+ BodySampleMap &BodySamples) {
+ unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1;
+ return BodySamples.lookup(LOffset);
+}
+
+/// \brief Compute the weight of a basic block.
+///
+/// The weight of basic block \p B is the maximum weight of all the
+/// instructions in B.
+///
+/// \param B The basic block to query.
+/// \param FirstLineno The line number for the first line in the
+/// function holding B.
+/// \param BodySamples The map containing all the samples collected in that
+/// function.
+///
+/// \returns The computed weight of B.
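+///
+/// For example (illustrative numbers only): a block whose instructions
+/// carry 5, 12 and 0 samples gets a weight of 12.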
+uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+ BodySampleMap &BodySamples) {
+ // If we've computed B's weight before, return it.
+ Function *F = B->getParent();
+ FunctionProfile &FProfile = Profiles[F->getName()];
+ std::pair<BlockWeightMap::iterator, bool> Entry =
+ FProfile.BlockWeights.insert(std::make_pair(B, 0));
+ if (!Entry.second)
+ return Entry.first->second;
+
+ // Otherwise, compute and cache B's weight.
+ uint32_t Weight = 0;
+ for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
+ uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples);
+ if (InstWeight > Weight)
+ Weight = InstWeight;
+ }
+ Entry.first->second = Weight;
+ return Weight;
+}
+
+/// \brief Generate branch weight metadata for all branches in \p F.
+///
+/// For every branch instruction B in \p F, we compute the weight of the
+/// target block for each of the edges out of B. This is the weight
+/// that we associate with that branch.
+///
+/// TODO - This weight assignment will most likely be wrong if the
+/// target branch has more than two predecessors. This needs to be done
+/// using some form of flow propagation.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on B using the computed values.
+///
+/// \param F The function to query.
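+///
+/// As a sketch of the result (assuming the usual MDBuilder branch-weight
+/// encoding), a two-way branch whose successors have weights 80 and 20 is
+/// annotated roughly as:
+///
+///   br i1 %cmp, label %then, label %else, !prof !0
+///   ...
+///   !0 = metadata !{metadata !"branch_weights", i32 80, i32 20}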
+bool SampleProfile::emitAnnotations(Function &F) {
+ bool Changed = false;
+ FunctionProfile &FProfile = Profiles[F.getName()];
+ unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine();
+ MDBuilder MDB(F.getContext());
+
+ // Clear the block weights cache.
+ FProfile.BlockWeights.clear();
+
+ // When we find a branch instruction: For each edge E out of the branch,
+ // the weight of E is the weight of the target block.
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ BasicBlock *B = I;
+ TerminatorInst *TI = B->getTerminator();
+ if (TI->getNumSuccessors() == 1)
+ continue;
+ if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+ continue;
+
+ SmallVector<uint32_t, 4> Weights;
+ unsigned NSuccs = TI->getNumSuccessors();
+ for (unsigned I = 0; I < NSuccs; ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ uint32_t Weight =
+ computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples);
+ Weights.push_back(Weight);
+ }
+
+ TI->setMetadata(llvm::LLVMContext::MD_prof,
+ MDB.createBranchWeights(Weights));
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+char SampleProfileLoader::ID = 0;
+INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader",
+ false, false)
+
+bool SampleProfileLoader::runOnFunction(Function &F) {
+ return Profiler->emitAnnotations(F);
+}
+
+bool SampleProfileLoader::doInitialization(Module &M) {
+ Profiler.reset(new SampleProfile(Filename));
+ Profiler->loadText();
+ return true;
+}
+
+FunctionPass *llvm::createSampleProfileLoaderPass() {
+ return new SampleProfileLoader(SampleProfileFile);
+}
+
+FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) {
+ return new SampleProfileLoader(Name);
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 758334d..857597e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
/// ScalarOpts library.
void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeADCEPass(Registry);
- initializeBlockPlacementPass(Registry);
+ initializeSampleProfileLoaderPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeConstantPropagationPass(Registry);
initializeCorrelatedValuePropagationPass(Registry);
@@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopInstSimplifyPass(Registry);
initializeLoopRotatePass(Registry);
initializeLoopStrengthReducePass(Registry);
+ initializeLoopRerollPass(Registry);
initializeLoopUnrollPass(Registry);
initializeLoopUnswitchPass(Registry);
initializeLoopIdiomRecognizePass(Registry);
initializeLowerAtomicPass(Registry);
initializeLowerExpectIntrinsicPass(Registry);
initializeMemCpyOptPass(Registry);
+ initializePartiallyInlineLibCallsPass(Registry);
initializeReassociatePass(Registry);
initializeRegToMemPass(Registry);
initializeSCCPPass(Registry);
@@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopRotatePass());
}
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRerollPass());
+}
+
void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopUnrollPass());
}
@@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createMemCpyOptPass());
}
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPromoteMemoryToRegisterPass());
}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 73b2edf..57b290e 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
else if (SV->getType()->isPointerTy())
- SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()));
+ SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType()));
// Zero extend or truncate the value if needed.
if (SV->getType() != AllocaType) {
@@ -1426,7 +1426,7 @@ bool SROA::performPromotion(Function &F) {
if (Allocas.empty()) break;
if (HasDomTree)
- PromoteMemToReg(Allocas, *DT, TD);
+ PromoteMemToReg(Allocas, *DT);
else {
SSAUpdater SSA;
for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 6d05640..8371f6d 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -66,161 +66,6 @@ FunctionPass *llvm::createCFGSimplificationPass() {
return new CFGSimplifyPass();
}
-/// changeToUnreachable - Insert an unreachable instruction before the specified
-/// instruction, making it and the rest of the code in the block dead.
-static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
- BasicBlock *BB = I->getParent();
- // Loop over all of the successors, removing BB's entry from any PHI
- // nodes.
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
- (*SI)->removePredecessor(BB);
-
- // Insert a call to llvm.trap right before this. This turns the undefined
- // behavior into a hard fail instead of falling through into random code.
- if (UseLLVMTrap) {
- Function *TrapFn =
- Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
- CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
- CallTrap->setDebugLoc(I->getDebugLoc());
- }
- new UnreachableInst(I->getContext(), I);
-
- // All instructions after this are dead.
- BasicBlock::iterator BBI = I, BBE = BB->end();
- while (BBI != BBE) {
- if (!BBI->use_empty())
- BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
- BB->getInstList().erase(BBI++);
- }
-}
-
-/// changeToCall - Convert the specified invoke into a normal call.
-static void changeToCall(InvokeInst *II) {
- SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
- CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
- NewCall->takeName(II);
- NewCall->setCallingConv(II->getCallingConv());
- NewCall->setAttributes(II->getAttributes());
- NewCall->setDebugLoc(II->getDebugLoc());
- II->replaceAllUsesWith(NewCall);
-
- // Follow the call by a branch to the normal destination.
- BranchInst::Create(II->getNormalDest(), II);
-
- // Update PHI nodes in the unwind destination
- II->getUnwindDest()->removePredecessor(II->getParent());
- II->eraseFromParent();
-}
-
-static bool markAliveBlocks(BasicBlock *BB,
- SmallPtrSet<BasicBlock*, 128> &Reachable) {
-
- SmallVector<BasicBlock*, 128> Worklist;
- Worklist.push_back(BB);
- Reachable.insert(BB);
- bool Changed = false;
- do {
- BB = Worklist.pop_back_val();
-
- // Do a quick scan of the basic block, turning any obviously unreachable
- // instructions into LLVM unreachable insts. The instruction combining pass
- // canonicalizes unreachable insts into stores to null or undef.
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
- if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
- if (CI->doesNotReturn()) {
- // If we found a call to a no-return function, insert an unreachable
- // instruction after it. Make sure there isn't *already* one there
- // though.
- ++BBI;
- if (!isa<UnreachableInst>(BBI)) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(BBI, false);
- Changed = true;
- }
- break;
- }
- }
-
- // Store to undef and store to null are undefined and used to signal that
- // they should be changed to unreachable by passes that can't modify the
- // CFG.
- if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
- // Don't touch volatile stores.
- if (SI->isVolatile()) continue;
-
- Value *Ptr = SI->getOperand(1);
-
- if (isa<UndefValue>(Ptr) ||
- (isa<ConstantPointerNull>(Ptr) &&
- SI->getPointerAddressSpace() == 0)) {
- changeToUnreachable(SI, true);
- Changed = true;
- break;
- }
- }
- }
-
- // Turn invokes that call 'nounwind' functions into ordinary calls.
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
- Value *Callee = II->getCalledValue();
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- changeToUnreachable(II, true);
- Changed = true;
- } else if (II->doesNotThrow()) {
- if (II->use_empty() && II->onlyReadsMemory()) {
- // jump to the normal destination branch.
- BranchInst::Create(II->getNormalDest(), II);
- II->getUnwindDest()->removePredecessor(II->getParent());
- II->eraseFromParent();
- } else
- changeToCall(II);
- Changed = true;
- }
- }
-
- Changed |= ConstantFoldTerminator(BB, true);
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
- if (Reachable.insert(*SI))
- Worklist.push_back(*SI);
- } while (!Worklist.empty());
- return Changed;
-}
-
-/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
-/// if they are in a dead cycle. Return true if a change was made, false
-/// otherwise.
-static bool removeUnreachableBlocksFromFn(Function &F) {
- SmallPtrSet<BasicBlock*, 128> Reachable;
- bool Changed = markAliveBlocks(F.begin(), Reachable);
-
- // If there are unreachable blocks in the CFG...
- if (Reachable.size() == F.size())
- return Changed;
-
- assert(Reachable.size() < F.size());
- NumSimpl += F.size()-Reachable.size();
-
- // Loop over all of the basic blocks that are not reachable, dropping all of
- // their internal references...
- for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
- if (Reachable.count(BB))
- continue;
-
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
- if (Reachable.count(*SI))
- (*SI)->removePredecessor(BB);
- BB->dropAllReferences();
- }
-
- for (Function::iterator I = ++F.begin(); I != F.end();)
- if (!Reachable.count(I))
- I = F.getBasicBlockList().erase(I);
- else
- ++I;
-
- return true;
-}
-
/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
/// node) return blocks, merge them together to promote recursive block merging.
static bool mergeEmptyReturnBlocks(Function &F) {
@@ -325,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
bool CFGSimplifyPass::runOnFunction(Function &F) {
const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
- bool EverChanged = removeUnreachableBlocksFromFn(F);
+ bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
EverChanged |= iterativelySimplifyCFG(F, TTI, TD);
@@ -333,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
if (!EverChanged) return false;
// iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
- // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
+ // removeUnreachableBlocks is needed to nuke them, which means we should
// iterate between the two optimizations. We structure the code like this to
// avoid reruning iterativelySimplifyCFG if the second pass of
- // removeUnreachableBlocksFromFn doesn't do anything.
- if (!removeUnreachableBlocksFromFn(F))
+ // removeUnreachableBlocks doesn't do anything.
+ if (!removeUnreachableBlocks(F))
return true;
do {
EverChanged = iterativelySimplifyCFG(F, TTI, TD);
- EverChanged |= removeUnreachableBlocksFromFn(F);
+ EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
return true;
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index bb6f163..5045ff8 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -231,7 +231,7 @@ public:
StructurizeCFG() :
RegionPass(ID) {
- initializeRegionInfoPass(*PassRegistry::getPassRegistry());
+ initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
}
using Pass::doInitialization;
@@ -244,6 +244,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTree>();
AU.addPreserved<DominatorTree>();
RegionPass::getAnalysisUsage(AU);
@@ -256,6 +257,7 @@ char StructurizeCFG::ID = 0;
INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
false, false)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(RegionInfo)
INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
@@ -321,21 +323,32 @@ Value *StructurizeCFG::invert(Value *Condition) {
if (match(Condition, m_Not(m_Value(Condition))))
return Condition;
- // Third: Check all the users for an invert
- BasicBlock *Parent = cast<Instruction>(Condition)->getParent();
- for (Value::use_iterator I = Condition->use_begin(),
- E = Condition->use_end(); I != E; ++I) {
+ if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+ // Third: Check all the users for an invert
+ BasicBlock *Parent = Inst->getParent();
+ for (Value::use_iterator I = Condition->use_begin(),
+ E = Condition->use_end(); I != E; ++I) {
- Instruction *User = dyn_cast<Instruction>(*I);
- if (!User || User->getParent() != Parent)
- continue;
+ Instruction *User = dyn_cast<Instruction>(*I);
+ if (!User || User->getParent() != Parent)
+ continue;
+
+ if (match(*I, m_Not(m_Specific(Condition))))
+ return *I;
+ }
- if (match(*I, m_Not(m_Specific(Condition))))
- return *I;
+ // Last option: Create a new instruction
+ return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
}
- // Last option: Create a new instruction
- return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+ if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+ BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+ return BinaryOperator::CreateNot(Condition,
+ Arg->getName() + ".inv",
+ EntryBlock.getTerminator());
+ }
+
+ llvm_unreachable("Unhandled condition to invert");
}
/// \brief Build the condition for one edge
@@ -766,6 +779,20 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed,
handleLoops(false, LoopEnd);
}
+ // If the start of the loop is the entry block, we can't branch to it so
+ // insert a new dummy entry block.
+ Function *LoopFunc = LoopStart->getParent();
+ if (LoopStart == &LoopFunc->getEntryBlock()) {
+ LoopStart->setName("entry.orig");
+
+ BasicBlock *NewEntry =
+ BasicBlock::Create(LoopStart->getContext(),
+ "entry",
+ LoopFunc,
+ LoopStart);
+ BranchInst::Create(LoopStart, NewEntry);
+ }
+
// Create an extra loop end node
LoopEnd = needPrefix(false);
BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
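
// A small illustrative sketch (mine, not the patch) of the Argument case that
// invert() now handles: an Argument has no defining instruction, so the
// earliest legal place to materialise its negation is just before the entry
// block's terminator. The helper name is hypothetical.
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Value *invertArgument(Argument *Arg) {
  BasicBlock &Entry = Arg->getParent()->getEntryBlock();
  return BinaryOperator::CreateNot(Arg, Arg->getName() + ".inv",
                                   Entry.getTerminator());
}
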
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index e17a416..12de9ee 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -248,7 +248,6 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
// If the edge isn't critical, then BB has a single successor or Succ has a
// single pred. Split the block.
- BasicBlock::iterator SplitPoint;
if (BasicBlock *SP = Succ->getSinglePredecessor()) {
// If the successor only has a single pred, split the top of the successor
// block.
@@ -401,8 +400,12 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
// If all incoming values for the new PHI would be the same, just don't
// make a new PHI. Instead, just remove the incoming values from the old
// PHI.
- for (unsigned i = 0, e = Preds.size(); i != e; ++i)
- PN->removeIncomingValue(Preds[i], false);
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // Explicitly check the BB index here to handle duplicates in Preds.
+ int Idx = PN->getBasicBlockIndex(Preds[i]);
+ if (Idx >= 0)
+ PN->removeIncomingValue(Idx, false);
+ }
} else {
// If the values coming into the block are not the same, we need a PHI.
// Create the new PHI node, insert it into NewBB at the end of the block
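
// Hypothetical helper (not in the patch) showing the defensive pattern the
// UpdatePHINodes hunk above adopts: when the same predecessor can appear more
// than once in Preds, look the index up first, because
// PHINode::removeIncomingValue(BB, ...) asserts once the block is no longer an
// incoming block.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void dropIncomingFrom(PHINode *PN, ArrayRef<BasicBlock *> Preds) {
  for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
    int Idx = PN->getBasicBlockIndex(Preds[i]);
    if (Idx >= 0)                                  // already removed? skip it
      PN->removeIncomingValue(Idx, /*DeletePHIIfEmpty=*/false);
  }
}
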
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 8f3ff96..0e7f7f7 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -22,7 +22,6 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
@@ -45,7 +44,6 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<DominatorTree>();
AU.addPreserved<LoopInfo>();
- AU.addPreserved<ProfileInfo>();
// No loop canonicalization guarantees are broken by this pass.
AU.addPreservedID(LoopSimplifyID);
@@ -213,10 +211,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
- ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
// If we have nothing to update, just return.
- if (DT == 0 && LI == 0 && PI == 0)
+ if (DT == 0 && LI == 0)
return NewBB;
// Now update analysis information. Since the only predecessor of NewBB is
@@ -369,9 +366,5 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
}
}
- // Update ProfileInfo if it is around.
- if (PI)
- PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges);
-
return NewBB;
}
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 3648fd6..5afd6b8 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMTransformUtils
CmpInstAnalysis.cpp
CodeExtractor.cpp
DemoteRegToStack.cpp
+ GlobalStatus.cpp
InlineFunction.cpp
InstructionNamer.cpp
IntegerDivision.cpp
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 82013f9..6f00864 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -665,8 +665,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
TheSwitch->setCondition(call);
TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
// Remove redundant case
- SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1);
- TheSwitch->removeCase(ToBeRemoved);
+ TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
break;
}
}
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 9cbe15d..1da226b 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -266,8 +266,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
BasicBlock *CB;
BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator());
bool Iteration = true;
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ IRBuilder<>::InsertPointGuard Guard(Builder);
Value *PC = PBI->getCondition();
do {
@@ -298,7 +297,6 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
new UnreachableInst(CB->getContext(), CB);
} while (Iteration);
- Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
return true;
}
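
// A rough illustration (not from the commit) of the RAII helper the hunk above
// switches to: InsertPointGuard records the builder's insertion point and debug
// location on construction and restores both when it goes out of scope,
// replacing the manual SaveInsertBB/SaveInsertPt bookkeeping.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static void emitAtTemporaryPoint(IRBuilder<> &Builder, BasicBlock *BB) {
  IRBuilder<>::InsertPointGuard Guard(Builder); // saves the current point
  Builder.SetInsertPoint(BB);                   // move somewhere else
  // ... create instructions at the temporary location ...
}                                               // destructor restores the point
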
@@ -372,7 +370,7 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
/// Check whether \param BB is the merge block of a if-region. If yes, check
/// whether there exists an adjacent if-region upstream, the two if-regions
-/// contain identical instuctions and can be legally merged. \returns true if
+/// contain identical instructions and can be legally merged. \returns true if
/// the two if-regions are merged.
///
/// From:
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
new file mode 100644
index 0000000..5f0a563
--- /dev/null
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -0,0 +1,183 @@
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+/// Return the stronger of the two ordering. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+ if (X == Acquire && Y == Release)
+ return AcquireRelease;
+ if (Y == Acquire && X == Release)
+ return AcquireRelease;
+ return (AtomicOrdering)std::max(X, Y);
+}
+
+/// It is safe to destroy a constant iff it is only used by constants itself.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+ if (isa<GlobalValue>(C))
+ return false;
+
+ for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
+ ++UI)
+ if (const Constant *CU = dyn_cast<Constant>(*UI)) {
+ if (!isSafeToDestroyConstant(CU))
+ return false;
+ } else
+ return false;
+ return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+ SmallPtrSet<const PHINode *, 16> &PhiUsers) {
+ for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+ ++UI) {
+ const User *U = *UI;
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ GS.HasNonInstructionUser = true;
+
+ // If the result of the constantexpr isn't pointer type, then we won't
+ // know to expect it in various places. Just reject early.
+ if (!isa<PointerType>(CE->getType()))
+ return true;
+
+ if (analyzeGlobalAux(CE, GS, PhiUsers))
+ return true;
+ } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ if (!GS.HasMultipleAccessingFunctions) {
+ const Function *F = I->getParent()->getParent();
+ if (GS.AccessingFunction == 0)
+ GS.AccessingFunction = F;
+ else if (GS.AccessingFunction != F)
+ GS.HasMultipleAccessingFunctions = true;
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ GS.IsLoaded = true;
+ // Don't hack on volatile loads.
+ if (LI->isVolatile())
+ return true;
+ GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Don't allow a store OF the address, only stores TO the address.
+ if (SI->getOperand(0) == V)
+ return true;
+
+ // Don't hack on volatile stores.
+ if (SI->isVolatile())
+ return true;
+
+ GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+ // If this is a direct store to the global (i.e., the global is a scalar
+ // value, not an aggregate), keep more specific information about
+ // stores.
+ if (GS.StoredType != GlobalStatus::Stored) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+ Value *StoredVal = SI->getOperand(0);
+
+ if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+ if (C->isThreadDependent()) {
+ // The stored value changes between threads; don't track it.
+ return true;
+ }
+ }
+
+ if (StoredVal == GV->getInitializer()) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (isa<LoadInst>(StoredVal) &&
+ cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (GS.StoredType < GlobalStatus::StoredOnce) {
+ GS.StoredType = GlobalStatus::StoredOnce;
+ GS.StoredOnceValue = StoredVal;
+ } else if (GS.StoredType == GlobalStatus::StoredOnce &&
+ GS.StoredOnceValue == StoredVal) {
+ // noop.
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ }
+ } else if (isa<BitCastInst>(I)) {
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (isa<GetElementPtrInst>(I)) {
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (isa<SelectInst>(I)) {
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+ // PHI nodes we can check just like select or GEP instructions, but we
+ // have to be careful about infinite recursion.
+ if (PhiUsers.insert(PN)) // Not already visited.
+ if (analyzeGlobalAux(I, GS, PhiUsers))
+ return true;
+ } else if (isa<CmpInst>(I)) {
+ GS.IsCompared = true;
+ } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+ if (MTI->isVolatile())
+ return true;
+ if (MTI->getArgOperand(0) == V)
+ GS.StoredType = GlobalStatus::Stored;
+ if (MTI->getArgOperand(1) == V)
+ GS.IsLoaded = true;
+ } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+ assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+ if (MSI->isVolatile())
+ return true;
+ GS.StoredType = GlobalStatus::Stored;
+ } else if (ImmutableCallSite C = I) {
+ if (!C.isCallee(UI))
+ return true;
+ GS.IsLoaded = true;
+ } else {
+ return true; // Any other non-load instruction might take address!
+ }
+ } else if (const Constant *C = dyn_cast<Constant>(U)) {
+ GS.HasNonInstructionUser = true;
+ // We might have a dead and dangling constant hanging off of here.
+ if (!isSafeToDestroyConstant(C))
+ return true;
+ } else {
+ GS.HasNonInstructionUser = true;
+ // Otherwise must be some other user.
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
+ SmallPtrSet<const PHINode *, 16> PhiUsers;
+ return analyzeGlobalAux(V, GS, PhiUsers);
+}
+
+GlobalStatus::GlobalStatus()
+ : IsCompared(false), IsLoaded(false), StoredType(NotStored),
+ StoredOnceValue(0), AccessingFunction(0),
+ HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
+ Ordering(NotAtomic) {}
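
// A speculative usage sketch (not part of the commit) of how a client such as
// GlobalOpt might consume the new helper. Field and enum names follow the
// llvm/Transforms/Utils/GlobalStatus.h header that accompanies this file; the
// predicate itself is only an illustration.
#include "llvm/IR/GlobalVariable.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
using namespace llvm;

static bool neverWrittenAfterInit(const GlobalVariable *GV) {
  GlobalStatus GS;
  if (GlobalStatus::analyzeGlobal(GV, GS))
    return false;                 // the analysis gave up; stay conservative
  // At most re-stored with its own initializer, and never compared by address.
  return GS.StoredType <= GlobalStatus::InitializerStored && !GS.IsCompared;
}
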
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index dabb67b..d021bce 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -193,7 +193,8 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
CallInst *CI = dyn_cast<CallInst>(I);
// If this call cannot unwind, don't convert it to an invoke.
- if (!CI || CI->doesNotThrow())
+ // Inline asm calls cannot throw.
+ if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
continue;
// Convert this function call into an invoke instruction. First, split the
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 2d1b166..f15e8d5 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -55,7 +55,6 @@ namespace {
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
- std::vector<BasicBlock*> LoopBlocks;
PredIteratorCache PredCache;
Loop *L;
@@ -82,11 +81,6 @@ namespace {
// Check the special guarantees that LCSSA makes.
assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!");
}
-
- /// inLoop - returns true if the given block is within the current loop
- bool inLoop(BasicBlock *B) const {
- return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
- }
};
}
@@ -129,11 +123,6 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
if (ExitBlocks.empty())
return false;
- // Speed up queries by creating a sorted vector of blocks.
- LoopBlocks.clear();
- LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
- array_pod_sort(LoopBlocks.begin(), LoopBlocks.end());
-
// Look at all the instructions in the loop, checking to see if they have uses
// outside the loop. If so, rewrite those uses.
bool MadeChange = false;
@@ -198,7 +187,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
if (PHINode *PN = dyn_cast<PHINode>(U))
UserBB = PN->getIncomingBlock(UI);
- if (InstBB != UserBB && !inLoop(UserBB))
+ if (InstBB != UserBB && !L->contains(UserBB))
UsesToRewrite.push_back(&UI.getUse());
}
@@ -244,7 +233,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
// If the exit block has a predecessor not within the loop, arrange for
// the incoming value use corresponding to that predecessor to be
// rewritten in terms of a different LCSSA PHI.
- if (!inLoop(*PI))
+ if (!L->contains(*PI))
UsesToRewrite.push_back(
&PN->getOperandUse(
PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1)));
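
// Not part of the patch, just the idea behind it: Loop already answers block
// membership directly, so the sorted LoopBlocks vector plus binary_search was
// redundant bookkeeping. The helper name is illustrative.
#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

static bool isUseOutsideLoop(const Loop *L, BasicBlock *DefBB, BasicBlock *UserBB) {
  return UserBB != DefBB && !L->contains(UserBB);
}
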
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 08e1808..2768041 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -16,10 +16,10 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/DIBuilder.h"
#include "llvm/DebugInfo.h"
@@ -43,6 +43,8 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
+
//===----------------------------------------------------------------------===//
// Local constant propagation.
//
@@ -193,33 +195,28 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Otherwise, we can fold this switch into a conditional branch
// instruction if it has only one non-default destination.
SwitchInst::CaseIt FirstCase = SI->case_begin();
- IntegersSubset& Case = FirstCase.getCaseValueEx();
- if (Case.isSingleNumber()) {
- // FIXME: Currently work with ConstantInt based numbers.
- Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
- Case.getSingleNumber(0).toConstantInt(),
- "cond");
-
- // Insert the new branch.
- BranchInst *NewBr = Builder.CreateCondBr(Cond,
- FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
- MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
- if (MD && MD->getNumOperands() == 3) {
- ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
- ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
- assert(SICase && SIDef);
- // The TrueWeight should be the weight for the single case of SI.
- NewBr->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BB->getContext()).
- createBranchWeights(SICase->getValue().getZExtValue(),
- SIDef->getValue().getZExtValue()));
- }
+ Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+ FirstCase.getCaseValue(), "cond");
- // Delete the old switch.
- SI->eraseFromParent();
- return true;
+ // Insert the new branch.
+ BranchInst *NewBr = Builder.CreateCondBr(Cond,
+ FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ if (MD && MD->getNumOperands() == 3) {
+ ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+ ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+ assert(SICase && SIDef);
+ // The TrueWeight should be the weight for the single case of SI.
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(SICase->getValue().getZExtValue(),
+ SIDef->getValue().getZExtValue()));
}
+
+ // Delete the old switch.
+ SI->eraseFromParent();
+ return true;
}
return false;
}
@@ -415,7 +412,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
Instruction *Inst = BI++;
WeakVH BIHandle(BI);
- if (recursivelySimplifyInstruction(Inst, TD)) {
+ if (recursivelySimplifyInstruction(Inst, TD, TLI)) {
MadeChange = true;
if (BIHandle != BI)
BI = BB->begin();
@@ -515,11 +512,6 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
DT->changeImmediateDominator(DestBB, PredBBIDom);
DT->eraseNode(PredBB);
}
- ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
- if (PI) {
- PI->replaceAllUses(PredBB, DestBB);
- PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB));
- }
}
// Nuke BB.
PredBB->eraseFromParent();
@@ -533,7 +525,7 @@ static bool CanMergeValues(Value *First, Value *Second) {
}
/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
-/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
///
/// Assumption: Succ is the single successor for BB.
///
@@ -1053,7 +1045,11 @@ bool llvm::LowerDbgDeclare(Function &F) {
for (SmallVectorImpl<DbgDeclareInst *>::iterator I = Dbgs.begin(),
E = Dbgs.end(); I != E; ++I) {
DbgDeclareInst *DDI = *I;
- if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) {
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ // If this is an alloca for a scalar variable, insert a dbg.value
+ // at each load and store to the alloca and erase the dbg.declare.
+ if (AI && !AI->isArrayAllocation()) {
+
// We only remove the dbg.declare intrinsic if all uses are
// converted to dbg.value intrinsics.
bool RemoveDDI = true;
@@ -1121,33 +1117,153 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
return true;
}
-bool llvm::removeUnreachableBlocks(Function &F) {
- SmallPtrSet<BasicBlock*, 16> Reachable;
+/// changeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+ BasicBlock *BB = I->getParent();
+ // Loop over all of the successors, removing BB's entry from any PHI
+ // nodes.
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+ (*SI)->removePredecessor(BB);
+
+ // Insert a call to llvm.trap right before this. This turns the undefined
+ // behavior into a hard fail instead of falling through into random code.
+ if (UseLLVMTrap) {
+ Function *TrapFn =
+ Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+ CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+ CallTrap->setDebugLoc(I->getDebugLoc());
+ }
+ new UnreachableInst(I->getContext(), I);
+
+ // All instructions after this are dead.
+ BasicBlock::iterator BBI = I, BBE = BB->end();
+ while (BBI != BBE) {
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BB->getInstList().erase(BBI++);
+ }
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
+ SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+ CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ II->replaceAllUsesWith(NewCall);
+
+ // Follow the call by a branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Update PHI nodes in the unwind destination
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
+}
+
+static bool markAliveBlocks(BasicBlock *BB,
+ SmallPtrSet<BasicBlock*, 128> &Reachable) {
+
SmallVector<BasicBlock*, 128> Worklist;
- Worklist.push_back(&F.getEntryBlock());
- Reachable.insert(&F.getEntryBlock());
+ Worklist.push_back(BB);
+ Reachable.insert(BB);
+ bool Changed = false;
do {
- BasicBlock *BB = Worklist.pop_back_val();
+ BB = Worklist.pop_back_val();
+
+ // Do a quick scan of the basic block, turning any obviously unreachable
+ // instructions into LLVM unreachable insts. The instruction combining pass
+ // canonicalizes unreachable insts into stores to null or undef.
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+ if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+ if (CI->doesNotReturn()) {
+ // If we found a call to a no-return function, insert an unreachable
+ // instruction after it. Make sure there isn't *already* one there
+ // though.
+ ++BBI;
+ if (!isa<UnreachableInst>(BBI)) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(BBI, false);
+ Changed = true;
+ }
+ break;
+ }
+ }
+
+ // Store to undef and store to null are undefined and used to signal that
+ // they should be changed to unreachable by passes that can't modify the
+ // CFG.
+ if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+ // Don't touch volatile stores.
+ if (SI->isVolatile()) continue;
+
+ Value *Ptr = SI->getOperand(1);
+
+ if (isa<UndefValue>(Ptr) ||
+ (isa<ConstantPointerNull>(Ptr) &&
+ SI->getPointerAddressSpace() == 0)) {
+ changeToUnreachable(SI, true);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ // Turn invokes that call 'nounwind' functions into ordinary calls.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ Value *Callee = II->getCalledValue();
+ if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ changeToUnreachable(II, true);
+ Changed = true;
+ } else if (II->doesNotThrow()) {
+ if (II->use_empty() && II->onlyReadsMemory()) {
+ // jump to the normal destination branch.
+ BranchInst::Create(II->getNormalDest(), II);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ II->eraseFromParent();
+ } else
+ changeToCall(II);
+ Changed = true;
+ }
+ }
+
+ Changed |= ConstantFoldTerminator(BB, true);
for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
if (Reachable.insert(*SI))
Worklist.push_back(*SI);
} while (!Worklist.empty());
+ return Changed;
+}
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F) {
+ SmallPtrSet<BasicBlock*, 128> Reachable;
+ bool Changed = markAliveBlocks(F.begin(), Reachable);
+
+ // If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
- return false;
+ return Changed;
assert(Reachable.size() < F.size());
- for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
- if (Reachable.count(I))
+ NumRemoved += F.size()-Reachable.size();
+
+ // Loop over all of the basic blocks that are not reachable, dropping all of
+ // their internal references...
+ for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+ if (Reachable.count(BB))
continue;
- for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
if (Reachable.count(*SI))
- (*SI)->removePredecessor(I);
- I->dropAllReferences();
+ (*SI)->removePredecessor(BB);
+ BB->dropAllReferences();
}
- for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+ for (Function::iterator I = ++F.begin(); I != F.end();)
if (!Reachable.count(I))
I = F.getBasicBlockList().erase(I);
else
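
// A minimal caller sketch (mine, not the commit's): after this move the helper
// is a library-level utility, presumably declared in
// llvm/Transforms/Utils/Local.h, so any pass can invoke it directly.
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

static bool stripDeadBlocks(Function &F) {
  // Folds obviously dead control flow, rewrites no-return calls, and erases
  // every block unreachable from the entry, including blocks in dead cycles.
  return removeUnreachableBlocks(F);
}
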
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index cb581b3..162807d 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -90,7 +90,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
// Move all definitions in the successor to the predecessor...
OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
- std::string OldName = BB->getName();
+ // OldName will be valid until erased.
+ StringRef OldName = BB->getName();
// Erase basic block from the function...
@@ -102,12 +103,13 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
}
}
LI->removeBlock(BB);
- BB->eraseFromParent();
// Inherit predecessor's name if it exists...
if (!OldName.empty() && !OnlyPred->hasName())
OnlyPred->setName(OldName);
+ BB->eraseFromParent();
+
return OnlyPred;
}
@@ -239,8 +241,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
DEBUG(dbgs() << "!\n");
}
- std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
-
bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
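
// Side note as code (not from the patch): the reordering above matters because
// a StringRef obtained from getName() borrows the value's own storage, so the
// block has to stay alive until setName() has copied the bytes. Helper name is
// illustrative.
#include "llvm/IR/BasicBlock.h"
using namespace llvm;

static void inheritNameThenErase(BasicBlock *BB, BasicBlock *OnlyPred) {
  StringRef OldName = BB->getName();      // borrowed, not copied
  if (!OldName.empty() && !OnlyPred->hasName())
    OnlyPred->setName(OldName);           // the copy happens inside setName
  BB->eraseFromParent();                  // only now does OldName dangle
}
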
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index 4aee8ff..e017f50 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -29,7 +29,7 @@
using namespace llvm;
-STATISTIC(IfHandled, "Number of 'expect' intrinsic intructions handled");
+STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled");
static cl::opt<uint32_t>
LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index f66b54d..9799a30 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -346,7 +346,6 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) {
// Scan all of the uses and see if the live range is live across an unwind
// edge. If we find a use live across an invoke edge, create an alloca
// and spill the value.
- std::set<InvokeInst*> InvokesWithStoreInserted;
// Find all of the blocks that this value is live in.
std::set<BasicBlock*> LiveBBs;
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 955b853..2d2a8a5 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -66,6 +66,18 @@ namespace {
BasicBlock* OrigBlock, BasicBlock* Default);
unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
};
+
+ /// The comparison function for sorting the switch case values in the vector.
+ /// WARNING: Case ranges should be disjoint!
+ struct CaseCmp {
+ bool operator () (const LowerSwitch::CaseRange& C1,
+ const LowerSwitch::CaseRange& C2) {
+
+ const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+ const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+ return CI1->getValue().slt(CI2->getValue());
+ }
+ };
}
char LowerSwitch::ID = 0;
@@ -147,7 +159,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
Function::iterator FI = OrigBlock;
F->getBasicBlockList().insert(++FI, NewNode);
- ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
Val, Pivot.Low, "Pivot");
NewNode->getInstList().push_back(Comp);
BranchInst::Create(LBranch, RBranch, Comp, NewNode);
@@ -222,34 +234,40 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
// Clusterify - Transform simple list of Cases into list of CaseRange's
unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
-
- IntegersSubsetToBB TheClusterifier;
+ unsigned numCmps = 0;
// Start with "simple" cases
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- BasicBlock *SuccBB = i.getCaseSuccessor();
- IntegersSubset CaseRanges = i.getCaseValueEx();
- TheClusterifier.add(CaseRanges, SuccBB);
- }
-
- TheClusterifier.optimize();
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
+ Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
+ i.getCaseSuccessor()));
- size_t numCmps = 0;
- for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
- e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
- IntegersSubsetToBB::Cluster &C = *i;
-
- // FIXME: Currently work with ConstantInt based numbers.
- // Changing it to APInt based is a pretty heavy for this commit.
- Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
- C.first.getHigh().toConstantInt(), C.second));
- if (C.first.isSingleNumber())
+ std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+ // Merge case into clusters
+ if (Cases.size()>=2)
+ for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
+ int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+ int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+ BasicBlock* nextBB = J->BB;
+ BasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ J = Cases.erase(J);
+ } else {
+ I = J++;
+ }
+ }
+
+ for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+ if (I->Low != I->High)
// A range counts double, since it requires two compares.
++numCmps;
}
- return numCmps;
+ return numCmps;
}
// processSwitchInst - Replace the specified switch instruction with a sequence
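
// The clustering step above, restated over plain integers as a sanity sketch
// (standalone code, not LLVM's): sort the single-value cases, then merge
// neighbours that are contiguous and branch to the same target; a range costs
// two compares when the comparison tree is emitted.
#include <algorithm>
#include <cstddef>
#include <vector>

struct Range { long Low, High; int Target; };
struct ByLow {
  bool operator()(const Range &A, const Range &B) const { return A.Low < B.Low; }
};

static unsigned clusterify(std::vector<Range> &Cases) {
  std::sort(Cases.begin(), Cases.end(), ByLow());
  for (size_t I = 0; I + 1 < Cases.size();) {
    if (Cases[I + 1].Low - Cases[I].High == 1 &&
        Cases[I].Target == Cases[I + 1].Target) {
      Cases[I].High = Cases[I + 1].High;      // grow the current range
      Cases.erase(Cases.begin() + I + 1);     // and drop the merged case
    } else {
      ++I;
    }
  }
  unsigned NumCmps = 0;
  for (size_t I = 0, E = Cases.size(); I != E; ++I)
    NumCmps += (Cases[I].Low == Cases[I].High) ? 1 : 2;
  return NumCmps;
}
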
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index ebd7db6..61b3965 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -16,7 +16,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
@@ -28,7 +27,6 @@ STATISTIC(NumPromoted, "Number of alloca's promoted");
namespace {
struct PromotePass : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
-
PromotePass() : FunctionPass(ID) {
initializePromotePassPass(*PassRegistry::getPassRegistry());
}
@@ -64,7 +62,6 @@ bool PromotePass::runOnFunction(Function &F) {
bool Changed = false;
DominatorTree &DT = getAnalysis<DominatorTree>();
- const DataLayout *DL = getAnalysisIfAvailable<DataLayout>();
while (1) {
Allocas.clear();
@@ -73,12 +70,12 @@ bool PromotePass::runOnFunction(Function &F) {
// the entry node
for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
- if (isAllocaPromotable(AI, DL))
+ if (isAllocaPromotable(AI))
Allocas.push_back(AI);
if (Allocas.empty()) break;
- PromoteMemToReg(Allocas, DT, DL);
+ PromoteMemToReg(Allocas, DT);
NumPromoted += Allocas.size();
Changed = true;
}
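
// Sketch of the slimmed-down driver implied by the hunks above (my wording,
// not the patch): DataLayout no longer participates, so promotability testing
// and promotion both take only the alloca list and the dominator tree. The
// helper name is hypothetical; the two-argument PromoteMemToReg call assumes
// the AliasSetTracker parameter defaults to null, as the hunk above implies.
#include "llvm/Analysis/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
using namespace llvm;

static unsigned promoteEntryAllocas(BasicBlock &Entry, DominatorTree &DT) {
  std::vector<AllocaInst *> Allocas;
  // Scan the entry block (excluding its terminator) for promotable allocas.
  for (BasicBlock::iterator I = Entry.begin(), E = --Entry.end(); I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      if (isAllocaPromotable(AI))        // no DataLayout argument any more
        Allocas.push_back(AI);
  PromoteMemToReg(Allocas, DT);
  return Allocas.size();
}
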
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 6910180..8f6eee3 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -30,7 +30,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -46,7 +45,6 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/InstVisitor.h"
#include "llvm/Support/CFG.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -58,16 +56,56 @@ STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
-namespace {
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+ // FIXME: If the memory unit is of pointer or integer type, we can permit
+ // assignments to subsections of the memory unit.
+
+ // Only allow direct and non-volatile loads and stores...
+ for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
+ UI != UE; ++UI) { // Loop over all of the uses of the alloca
+ const User *U = *UI;
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ // Note that atomic loads can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (LI->isVolatile())
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == AI)
+ return false; // Don't allow a store OF the AI, only INTO the AI.
+ // Note that atomic stores can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (SI->isVolatile())
+ return false;
+ } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ if (BCI->getType() != Type::getInt8PtrTy(U->getContext()))
+ return false;
+ if (!onlyUsedByLifetimeMarkers(BCI))
+ return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (GEPI->getType() != Type::getInt8PtrTy(U->getContext()))
+ return false;
+ if (!GEPI->hasAllZeroIndices())
+ return false;
+ if (!onlyUsedByLifetimeMarkers(GEPI))
+ return false;
+ } else {
+ return false;
+ }
+ }
-struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
- const DataLayout *DL;
+ return true;
+}
+
+namespace {
+struct AllocaInfo {
SmallVector<BasicBlock *, 32> DefiningBlocks;
SmallVector<BasicBlock *, 32> UsingBlocks;
- SmallVector<Instruction *, 8> DeadInsts;
- Type *AllocaTy;
StoreInst *OnlyStore;
BasicBlock *OnlyBlock;
bool OnlyUsedInOneBlock;
@@ -75,13 +113,9 @@ struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
Value *AllocaPointerVal;
DbgDeclareInst *DbgDeclare;
- AllocaInfo(const DataLayout *DL) : DL(DL) {}
-
void clear() {
DefiningBlocks.clear();
UsingBlocks.clear();
- DeadInsts.clear();
- AllocaTy = 0;
OnlyStore = 0;
OnlyBlock = 0;
OnlyUsedInOneBlock = true;
@@ -91,116 +125,39 @@ struct AllocaInfo : private InstVisitor<AllocaInfo, bool> {
/// Scan the uses of the specified alloca, filling in the AllocaInfo used
/// by the rest of the pass to reason about the uses of this alloca.
- bool analyzeAlloca(AllocaInst &AI) {
+ void AnalyzeAlloca(AllocaInst *AI) {
clear();
- AllocaTy = AI.getAllocatedType();
- enqueueUsers(AI);
-
- // Walk queued up uses in the worklist to handle nested uses.
- while (!UseWorklist.empty()) {
- U = UseWorklist.pop_back_val();
- Instruction &I = *cast<Instruction>(U->getUser());
- if (!visit(I))
- return false; // Propagate failure to promote up.
+ // As we scan the uses of the alloca instruction, keep track of stores,
+ // and decide whether all of the loads and stores to the alloca are within
+ // the same basic block.
+ for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+ UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Remember the basic blocks which define new values for the alloca
+ DefiningBlocks.push_back(SI->getParent());
+ AllocaPointerVal = SI->getOperand(0);
+ OnlyStore = SI;
+ } else {
+ LoadInst *LI = cast<LoadInst>(User);
+ // Otherwise it must be a load instruction, keep track of variable
+ // reads.
+ UsingBlocks.push_back(LI->getParent());
+ AllocaPointerVal = LI;
+ }
if (OnlyUsedInOneBlock) {
if (OnlyBlock == 0)
- OnlyBlock = I.getParent();
- else if (OnlyBlock != I.getParent())
+ OnlyBlock = User->getParent();
+ else if (OnlyBlock != User->getParent())
OnlyUsedInOneBlock = false;
}
}
- DbgDeclare = FindAllocaDbgDeclare(&AI);
- return true;
- }
-
-private:
- // Befriend the base class so it can call through private visitor methods.
- friend class InstVisitor<AllocaInfo, bool>;
-
- /// \brief A use pointer that is non-null when visiting uses.
- Use *U;
-
- /// \brief A worklist for recursively visiting all uses of an alloca.
- SmallVector<Use *, 8> UseWorklist;
-
- /// \brief A set for preventing cyclic visitation.
- SmallPtrSet<Use *, 8> VisitedUses;
-
- void enqueueUsers(Instruction &I) {
- for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
- ++UI)
- if (VisitedUses.insert(&UI.getUse()))
- UseWorklist.push_back(&UI.getUse());
+ DbgDeclare = FindAllocaDbgDeclare(AI);
}
-
- bool visitLoadInst(LoadInst &LI) {
- if (LI.isVolatile() || LI.getType() != AllocaTy)
- return false;
-
- // Keep track of variable reads.
- UsingBlocks.push_back(LI.getParent());
- AllocaPointerVal = &LI;
- return true;
- }
-
- bool visitStoreInst(StoreInst &SI) {
- if (SI.isVolatile() || SI.getValueOperand() == U->get() ||
- SI.getValueOperand()->getType() != AllocaTy)
- return false;
-
- // Remember the basic blocks which define new values for the alloca
- DefiningBlocks.push_back(SI.getParent());
- AllocaPointerVal = SI.getOperand(0);
- OnlyStore = &SI;
- return true;
- }
-
- bool visitBitCastInst(BitCastInst &BC) {
- if (BC.use_empty())
- DeadInsts.push_back(&BC);
- else
- enqueueUsers(BC);
- return true;
- }
-
- bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (GEPI.use_empty()) {
- DeadInsts.push_back(&GEPI);
- return true;
- }
-
- enqueueUsers(GEPI);
-
- return GEPI.hasAllZeroIndices();
- }
-
- // We can promote through debug info intrinsics as they don't alter the
- // value stored in memory.
- bool visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) {
- DeadInsts.push_back(&I);
- return true;
- }
-
- bool visitIntrinsicInst(IntrinsicInst &II) {
- switch (II.getIntrinsicID()) {
- default:
- return false;
-
- // Lifetime intrinsics don't preclude promoting the memory to a register.
- // FIXME: We should use these to promote to undef when outside of a valid
- // lifetime.
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- DeadInsts.push_back(&II);
- return true;
- }
- }
-
- // The fallback is that the alloca cannot be promoted.
- bool visitInstruction(Instruction &I) { return false; }
};
// Data package used by RenamePass()
@@ -278,7 +235,6 @@ struct PromoteMem2Reg {
std::vector<AllocaInst *> Allocas;
DominatorTree &DT;
DIBuilder DIB;
- const DataLayout *DL;
/// An AliasSetTracker object to update. If null, don't update it.
AliasSetTracker *AST;
@@ -324,9 +280,9 @@ struct PromoteMem2Reg {
public:
PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- const DataLayout *DL, AliasSetTracker *AST)
+ AliasSetTracker *AST)
: Allocas(Allocas.begin(), Allocas.end()), DT(DT),
- DIB(*DT.getRoot()->getParent()->getParent()), DL(DL), AST(AST) {}
+ DIB(*DT.getRoot()->getParent()->getParent()), AST(AST) {}
void run();
@@ -357,39 +313,27 @@ private:
} // end of anonymous namespace
-/// \brief Walk a small vector of dead instructions and recursively remove them
-/// and subsequently dead instructions.
-///
-/// This is only valid to call on dead instructions using an alloca which is
-/// promotable, as we leverage that assumption to delete them faster.
-static void removeDeadInstructions(AllocaInst *AI,
- SmallVectorImpl<Instruction *> &DeadInsts) {
- while (!DeadInsts.empty()) {
- Instruction *I = DeadInsts.pop_back_val();
-
- // Don't delete the alloca itself.
- if (I == AI)
- continue;
+static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+ // Knowing that this alloca is promotable, we know that it's safe to kill all
+ // instructions except for load and store.
- // Note that we open code the deletion algorithm here because we know
- // apriori that all of the instructions using an alloca that reaches here
- // are trivially dead when their use list becomes empty (The only risk are
- // lifetime markers which we specifically want to nuke). By coding it here
- // we can skip the triviality test and be more efficient.
- //
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE;
- ++OI) {
- Instruction *Op = dyn_cast<Instruction>(*OI);
- if (!Op)
- continue;
-
- OI->set(0);
- if (!Op->use_empty())
- continue;
+ for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+ UI != UE;) {
+ Instruction *I = cast<Instruction>(*UI);
+ ++UI;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ continue;
- DeadInsts.push_back(Op);
+ if (!I->getType()->isVoidTy()) {
+ // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+ // Follow the use/def chain to erase them now instead of leaving it for
+ // dead code elimination later.
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE;) {
+ Instruction *Inst = cast<Instruction>(*UI);
+ ++UI;
+ Inst->eraseFromParent();
+ }
}
I->eraseFromParent();
}
@@ -474,6 +418,7 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
DIBuilder DIB(*AI->getParent()->getParent()->getParent());
ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);
DDI->eraseFromParent();
+ LBI.deleteValue(DDI);
}
// Remove the (now dead) store and alloca.
Info.OnlyStore->eraseFromParent();
@@ -486,16 +431,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
return true;
}
-namespace {
-/// This is a helper predicate used to search by the first element of a pair.
-struct StoreIndexSearchPredicate {
- bool operator()(const std::pair<unsigned, StoreInst *> &LHS,
- const std::pair<unsigned, StoreInst *> &RHS) {
- return LHS.first < RHS.first;
- }
-};
-}
-
/// Many allocas are only used within a single basic block. If this is the
/// case, avoid traversing the CFG and inserting a lot of potentially useless
/// PHI nodes by just performing a single linear pass over the basic block
@@ -528,8 +463,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
// Sort the stores by their index, making it efficient to do a lookup with a
// binary search.
- std::sort(StoresByIndex.begin(), StoresByIndex.end(),
- StoreIndexSearchPredicate());
+ std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
// Walk all of the loads from this alloca, replacing them with the nearest
// store above them, if any.
@@ -544,7 +478,7 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
StoresByIndexTy::iterator I =
std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
std::make_pair(LoadIdx, static_cast<StoreInst *>(0)),
- StoreIndexSearchPredicate());
+ less_first());
if (I == StoresByIndex.begin())
// If there is no store before this load, the load takes the undef value.
@@ -577,8 +511,10 @@ static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
LBI.deleteValue(AI);
// The alloca's debuginfo can be removed as well.
- if (DbgDeclareInst *DDI = Info.DbgDeclare)
+ if (DbgDeclareInst *DDI = Info.DbgDeclare) {
DDI->eraseFromParent();
+ LBI.deleteValue(DDI);
+ }
++NumLocalPromoted;
}
@@ -590,23 +526,17 @@ void PromoteMem2Reg::run() {
PointerAllocaValues.resize(Allocas.size());
AllocaDbgDeclares.resize(Allocas.size());
- AllocaInfo Info(DL);
+ AllocaInfo Info;
LargeBlockInfo LBI;
for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
AllocaInst *AI = Allocas[AllocaNum];
+ assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
assert(AI->getParent()->getParent() == &F &&
"All allocas should be in the same function, which is same as DF!");
- // Calculate the set of read and write-locations for each alloca. This is
- // analogous to finding the 'uses' and 'definitions' of each variable.
- bool Good = Info.analyzeAlloca(*AI);
- (void)Good;
- assert(Good && "Cannot promote non-promotable alloca!");
-
- // Nuke all of the dead instructions.
- removeDeadInstructions(AI, Info.DeadInsts);
+ removeLifetimeIntrinsicUsers(AI);
if (AI->use_empty()) {
// If there are no uses of the alloca, just delete it now.
@@ -620,6 +550,10 @@ void PromoteMem2Reg::run() {
continue;
}
+ // Calculate the set of read and write-locations for each alloca. This is
+ // analogous to finding the 'uses' and 'definitions' of each variable.
+ Info.AnalyzeAlloca(AI);
+
// If there is only a single store to this value, replace any loads of
// it that are directly dominated by the definition with the value stored.
if (Info.DefiningBlocks.size() == 1) {
@@ -904,16 +838,6 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
}
}
-namespace {
-typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
-
-struct DomTreeNodeCompare {
- bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) {
- return LHS.second < RHS.second;
- }
-};
-} // end anonymous namespace
-
/// At this point, we're committed to promoting the alloca using IDF's, and the
/// standard SSA construction algorithm. Determine which blocks need phi nodes
/// and see if we can optimize out some work by avoiding insertion of dead phi
@@ -931,9 +855,9 @@ void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
// Use a priority queue keyed on dominator tree level so that inserted nodes
// are handled from the bottom of the dominator tree upwards.
- typedef std::priority_queue<DomTreeNodePair,
- SmallVector<DomTreeNodePair, 32>,
- DomTreeNodeCompare> IDFPriorityQueue;
+ typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
+ typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>,
+ less_second> IDFPriorityQueue;
IDFPriorityQueue PQ;
for (SmallPtrSet<BasicBlock *, 32>::const_iterator I = DefBlocks.begin(),
@@ -1145,19 +1069,11 @@ NextIteration:
goto NextIteration;
}
-bool llvm::isAllocaPromotable(const AllocaInst *AI, const DataLayout *DL) {
- // We cast away constness because we re-use the non-const analysis that the
- // actual promotion routine uses. While it is non-const, it doesn't actually
- // mutate anything at this phase, and we discard the non-const results that
- // promotion uses to mutate the alloca.
- return AllocaInfo(DL).analyzeAlloca(*const_cast<AllocaInst *>(AI));
-}
-
void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- const DataLayout *DL, AliasSetTracker *AST) {
+ AliasSetTracker *AST) {
// If there is nothing to do, bail out...
if (Allocas.empty())
return;
- PromoteMem2Reg(Allocas, DT, DL, AST).run();
+ PromoteMem2Reg(Allocas, DT, AST).run();
}
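
// Aside (not part of the change): less_first and less_second are the small
// generic comparators in llvm/ADT/STLExtras.h that replace the hand-rolled
// StoreIndexSearchPredicate and DomTreeNodeCompare structs deleted above.
#include "llvm/ADT/STLExtras.h"
#include <algorithm>
#include <utility>
#include <vector>

static void sortByIndex(std::vector<std::pair<unsigned, int> > &V) {
  // Orders the pairs by .first only, exactly what the old predicate did.
  std::sort(V.begin(), V.end(), llvm::less_first());
}
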
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index fc85ef3..30adbfa 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -63,7 +63,7 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
}
static bool IsEquivalentPHI(PHINode *PHI,
- DenseMap<BasicBlock*, Value*> &ValueMapping) {
+ SmallDenseMap<BasicBlock*, Value*, 8> &ValueMapping) {
unsigned PHINumValues = PHI->getNumIncomingValues();
if (PHINumValues != ValueMapping.size())
return false;
@@ -136,8 +136,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
// Otherwise, we do need a PHI: check to see if we already have one available
// in this block that produces the right value.
if (isa<PHINode>(BB->begin())) {
- DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(),
- PredValues.end());
+ SmallDenseMap<BasicBlock*, Value*, 8> ValueMapping(PredValues.begin(),
+ PredValues.end());
PHINode *SomePHI;
for (BasicBlock::iterator It = BB->begin();
(SomePHI = dyn_cast<PHINode>(It)); ++It) {
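
// Aside (not from the patch): SmallDenseMap keeps a small number of buckets
// inline, so the common GetValueInMiddleOfBlock case, a handful of
// predecessors, now avoids a heap allocation entirely. Standalone example.
#include "llvm/ADT/DenseMap.h"

static unsigned countDistinct(const int *Keys, unsigned N) {
  llvm::SmallDenseMap<int, unsigned, 8> Seen; // 8 buckets of inline storage
  for (unsigned I = 0; I != N; ++I)
    ++Seen[Keys[I]];                          // spills to the heap only if needed
  return Seen.size();
}
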
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index c4c1423..ff50b12 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -475,9 +476,13 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
CV = ICI->getOperand(0);
// Unwrap any lossless ptrtoint cast.
- if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext()))
- if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV))
- CV = PTII->getOperand(0);
+ if (TD && CV) {
+ if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
+ Value *Ptr = PTII->getPointerOperand();
+ if (PTII->getType() == TD->getIntPtrType(Ptr->getType()))
+ CV = Ptr;
+ }
+ }
return CV;
}
@@ -699,9 +704,10 @@ namespace {
};
}
-static int ConstantIntSortPredicate(const void *P1, const void *P2) {
- const ConstantInt *LHS = *(const ConstantInt*const*)P1;
- const ConstantInt *RHS = *(const ConstantInt*const*)P2;
+static int ConstantIntSortPredicate(ConstantInt *const *P1,
+ ConstantInt *const *P2) {
+ const ConstantInt *LHS = *P1;
+ const ConstantInt *RHS = *P2;
if (LHS->getValue().ult(RHS->getValue()))
return 1;
if (LHS->getValue() == RHS->getValue())
@@ -924,7 +930,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// Convert pointer to int before we switch.
if (CV->getType()->isPointerTy()) {
assert(TD && "Cannot switch on pointer without DataLayout");
- CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
+ CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()),
"magicptr");
}
@@ -1556,6 +1562,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
return true;
}
+/// \returns True if this block contains a CallInst with the NoDuplicate
+/// attribute.
+static bool HasNoDuplicateCall(const BasicBlock *BB) {
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ const CallInst *CI = dyn_cast<CallInst>(I);
+ if (!CI)
+ continue;
+ if (CI->cannotDuplicate())
+ return true;
+ }
+ return false;
+}
+
/// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
/// across this block.
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
@@ -1603,6 +1622,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) {
// Now we know that this block has multiple preds and two succs.
if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
+ if (HasNoDuplicateCall(BB)) return false;
+
// Okay, this is a simple enough basic block. See if any phi values are
// constants.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
@@ -2069,14 +2090,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Ensure that any values used in the bonus instruction are also used
// by the terminator of the predecessor. This means that those values
// must already have been resolved, so we won't be inhibiting the
- // out-of-order core by speculating them earlier.
- if (BonusInst) {
+ // out-of-order core by speculating them earlier. We also allow
+ // instructions that are used by the terminator's condition because it
+ // exposes more merging opportunities.
+ bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() &&
+ *BonusInst->use_begin() == Cond);
+
+ if (BonusInst && !UsedByBranch) {
// Collect the values used by the bonus inst
SmallPtrSet<Value*, 4> UsedValues;
for (Instruction::op_iterator OI = BonusInst->op_begin(),
OE = BonusInst->op_end(); OI != OE; ++OI) {
Value *V = *OI;
- if (!isa<Constant>(V))
+ if (!isa<Constant>(V) && !isa<Argument>(V))
UsedValues.insert(V);
}
@@ -2787,7 +2813,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD,
if (CompVal->getType()->isPointerTy()) {
assert(TD && "Cannot switch on pointer without DataLayout");
CompVal = Builder.CreatePtrToInt(CompVal,
- TD->getIntPtrType(CompVal->getContext()),
+ TD->getIntPtrType(CompVal->getType()),
"magicptr");
}
@@ -3160,7 +3186,7 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
/// and use it to remove dead cases.
static bool EliminateDeadSwitchCases(SwitchInst *SI) {
Value *Cond = SI->getCondition();
- unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
+ unsigned Bits = Cond->getType()->getIntegerBitWidth();
APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
ComputeMaskedBits(Cond, KnownZero, KnownOne);
@@ -3303,28 +3329,10 @@ static Constant *LookupConstant(Value *V,
/// simple instructions such as binary operations where both operands are
/// constant or can be replaced by constants from the ConstantPool. Returns the
/// resulting constant on success, 0 otherwise.
-static Constant *ConstantFold(Instruction *I,
- const SmallDenseMap<Value*, Constant*>& ConstantPool) {
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- Constant *A = LookupConstant(BO->getOperand(0), ConstantPool);
- if (!A)
- return 0;
- Constant *B = LookupConstant(BO->getOperand(1), ConstantPool);
- if (!B)
- return 0;
- return ConstantExpr::get(BO->getOpcode(), A, B);
- }
-
- if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
- if (!A)
- return 0;
- Constant *B = LookupConstant(I->getOperand(1), ConstantPool);
- if (!B)
- return 0;
- return ConstantExpr::getCompare(Cmp->getPredicate(), A, B);
- }
-
+static Constant *
+ConstantFold(Instruction *I,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool,
+ const DataLayout *DL) {
if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
if (!A)
@@ -3336,14 +3344,19 @@ static Constant *ConstantFold(Instruction *I,
return 0;
}
- if (CastInst *Cast = dyn_cast<CastInst>(I)) {
- Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
- if (!A)
+ SmallVector<Constant *, 4> COps;
+ for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
+ if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
+ COps.push_back(A);
+ else
return 0;
- return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy());
}
- return 0;
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I))
+ return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
+ COps[1], DL);
+
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL);
}
/// GetCaseResults - Try to determine the resulting constant values in phi nodes
@@ -3355,7 +3368,8 @@ GetCaseResults(SwitchInst *SI,
ConstantInt *CaseVal,
BasicBlock *CaseDest,
BasicBlock **CommonDest,
- SmallVectorImpl<std::pair<PHINode*,Constant*> > &Res) {
+ SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res,
+ const DataLayout *DL) {
// The block from which we enter the common destination.
BasicBlock *Pred = SI->getParent();
@@ -3374,7 +3388,7 @@ GetCaseResults(SwitchInst *SI,
} else if (isa<DbgInfoIntrinsic>(I)) {
// Skip debug intrinsic.
continue;
- } else if (Constant *C = ConstantFold(I, ConstantPool)) {
+ } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) {
// Instruction is side-effect free and constant.
ConstantPool.insert(std::make_pair(I, C));
} else {
@@ -3698,7 +3712,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
ResultsTy Results;
if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
- Results))
+ Results, TD))
return false;
// Append the result from this case to the list for each phi.
@@ -3712,7 +3726,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
// Get the resulting values for the default case.
SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest,
- DefaultResultsList))
+ DefaultResultsList, TD))
return false;
for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
PHINode *PHI = DefaultResultsList[I].first;
@@ -3733,14 +3747,32 @@ static bool SwitchToLookupTable(SwitchInst *SI,
CommonDest->getParent(),
CommonDest);
- // Check whether the condition value is within the case range, and branch to
- // the new BB.
+ // Compute the table index value.
Builder.SetInsertPoint(SI);
Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
"switch.tableidx");
- Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
- MinCaseVal->getType(), TableSize));
- Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+ // Compute the maximum table size representable by the integer type we are
+ // switching upon.
+ unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+ uint64_t MaxTableSize = CaseSize > 63? UINT64_MAX : 1ULL << CaseSize;
+ assert(MaxTableSize >= TableSize &&
+ "It is impossible for a switch to have more entries than the max "
+ "representable value of its input integer type's size.");
+
+ // If we have a fully covered lookup table, unconditionally branch to the
+ // lookup table BB. Otherwise, check if the condition value is within the case
+ // range. If it is so, branch to the new BB. Otherwise branch to SI's default
+ // destination.
+ const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
+ if (GeneratingCoveredLookupTable) {
+ Builder.CreateBr(LookupBB);
+ SI->getDefaultDest()->removePredecessor(SI->getParent());
+ } else {
+ Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+ MinCaseVal->getType(), TableSize));
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ }
// Populate the BB that does the lookups.
Builder.SetInsertPoint(LookupBB);
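
A standalone restatement of the covered-table test introduced above (plain C++, not the pass's code): an N-bit condition can take at most 2^N distinct values, so a table with exactly that many entries needs no bounds check and can be branched to unconditionally.

    #include <cstdint>

    // Returns true when every possible value of an N-bit switch condition has
    // a table entry, i.e. the range check before the lookup is redundant.
    bool isCoveredLookupTable(unsigned CaseSizeInBits, uint64_t TableSize) {
      uint64_t MaxTableSize =
          CaseSizeInBits > 63 ? UINT64_MAX : (uint64_t(1) << CaseSizeInBits);
      return MaxTableSize == TableSize;
    }
    // Example: an i2 condition with a 4-entry table is covered, so
    // isCoveredLookupTable(2, 4) == true and isCoveredLookupTable(2, 3) == false.
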
@@ -3769,9 +3801,11 @@ static bool SwitchToLookupTable(SwitchInst *SI,
Builder.CreateBr(CommonDest);
// Remove the switch.
- for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
BasicBlock *Succ = SI->getSuccessor(i);
- if (Succ == SI->getDefaultDest()) continue;
+
+ if (Succ == SI->getDefaultDest())
+ continue;
Succ->removePredecessor(SI->getParent());
}
SI->eraseFromParent();
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 094c201..15b3e66 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -17,6 +17,7 @@
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
@@ -26,11 +27,16 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
using namespace llvm;
+static cl::opt<bool>
+ColdErrorCalls("error-reporting-is-cold", cl::init(true),
+ cl::Hidden, cl::desc("Treat error-reporting calls as cold"));
+
/// This class is the abstract base class for the set of optimizations that
/// corresponds to one library call.
namespace {
@@ -118,6 +124,21 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
return false;
}
+/// \brief Check whether the overloaded unary floating point function
+/// corresponding to \a Ty is available.
+static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc::Func DoubleFn, LibFunc::Func FloatFn,
+ LibFunc::Func LongDoubleFn) {
+ switch (Ty->getTypeID()) {
+ case Type::FloatTyID:
+ return TLI->has(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->has(DoubleFn);
+ default:
+ return TLI->has(LongDoubleFn);
+ }
+}
+
//===----------------------------------------------------------------------===//
// Fortified Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -477,7 +498,7 @@ struct StrChrOpt : public LibCallOptimization {
// Compute the offset, make sure to handle the case when we're searching for
// zero (a weird way to spell strlen).
- size_t I = CharC->getSExtValue() == 0 ?
+ size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
Str.size() : Str.find(CharC->getSExtValue());
if (I == StringRef::npos) // Didn't find the char. strchr returns null.
return Constant::getNullValue(CI->getType());
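
The masking change above matters because strchr and strrchr convert their int argument to char before searching, so a constant such as 0x100 really searches for '\0'. A small standalone illustration of the corrected test (plain C++, not the simplifier's code):

    #include <cstdint>

    // strchr(s, C) searches for (char)C, so only the low byte decides whether
    // the call is really a disguised strlen (search for the terminating NUL).
    bool searchesForNul(int64_t SignExtendedChar) {
      return (0xFF & SignExtendedChar) == 0;
    }
    // searchesForNul(0) == true, searchesForNul(0x100) == true (256 -> '\0'),
    // searchesForNul('a') == false.
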
@@ -513,7 +534,7 @@ struct StrRChrOpt : public LibCallOptimization {
}
// Compute the offset.
- size_t I = CharC->getSExtValue() == 0 ?
+ size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
Str.size() : Str.rfind(CharC->getSExtValue());
if (I == StringRef::npos) // Didn't find the char. Return null.
return Constant::getNullValue(CI->getType());
@@ -774,7 +795,7 @@ struct StrPBrkOpt : public LibCallOptimization {
// Constant folding.
if (HasS1 && HasS2) {
size_t I = S1.find_first_of(S2);
- if (I == std::string::npos) // No match.
+ if (I == StringRef::npos) // No match.
return Constant::getNullValue(CI->getType());
return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
@@ -912,7 +933,7 @@ struct StrStrOpt : public LibCallOptimization {
// If both strings are known, constant fold it.
if (HasStr1 && HasStr2) {
- std::string::size_type Offset = SearchStr.find(ToFindStr);
+ size_t Offset = SearchStr.find(ToFindStr);
if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
return Constant::getNullValue(CI->getType());
@@ -1031,7 +1052,7 @@ struct MemSetOpt : public LibCallOptimization {
if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isIntegerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
+ FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)))
return 0;
// memset(p, v, n) -> llvm.memset(p, v, n, 1)
@@ -1133,9 +1154,13 @@ struct PowOpt : public UnsafeFPLibCallOptimization {
Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
- if (Op1C->isExactlyValue(1.0)) // pow(1.0, x) -> 1.0
+ // pow(1.0, x) -> 1.0
+ if (Op1C->isExactlyValue(1.0))
return Op1C;
- if (Op1C->isExactlyValue(2.0)) // pow(2.0, x) -> exp2(x)
+ // pow(2.0, x) -> exp2(x)
+ if (Op1C->isExactlyValue(2.0) &&
+ hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
+ LibFunc::exp2l))
return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
}
@@ -1145,7 +1170,11 @@ struct PowOpt : public UnsafeFPLibCallOptimization {
if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
return ConstantFP::get(CI->getType(), 1.0);
- if (Op2C->isExactlyValue(0.5)) {
+ if (Op2C->isExactlyValue(0.5) &&
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
+ LibFunc::sqrtl) &&
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
+ LibFunc::fabsl)) {
// Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
// This is faster than calling pow, and still handles negative zero
// and negative infinity correctly.
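
For reference, a standalone C++ model (not the simplifier's emitted IR) of what the expansion described in the comment above computes, and why a plain sqrt call is not enough:

    #include <cmath>
    #include <limits>

    // pow(-inf, 0.5) is +inf but sqrt(-inf) is NaN, so -inf is special-cased;
    // fabs() also turns sqrt(-0.0) == -0.0 into the +0.0 that pow returns.
    double powHalf(double X) {
      const double Inf = std::numeric_limits<double>::infinity();
      return X == -Inf ? Inf : std::fabs(std::sqrt(X));
    }
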
@@ -1178,7 +1207,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization {
virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
Value *Ret = NULL;
if (UnsafeFPShrink && Callee->getName() == "exp2" &&
- TLI->has(LibFunc::exp2)) {
+ TLI->has(LibFunc::exp2f)) {
UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
}
@@ -1229,6 +1258,155 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization {
}
};
+struct SinCosPiOpt : public LibCallOptimization {
+ SinCosPiOpt() {}
+
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Make sure the prototype is as expected, otherwise the rest of the
+ // function is probably invalid and likely to abort.
+ if (!isTrigLibCall(CI))
+ return 0;
+
+ Value *Arg = CI->getArgOperand(0);
+ SmallVector<CallInst *, 1> SinCalls;
+ SmallVector<CallInst *, 1> CosCalls;
+ SmallVector<CallInst *, 1> SinCosCalls;
+
+ bool IsFloat = Arg->getType()->isFloatTy();
+
+ // Look for all compatible sinpi, cospi and sincospi calls with the same
+ // argument. If there are enough (in some sense) we can make the
+ // substitution.
+ for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+ UI != UE; ++UI)
+ classifyArgUse(*UI, CI->getParent(), IsFloat, SinCalls, CosCalls,
+ SinCosCalls);
+
+ // It's only worthwhile if both sinpi and cospi are actually used.
+ if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
+ return 0;
+
+ Value *Sin, *Cos, *SinCos;
+ insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos,
+ SinCos);
+
+ replaceTrigInsts(SinCalls, Sin);
+ replaceTrigInsts(CosCalls, Cos);
+ replaceTrigInsts(SinCosCalls, SinCos);
+
+ return 0;
+ }
+
+ bool isTrigLibCall(CallInst *CI) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+
+ // We can only hope to do anything useful if we can ignore things like errno
+ // and floating-point exceptions.
+ bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) &&
+ CI->hasFnAttr(Attribute::ReadNone);
+
+ // Other than that we need float(float) or double(double)
+ return AttributesSafe && FT->getNumParams() == 1 &&
+ FT->getReturnType() == FT->getParamType(0) &&
+ (FT->getParamType(0)->isFloatTy() ||
+ FT->getParamType(0)->isDoubleTy());
+ }
+
+ void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
+ SmallVectorImpl<CallInst *> &SinCalls,
+ SmallVectorImpl<CallInst *> &CosCalls,
+ SmallVectorImpl<CallInst *> &SinCosCalls) {
+ CallInst *CI = dyn_cast<CallInst>(Val);
+
+ if (!CI)
+ return;
+
+ Function *Callee = CI->getCalledFunction();
+ StringRef FuncName = Callee->getName();
+ LibFunc::Func Func;
+ if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) ||
+ !isTrigLibCall(CI))
+ return;
+
+ if (IsFloat) {
+ if (Func == LibFunc::sinpif)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc::cospif)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc::sincospi_stretf)
+ SinCosCalls.push_back(CI);
+ } else {
+ if (Func == LibFunc::sinpi)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc::cospi)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc::sincospi_stret)
+ SinCosCalls.push_back(CI);
+ }
+ }
+
+ void replaceTrigInsts(SmallVectorImpl<CallInst*> &Calls, Value *Res) {
+ for (SmallVectorImpl<CallInst*>::iterator I = Calls.begin(),
+ E = Calls.end();
+ I != E; ++I) {
+ LCS->replaceAllUsesWith(*I, Res);
+ }
+ }
+
+ void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+ bool UseFloat, Value *&Sin, Value *&Cos,
+ Value *&SinCos) {
+ Type *ArgTy = Arg->getType();
+ Type *ResTy;
+ StringRef Name;
+
+ Triple T(OrigCallee->getParent()->getTargetTriple());
+ if (UseFloat) {
+ Name = "__sincospi_stretf";
+
+ assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+ // x86_64 can't use {float, float} since that would be returned in both
+ // xmm0 and xmm1, which isn't what a real struct would do.
+ ResTy = T.getArch() == Triple::x86_64
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy, NULL));
+ } else {
+ Name = "__sincospi_stret";
+ ResTy = StructType::get(ArgTy, ArgTy, NULL);
+ }
+
+ Module *M = OrigCallee->getParent();
+ Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
+ ResTy, ArgTy, NULL);
+
+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+ // If the argument is an instruction, it must dominate all uses so put our
+ // sincos call there.
+ BasicBlock::iterator Loc = ArgInst;
+ B.SetInsertPoint(ArgInst->getParent(), ++Loc);
+ } else {
+ // Otherwise (e.g. for a constant) the beginning of the function is as
+ // good a place as any.
+ BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+ B.SetInsertPoint(&EntryBB, EntryBB.begin());
+ }
+
+ SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+ if (SinCos->getType()->isStructTy()) {
+ Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+ Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+ } else {
+ Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+ "sinpi");
+ Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+ "cospi");
+ }
+ }
+
+};
+
//===----------------------------------------------------------------------===//
// Integer Library Call Optimizations
//===----------------------------------------------------------------------===//
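
The early exit in SinCosPiOpt::callOptimizer above boils down to a simple profitability rule: only emit the combined sincospi call when it will actually replace both kinds of use. Restated as standalone C++ (names are illustrative, not the pass's):

    // Combining is worthwhile if there is already a sincospi-style call to
    // fold into, or if both a sinpi-style and a cospi-style use of the same
    // argument are present.
    bool worthCombining(bool HaveSinCalls, bool HaveCosCalls,
                        bool HaveSinCosCalls) {
      return HaveSinCosCalls || (HaveSinCalls && HaveCosCalls);
    }
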
@@ -1333,6 +1511,54 @@ struct ToAsciiOpt : public LibCallOptimization {
// Formatting and IO Library Call Optimizations
//===----------------------------------------------------------------------===//
+struct ErrorReportingOpt : public LibCallOptimization {
+ ErrorReportingOpt(int S = -1) : StreamArg(S) {}
+
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &) {
+ // Error reporting calls should be cold, mark them as such.
+ // This applies even to non-builtin calls: it is only a hint and applies to
+ // functions that the frontend might not understand as builtins.
+
+ // This heuristic was suggested in:
+ // Improving Static Branch Prediction in a Compiler
+ // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+ // Proceedings of PACT'98, Oct. 1998, IEEE
+
+ if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI)) {
+ CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+ }
+
+ return 0;
+ }
+
+protected:
+ bool isReportingError(Function *Callee, CallInst *CI) {
+ if (!ColdErrorCalls)
+ return false;
+
+ if (!Callee || !Callee->isDeclaration())
+ return false;
+
+ if (StreamArg < 0)
+ return true;
+
+ // These functions might be considered cold, but only if their stream
+ // argument is stderr.
+
+ if (StreamArg >= (int) CI->getNumArgOperands())
+ return false;
+ LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+ if (!LI)
+ return false;
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+ if (!GV || !GV->isDeclaration())
+ return false;
+ return GV->getName() == "stderr";
+ }
+
+ int StreamArg;
+};
+
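
A source-level sketch of the pattern ErrorReportingOpt targets (illustrative only): stream-taking variants are marked cold only when their FILE* argument is a load of the global "stderr", which is the usual shape of an error path.

    #include <cstdio>

    // The fprintf call below sits on an error path and writes to stderr, so
    // marking the call site cold steers block placement and inlining away
    // from it without changing behaviour.
    int checkedDiv(int A, int B) {
      if (B == 0) {
        std::fprintf(stderr, "division by zero\n");  // becomes a cold call site
        return 0;
      }
      return A / B;
    }
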
struct PrintFOpt : public LibCallOptimization {
Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
IRBuilder<> &B) {
@@ -1361,7 +1587,7 @@ struct PrintFOpt : public LibCallOptimization {
// printf("foo\n") --> puts("foo")
if (FormatStr[FormatStr.size()-1] == '\n' &&
- FormatStr.find('%') == std::string::npos) { // no format characters.
+ FormatStr.find('%') == StringRef::npos) { // No format characters.
// Create a string literal with no \n on it. We expect the constant merge
// pass to be run after this pass, to merge duplicate strings.
FormatStr = FormatStr.drop_back();
@@ -1513,6 +1739,9 @@ struct SPrintFOpt : public LibCallOptimization {
struct FPrintFOpt : public LibCallOptimization {
Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
IRBuilder<> &B) {
+ ErrorReportingOpt ER(/* StreamArg = */ 0);
+ (void) ER.callOptimizer(Callee, CI, B);
+
// All the optimizations depend on the format string.
StringRef FormatStr;
if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
@@ -1590,6 +1819,9 @@ struct FPrintFOpt : public LibCallOptimization {
struct FWriteOpt : public LibCallOptimization {
virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ ErrorReportingOpt ER(/* StreamArg = */ 3);
+ (void) ER.callOptimizer(Callee, CI, B);
+
// Require a pointer, an integer, an integer, a pointer, returning integer.
FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
@@ -1623,6 +1855,9 @@ struct FWriteOpt : public LibCallOptimization {
struct FPutsOpt : public LibCallOptimization {
virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ ErrorReportingOpt ER(/* StreamArg = */ 1);
+ (void) ER.callOptimizer(Callee, CI, B);
+
// These optimizations require DataLayout.
if (!TD) return 0;
@@ -1741,6 +1976,7 @@ static MemSetOpt MemSet;
// Math library call optimizations.
static UnaryDoubleFPOpt UnaryDoubleFP(false);
static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+static SinCosPiOpt SinCosPi;
// Integer library call optimizations.
static FFSOpt FFS;
@@ -1750,6 +1986,9 @@ static IsAsciiOpt IsAscii;
static ToAsciiOpt ToAscii;
// Formatting and IO library call optimizations.
+static ErrorReportingOpt ErrorReporting;
+static ErrorReportingOpt ErrorReporting0(0);
+static ErrorReportingOpt ErrorReporting1(1);
static PrintFOpt PrintF;
static SPrintFOpt SPrintF;
static FPrintFOpt FPrintF;
@@ -1825,6 +2064,11 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
case LibFunc::cos:
case LibFunc::cosl:
return &Cos;
+ case LibFunc::sinpif:
+ case LibFunc::sinpi:
+ case LibFunc::cospif:
+ case LibFunc::cospi:
+ return &SinCosPi;
case LibFunc::powf:
case LibFunc::pow:
case LibFunc::powl:
@@ -1859,6 +2103,13 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
return &FPuts;
case LibFunc::puts:
return &Puts;
+ case LibFunc::perror:
+ return &ErrorReporting;
+ case LibFunc::vfprintf:
+ case LibFunc::fiprintf:
+ return &ErrorReporting0;
+ case LibFunc::fputc:
+ return &ErrorReporting1;
case LibFunc::ceil:
case LibFunc::fabs:
case LibFunc::floor:
diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Transforms/Utils/SpecialCaseList.cpp
index b98cb5b..2ef692c 100644
--- a/lib/Transforms/Utils/SpecialCaseList.cpp
+++ b/lib/Transforms/Utils/SpecialCaseList.cpp
@@ -49,29 +49,45 @@ struct SpecialCaseList::Entry {
}
};
-SpecialCaseList::SpecialCaseList(const StringRef Path) {
- // Validate and open blacklist file.
- if (Path.empty()) return;
+SpecialCaseList::SpecialCaseList() : Entries() {}
+
+SpecialCaseList *SpecialCaseList::create(
+ const StringRef Path, std::string &Error) {
+ if (Path.empty())
+ return new SpecialCaseList();
OwningPtr<MemoryBuffer> File;
if (error_code EC = MemoryBuffer::getFile(Path, File)) {
- report_fatal_error("Can't open blacklist file: " + Path + ": " +
- EC.message());
+ Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str();
+ return 0;
}
+ return create(File.get(), Error);
+}
- init(File.get());
+SpecialCaseList *SpecialCaseList::create(
+ const MemoryBuffer *MB, std::string &Error) {
+ OwningPtr<SpecialCaseList> SCL(new SpecialCaseList());
+ if (!SCL->parse(MB, Error))
+ return 0;
+ return SCL.take();
}
-SpecialCaseList::SpecialCaseList(const MemoryBuffer *MB) {
- init(MB);
+SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) {
+ std::string Error;
+ if (SpecialCaseList *SCL = create(Path, Error))
+ return SCL;
+ report_fatal_error(Error);
}
-void SpecialCaseList::init(const MemoryBuffer *MB) {
+bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
// Iterate through each line in the blacklist file.
SmallVector<StringRef, 16> Lines;
SplitString(MB->getBuffer(), Lines, "\n\r");
StringMap<StringMap<std::string> > Regexps;
+ assert(Entries.empty() &&
+ "parse() should be called on an empty SpecialCaseList");
+ int LineNo = 1;
for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end();
- I != E; ++I) {
+ I != E; ++I, ++LineNo) {
// Ignore empty lines and lines starting with "#"
if (I->empty() || I->startswith("#"))
continue;
@@ -80,7 +96,9 @@ void SpecialCaseList::init(const MemoryBuffer *MB) {
StringRef Prefix = SplitLine.first;
if (SplitLine.second.empty()) {
// Missing ':' in the line.
- report_fatal_error("malformed blacklist line: " + SplitLine.first);
+ Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" +
+ SplitLine.first + "'").str();
+ return false;
}
std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("=");
@@ -113,10 +131,11 @@ void SpecialCaseList::init(const MemoryBuffer *MB) {
// Check that the regexp is valid.
Regex CheckRE(Regexp);
- std::string Error;
- if (!CheckRE.isValid(Error)) {
- report_fatal_error("malformed blacklist regex: " + SplitLine.second +
- ": " + Error);
+ std::string REError;
+ if (!CheckRE.isValid(REError)) {
+ Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" +
+ SplitLine.second + "': " + REError).str();
+ return false;
}
// Add this regexp into the proper group by its prefix.
@@ -135,6 +154,7 @@ void SpecialCaseList::init(const MemoryBuffer *MB) {
Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue());
}
}
+ return true;
}
SpecialCaseList::~SpecialCaseList() {
@@ -149,18 +169,12 @@ SpecialCaseList::~SpecialCaseList() {
}
}
-bool SpecialCaseList::findCategory(const Function &F,
- StringRef &Category) const {
- return findCategory(*F.getParent(), Category) ||
- findCategory("fun", F.getName(), Category);
-}
-
bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const {
return isIn(*F.getParent(), Category) ||
inSectionCategory("fun", F.getName(), Category);
}
-static StringRef GetGVTypeString(const GlobalVariable &G) {
+static StringRef GetGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
Type *GType = G.getType()->getElementType();
// For now we support blacklisting struct types only.
@@ -171,46 +185,29 @@ static StringRef GetGVTypeString(const GlobalVariable &G) {
return "<unknown type>";
}
-bool SpecialCaseList::findCategory(const GlobalVariable &G,
- StringRef &Category) const {
- return findCategory(*G.getParent(), Category) ||
- findCategory("global", G.getName(), Category) ||
- findCategory("type", GetGVTypeString(G), Category);
-}
-
bool SpecialCaseList::isIn(const GlobalVariable &G,
const StringRef Category) const {
return isIn(*G.getParent(), Category) ||
inSectionCategory("global", G.getName(), Category) ||
- inSectionCategory("type", GetGVTypeString(G), Category);
+ inSectionCategory("type", GetGlobalTypeString(G), Category);
}
-bool SpecialCaseList::findCategory(const Module &M, StringRef &Category) const {
- return findCategory("src", M.getModuleIdentifier(), Category);
+bool SpecialCaseList::isIn(const GlobalAlias &GA,
+ const StringRef Category) const {
+ if (isIn(*GA.getParent(), Category))
+ return true;
+
+ if (isa<FunctionType>(GA.getType()->getElementType()))
+ return inSectionCategory("fun", GA.getName(), Category);
+
+ return inSectionCategory("global", GA.getName(), Category) ||
+ inSectionCategory("type", GetGlobalTypeString(GA), Category);
}
bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const {
return inSectionCategory("src", M.getModuleIdentifier(), Category);
}
-bool SpecialCaseList::findCategory(const StringRef Section,
- const StringRef Query,
- StringRef &Category) const {
- StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section);
- if (I == Entries.end()) return false;
-
- for (StringMap<Entry>::const_iterator II = I->second.begin(),
- IE = I->second.end();
- II != IE; ++II) {
- if (II->getValue().match(Query)) {
- Category = II->first();
- return true;
- }
- }
-
- return false;
-}
-
bool SpecialCaseList::inSectionCategory(const StringRef Section,
const StringRef Query,
const StringRef Category) const {
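
The SpecialCaseList rework above replaces constructors that called report_fatal_error with a create()/createOrDie() pair, so library users can surface parse errors themselves. A minimal standalone sketch of that factory-with-error-string pattern (simplified types, not the class's real parsing logic):

    #include <cstdlib>
    #include <string>

    struct Blacklist {
      // Mirrors the create()/createOrDie() split above: create reports
      // failures through Error, createOrDie keeps the old aborting behaviour.
      static Blacklist *create(bool FileReadable, std::string &Error) {
        if (!FileReadable) {
          Error = "Can't open file";
          return 0;
        }
        return new Blacklist();
      }
      static Blacklist *createOrDie(bool FileReadable) {
        std::string Error;
        if (Blacklist *List = create(FileReadable, Error))
          return List;
        std::abort();   // the real code calls report_fatal_error(Error)
      }
    };
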
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index cbc1d63..c5e1dcb 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -533,7 +533,7 @@ namespace {
default: break;
case Instruction::GetElementPtr:
// We mark this instruction as zero-cost because scalar GEPs are usually
- // lowered to the intruction addressing mode. At the moment we don't
+ // lowered to the instruction addressing mode. At the moment we don't
// generate vector GEPs.
return 0;
case Instruction::Br:
@@ -625,10 +625,10 @@ namespace {
ConstantInt *IntOff = ConstOffSCEV->getValue();
int64_t Offset = IntOff->getSExtValue();
- Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+ Type *VTy = IPtr->getType()->getPointerElementType();
int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
- Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+ Type *VTy2 = JPtr->getType()->getPointerElementType();
if (VTy != VTy2 && Offset < 0) {
int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
OffsetInElmts = Offset/VTy2TSS;
@@ -1182,6 +1182,8 @@ namespace {
// Look for an instruction with which to pair instruction *I...
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
bool JAfterStart = IAfterStart;
BasicBlock::iterator J = llvm::next(I);
for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
@@ -1403,6 +1405,8 @@ namespace {
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
(void) trackUsesOfI(Users, WriteSet, I, J);
@@ -2227,11 +2231,12 @@ namespace {
// The pointer value is taken to be the one with the lowest offset.
Value *VPtr = IPtr;
- Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
- Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+ Type *ArgTypeI = IPtr->getType()->getPointerElementType();
+ Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
- Type *VArgPtrType = PointerType::get(VArgType,
- cast<PointerType>(IPtr->getType())->getAddressSpace());
+ Type *VArgPtrType
+ = PointerType::get(VArgType,
+ IPtr->getType()->getPointerAddressSpace());
return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
/* insert before */ I);
}
@@ -2240,7 +2245,7 @@ namespace {
unsigned MaskOffset, unsigned NumInElem,
unsigned NumInElem1, unsigned IdxOffset,
std::vector<Constant*> &Mask) {
- unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+ unsigned NumElem1 = J->getType()->getVectorNumElements();
for (unsigned v = 0; v < NumElem1; ++v) {
int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
if (m < 0) {
@@ -2267,18 +2272,18 @@ namespace {
Type *ArgTypeJ = J->getType();
Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
- unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
+ unsigned NumElemI = ArgTypeI->getVectorNumElements();
// Get the total number of elements in the fused vector type.
// By definition, this must equal the number of elements in
// the final mask.
- unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
+ unsigned NumElem = VArgType->getVectorNumElements();
std::vector<Constant*> Mask(NumElem);
Type *OpTypeI = I->getOperand(0)->getType();
- unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+ unsigned NumInElemI = OpTypeI->getVectorNumElements();
Type *OpTypeJ = J->getOperand(0)->getType();
- unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+ unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
// The fused vector will be:
// -----------------------------------------------------
@@ -2340,6 +2345,12 @@ namespace {
return ExpandedIEChain;
}
+ static unsigned getNumScalarElements(Type *Ty) {
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VecTy->getNumElements();
+ return 1;
+ }
+
// Returns the value to be used as the specified operand of the vector
// instruction that fuses I with J.
Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
@@ -2355,17 +2366,8 @@ namespace {
Instruction *L = I, *H = J;
Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
- unsigned numElemL;
- if (ArgTypeL->isVectorTy())
- numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
- else
- numElemL = 1;
-
- unsigned numElemH;
- if (ArgTypeH->isVectorTy())
- numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
- else
- numElemH = 1;
+ unsigned numElemL = getNumScalarElements(ArgTypeL);
+ unsigned numElemH = getNumScalarElements(ArgTypeH);
Value *LOp = L->getOperand(o);
Value *HOp = H->getOperand(o);
@@ -2426,11 +2428,12 @@ namespace {
if (CanUseInputs) {
unsigned LOpElem =
- cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
- ->getNumElements();
+ cast<Instruction>(LOp)->getOperand(0)->getType()
+ ->getVectorNumElements();
+
unsigned HOpElem =
- cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
- ->getNumElements();
+ cast<Instruction>(HOp)->getOperand(0)->getType()
+ ->getVectorNumElements();
// We have one or two input vectors. We need to map each index of the
// operands to the index of the original vector.
@@ -2646,14 +2649,14 @@ namespace {
getReplacementName(IBeforeJ ? I : J,
true, o, 1));
}
-
+
NHOp->insertBefore(IBeforeJ ? J : I);
HOp = NHOp;
}
}
if (ArgType->isVectorTy()) {
- unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+ unsigned numElem = VArgType->getVectorNumElements();
std::vector<Constant*> Mask(numElem);
for (unsigned v = 0; v < numElem; ++v) {
unsigned Idx = v;
@@ -2746,16 +2749,8 @@ namespace {
VectorType *VType = getVecTypeForPair(IType, JType);
unsigned numElem = VType->getNumElements();
- unsigned numElemI, numElemJ;
- if (IType->isVectorTy())
- numElemI = cast<VectorType>(IType)->getNumElements();
- else
- numElemI = 1;
-
- if (JType->isVectorTy())
- numElemJ = cast<VectorType>(JType)->getNumElements();
- else
- numElemJ = 1;
+ unsigned numElemI = getNumScalarElements(IType);
+ unsigned numElemJ = getNumScalarElements(JType);
if (IType->isVectorTy()) {
std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
@@ -2804,6 +2799,8 @@ namespace {
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
for (; cast<Instruction>(L) != J; ++L)
(void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
@@ -2824,6 +2821,8 @@ namespace {
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
+
for (; cast<Instruction>(L) != J;) {
if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
// Move this instruction
@@ -2853,6 +2852,7 @@ namespace {
DenseSet<Value *> Users;
AliasSetTracker WriteSet(*AA);
+ if (I->mayWriteToMemory()) WriteSet.add(I);
// Note: We cannot end the loop when we reach J because J could be moved
// farther down the use chain by another instruction pairing. Also, J
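
The repeated "if (I->mayWriteToMemory()) WriteSet.add(I);" additions in this file all close the same gap: when scanning forward from I, I's own store was not in the tracked write set, so a later read of that location could be treated as independent. A source-level illustration of the dependence that must be respected (plain C++, not the vectorizer's code):

    // J reads the location that I wrote; any pairing or reordering that moves
    // J above I, or fuses them without honouring the store, is unsound.
    int storeThenLoad(int *P) {
      *P = 1;          // I: writes memory
      int X = *P;      // J: depends on I's store
      return X;
    }
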
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a62fedc..5e75871 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -48,6 +48,7 @@
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -126,6 +127,9 @@ static const unsigned MaxVectorWidth = 64;
/// Maximum vectorization unroll count.
static const unsigned MaxUnrollFactor = 16;
+/// The cost of a loop that is considered 'small' by the unroller.
+static const unsigned SmallLoopCost = 20;
+
namespace {
// Forward declarations.
@@ -167,7 +171,9 @@ public:
updateAnalysis();
}
-private:
+ virtual ~InnerLoopVectorizer() {}
+
+protected:
/// A small list of PHINodes.
typedef SmallVector<PHINode*, 4> PhiVector;
/// When we unroll loops we have multiple vector values for each scalar.
@@ -187,7 +193,13 @@ private:
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
- void vectorizeLoop(LoopVectorizationLegality *Legal);
+ virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+ /// \brief The Loop exit block may have single value PHI nodes where the
+ /// incoming value is 'Undef'. While vectorizing we only handled real values
+ /// that were defined inside the loop. Here we fix the 'undef case'.
+ /// See PR14725.
+ void fixLCSSAPHIs();
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
@@ -201,16 +213,23 @@ private:
void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
PhiVector *PV);
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
+ LoopVectorizationLegality *Legal,
+ unsigned UF, unsigned VF, PhiVector *PV);
+
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
/// of scalars.
- void scalarizeInstruction(Instruction *Instr);
+ virtual void scalarizeInstruction(Instruction *Instr);
/// Vectorize Load and Store instructions,
- void vectorizeMemoryInstruction(Instruction *Instr,
+ virtual void vectorizeMemoryInstruction(Instruction *Instr,
LoopVectorizationLegality *Legal);
/// Create a broadcast instruction. This method generates a broadcast
@@ -218,12 +237,12 @@ private:
/// value. If this is the induction variable then we extend it to N, N+1, ...
/// this is needed because each iteration in the loop corresponds to a SIMD
/// element.
- Value *getBroadcastInstrs(Value *V);
+ virtual Value *getBroadcastInstrs(Value *V);
/// This function adds 0, 1, 2 ... to each vector element, starting at zero.
/// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
/// The sequence starts at StartIndex.
- Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+ virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
@@ -233,7 +252,7 @@ private:
VectorParts &getVectorValue(Value *V);
/// Generate a shuffle sequence that will reverse the vector Vec.
- Value *reverseVector(Value *Vec);
+ virtual Value *reverseVector(Value *Vec);
/// This is a helper class that holds the vectorizer state. It maps scalar
/// instructions to vector instructions. When the code is 'unrolled' then
@@ -291,6 +310,8 @@ private:
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
+
+protected:
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
@@ -326,6 +347,22 @@ private:
EdgeMaskCache MaskCache;
};
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+ InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+ DominatorTree *DT, DataLayout *DL,
+ const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
+ InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
+
+private:
+ virtual void scalarizeInstruction(Instruction *Instr);
+ virtual void vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal);
+ virtual Value *getBroadcastInstrs(Value *V);
+ virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+ virtual Value *reverseVector(Value *Vec);
+};
+
/// \brief Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
@@ -409,7 +446,7 @@ public:
MRK_FloatMax
};
- /// This POD struct holds information about reduction variables.
+ /// This struct holds information about reduction variables.
struct ReductionDescriptor {
ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
@@ -446,8 +483,8 @@ public:
MinMaxReductionKind MinMaxKind;
};
- // This POD struct holds information about the memory runtime legality
- // check that a group of pointers do not overlap.
+ /// This struct holds information about the memory runtime legality
+ /// check that a group of pointers do not overlap.
struct RuntimePointerCheck {
RuntimePointerCheck() : Need(false) {}
@@ -457,6 +494,8 @@ public:
Pointers.clear();
Starts.clear();
Ends.clear();
+ IsWritePtr.clear();
+ DependencySetId.clear();
}
/// Insert a pointer and calculate the start and end SCEVs.
@@ -478,7 +517,7 @@ public:
SmallVector<unsigned, 2> DependencySetId;
};
- /// A POD for saving information about induction variables.
+ /// A struct for saving information about induction variables.
struct InductionInfo {
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
@@ -725,9 +764,9 @@ struct LoopVectorizeHints {
/// Vectorization unroll factor.
unsigned Unroll;
- LoopVectorizeHints(const Loop *L)
+ LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
: Width(VectorizationFactor)
- , Unroll(VectorizationUnroll)
+ , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
, LoopID(L->getLoopID()) {
getHints(L);
// The command line options override any loop metadata except for when
@@ -736,6 +775,9 @@ struct LoopVectorizeHints {
Width = VectorizationFactor;
if (VectorizationUnroll.getNumOccurrences() > 0)
Unroll = VectorizationUnroll;
+
+ DEBUG(if (DisableUnrolling && Unroll == 1)
+ dbgs() << "LV: Unrolling disabled by the pass manager\n");
}
/// Return the loop vectorizer metadata prefix.
@@ -762,6 +804,7 @@ struct LoopVectorizeHints {
Vals.push_back(LoopID->getOperand(i));
Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
+ Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
MDNode *NewLoopID = MDNode::get(Context, Vals);
// Set operand 0 to refer to the loop id itself.
@@ -825,15 +868,18 @@ private:
unsigned Val = C->getZExtValue();
if (Hint == "width") {
- assert(isPowerOf2_32(Val) && Val <= MaxVectorWidth &&
- "Invalid width metadata");
- Width = Val;
+ if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
+ Width = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
} else if (Hint == "unroll") {
- assert(isPowerOf2_32(Val) && Val <= MaxUnrollFactor &&
- "Invalid unroll metadata");
- Unroll = Val;
- } else
- DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint);
+ if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
+ Unroll = Val;
+ else
+ DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
+ } else {
+ DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
+ }
}
};
@@ -842,7 +888,8 @@ struct LoopVectorize : public LoopPass {
/// Pass identification, replacement for typeid
static char ID;
- explicit LoopVectorize() : LoopPass(ID) {
+ explicit LoopVectorize(bool NoUnrolling = false)
+ : LoopPass(ID), DisableUnrolling(NoUnrolling) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
@@ -852,6 +899,7 @@ struct LoopVectorize : public LoopPass {
TargetTransformInfo *TTI;
DominatorTree *DT;
TargetLibraryInfo *TLI;
+ bool DisableUnrolling;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@@ -865,17 +913,22 @@ struct LoopVectorize : public LoopPass {
DT = &getAnalysis<DominatorTree>();
TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(true))
+ return false;
+
if (DL == NULL) {
- DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+ DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
return false;
}
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
- LoopVectorizeHints Hints(L);
+ LoopVectorizeHints Hints(L, DisableUnrolling);
- if (Hints.Width == 1) {
+ if (Hints.Width == 1 && Hints.Unroll == 1) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
@@ -912,19 +965,23 @@ struct LoopVectorize : public LoopPass {
unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
VF.Cost);
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
+ F->getParent()->getModuleIdentifier() << '\n');
+ DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
+
if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- return false;
+ if (UF == 1)
+ return false;
+ // We decided not to vectorize, but we may want to unroll.
+ InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+ Unroller.vectorize(&LVL);
+ } else {
+ // If we decided that it is *legal* to vectorize the loop then do it.
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+ LB.vectorize(&LVL);
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
- F->getParent()->getModuleIdentifier()<<"\n");
- DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
-
- // If we decided that it is *legal* to vectorize the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
- LB.vectorize(&LVL);
-
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized(L);
@@ -971,25 +1028,19 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // Save the current insertion location.
- Instruction *Loc = Builder.GetInsertPoint();
-
// We need to place the broadcast of invariant variables outside the loop.
Instruction *Instr = dyn_cast<Instruction>(V);
bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
// Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
- // Restore the builder insertion point.
- if (Invariant)
- Builder.SetInsertPoint(Loc);
-
return Shuf;
}
@@ -1016,10 +1067,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
return Builder.CreateAdd(Val, Cv, "induction");
}
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+static unsigned getGEPInductionOperand(DataLayout *DL,
+ const GetElementPtrInst *Gep) {
+ unsigned LastOperand = Gep->getNumOperands() - 1;
+ unsigned GEPAllocSize = DL->getTypeAllocSize(
+ cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+ // Walk backwards and try to peel off zeros.
+ while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
+ // Find the type we're currently indexing into.
+ gep_type_iterator GEPTI = gep_type_begin(Gep);
+ std::advance(GEPTI, LastOperand - 1);
+
+ // If it's a type with the same allocation size as the result of the GEP we
+ // can peel off the zero index.
+ if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+ break;
+ --LastOperand;
+ }
+
+ return LastOperand;
+}
+
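
getGEPInductionOperand above peels trailing zero indices so the stride check looks at the index that actually varies. In source terms (illustrative C++, not IR), both functions below advance by the induction variable, but only the first has it as the last GEP operand once lowered:

    // For a flat array the induction variable is the last GEP index:
    float *flat(float *A, long I)     { return &A[I]; }       // gep A, I
    // For &A[I][0] the GEP gains a trailing zero index (gep A, I, 0), so the
    // varying index is no longer last; the helper above still finds it.
    float *row(float (*A)[4], long I) { return &A[I][0]; }
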
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
// Make sure that the pointer does not point to structs.
- if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+ if (Ptr->getType()->getPointerElementType()->isAggregateType())
return 0;
// If this value is a pointer induction variable we know it is consecutive.
@@ -1037,8 +1113,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
return 0;
unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = Gep->getOperand(NumOperands - 1);
-
Value *GpPtr = Gep->getPointerOperand();
// If this GEP value is a consecutive pointer induction variable and all of
// the indices are constant then we know it is consecutive. We can
@@ -1062,14 +1136,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
return -1;
}
- // Check that all of the gep indices are uniform except for the last.
- for (unsigned i = 0; i < NumOperands - 1; ++i)
- if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+
+ // Check that all of the gep indices are uniform except for our induction
+ // operand.
+ for (unsigned i = 0; i != NumOperands; ++i)
+ if (i != InductionOperand &&
+ !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return 0;
- // We can emit wide load/stores only if the last index is the induction
- // variable.
- const SCEV *Last = SE->getSCEV(LastIndex);
+ // We can emit wide load/stores only if the last non-zero index is the
+ // induction variable.
+ const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand));
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
const SCEV *Step = AR->getStepRecurrence(*SE);
@@ -1127,6 +1205,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ // An alignment of 0 means target abi alignment. We need to use the scalar's
+ // target abi alignment in such a case.
+ if (!Alignment)
+ Alignment = DL->getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
@@ -1166,7 +1248,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
unsigned NumOperands = Gep->getNumOperands();
- unsigned LastOperand = NumOperands - 1;
+ unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
// Create the new GEP with the new induction variable.
GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -1175,9 +1257,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
// Update last index or loop invariant instruction anchored in loop.
- if (i == LastOperand ||
+ if (i == InductionOperand ||
(GepOperandInst && OrigLoop->contains(GepOperandInst))) {
- assert((i == LastOperand ||
+ assert((i == InductionOperand ||
SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
"Must be last index or loop invariant");
@@ -1301,7 +1383,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instrucions with extracted scalars.
+ // Replace the operands of the cloned instructions with extracted scalars.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
Value *Op = Params[op][Part];
// Param is a vector. Need to extract the right lane.
@@ -1335,11 +1417,9 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
SmallVector<TrackingVH<Value> , 2> Starts;
SmallVector<TrackingVH<Value> , 2> Ends;
+ LLVMContext &Ctx = Loc->getContext();
SCEVExpander Exp(*SE, "induction");
- // Use this type for pointer arithmetic.
- Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
-
for (unsigned i = 0; i < NumPointers; ++i) {
Value *Ptr = PtrRtCheck->Pointers[i];
const SCEV *Sc = SE->getSCEV(Ptr);
@@ -1350,7 +1430,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
Starts.push_back(Ptr);
Ends.push_back(Ptr);
} else {
- DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+ // Use this type for pointer arithmetic.
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
@@ -1372,10 +1456,20 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
continue;
- Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
- Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
- Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
- Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");
+ unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+ unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+ assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+ (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+ "Trying to bounds check pointers with different address spaces");
+
+ Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+ Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+ Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy1, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy0, "bc");
Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
@@ -1390,9 +1484,8 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
// We have to do this trickery because the IRBuilder might fold the check to a
// constant expression in which case there is no Instruction anchored in a
// the block.
- LLVMContext &Ctx = Loc->getContext();
- Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
- ConstantInt::getTrue(Ctx));
+ Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+ ConstantInt::getTrue(Ctx));
ChkBuilder.Insert(Check, "memcheck.conflict");
return Check;
}
@@ -1444,6 +1537,16 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+  // The exit count might have type i64 while the phi is i32. This can
+  // happen if we have an induction variable that is sign extended before the
+  // compare. The only way we get a backedge-taken count in that case is if the
+  // induction variable was signed and as such will not overflow, so
+  // truncation is legal.
+ if (ExitCount->getType()->getPrimitiveSizeInBits() >
+ IdxTy->getPrimitiveSizeInBits())
+ ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
+
+ ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
// Get the total trip count from the count by adding 1.
ExitCount = SE->getAddExpr(ExitCount,
SE->getConstant(ExitCount->getType(), 1));
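
Illustrative only (plain C++, not the SCEV API; widths and values are hypothetical): the type normalization plus the +1 above reduce to:

#include <cassert>
#include <cstdint>

static uint64_t tripCount(uint64_t BackedgeTakenCount, unsigned CountBits,
                          unsigned IdxBits) {
  assert(CountBits <= 64 && IdxBits > 0 && IdxBits <= 64);
  // A wider count is truncated to the induction variable's width (legal,
  // because a signed induction variable cannot have overflowed).
  if (CountBits > IdxBits && IdxBits < 64)
    BackedgeTakenCount &= (uint64_t(1) << IdxBits) - 1;
  // A narrower count is zero-extended, which leaves the value unchanged.
  return BackedgeTakenCount + 1; // trip count = backedge-taken count + 1
}
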
@@ -1496,7 +1599,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
- Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+ Builder.SetInsertPoint(VecBody->getFirstNonPHI());
// Generate the induction variable.
setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
@@ -1724,6 +1827,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
+
+ LoopVectorizeHints Hints(Lp, true);
+ Hints.setAlreadyVectorized(Lp);
}
/// This function returns the identity element (or neutral element) for
@@ -1753,6 +1859,31 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
}
}
+static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
+ Intrinsic::ID ValidIntrinsicID) {
+ if (I.getNumArgOperands() != 1 ||
+ !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+ I.getType() != I.getArgOperand(0)->getType() ||
+ !I.onlyReadsMemory())
+ return Intrinsic::not_intrinsic;
+
+ return ValidIntrinsicID;
+}
+
+static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
+ Intrinsic::ID ValidIntrinsicID) {
+ if (I.getNumArgOperands() != 2 ||
+ !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+ !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
+ I.getType() != I.getArgOperand(0)->getType() ||
+ I.getType() != I.getArgOperand(1)->getType() ||
+ !I.onlyReadsMemory())
+ return Intrinsic::not_intrinsic;
+
+ return ValidIntrinsicID;
+}
+
+
static Intrinsic::ID
getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
// If we have an intrinsic call, check if it is trivially vectorizable.
@@ -1767,11 +1898,13 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
case Intrinsic::log10:
case Intrinsic::log2:
case Intrinsic::fabs:
+ case Intrinsic::copysign:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
+ case Intrinsic::round:
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
@@ -1789,8 +1922,9 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
LibFunc::Func Func;
Function *F = CI->getCalledFunction();
// We're going to make assumptions on the semantics of the functions, check
- // that the target knows that it's available in this environment.
- if (!F || !TLI->getLibFunc(F->getName(), Func))
+ // that the target knows that it's available in this environment and it does
+ // not have local linkage.
+ if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
return Intrinsic::not_intrinsic;
// Otherwise check if we have a call to a function that can be turned into a
@@ -1801,59 +1935,67 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
case LibFunc::sin:
case LibFunc::sinf:
case LibFunc::sinl:
- return Intrinsic::sin;
+ return checkUnaryFloatSignature(*CI, Intrinsic::sin);
case LibFunc::cos:
case LibFunc::cosf:
case LibFunc::cosl:
- return Intrinsic::cos;
+ return checkUnaryFloatSignature(*CI, Intrinsic::cos);
case LibFunc::exp:
case LibFunc::expf:
case LibFunc::expl:
- return Intrinsic::exp;
+ return checkUnaryFloatSignature(*CI, Intrinsic::exp);
case LibFunc::exp2:
case LibFunc::exp2f:
case LibFunc::exp2l:
- return Intrinsic::exp2;
+ return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
case LibFunc::log:
case LibFunc::logf:
case LibFunc::logl:
- return Intrinsic::log;
+ return checkUnaryFloatSignature(*CI, Intrinsic::log);
case LibFunc::log10:
case LibFunc::log10f:
case LibFunc::log10l:
- return Intrinsic::log10;
+ return checkUnaryFloatSignature(*CI, Intrinsic::log10);
case LibFunc::log2:
case LibFunc::log2f:
case LibFunc::log2l:
- return Intrinsic::log2;
+ return checkUnaryFloatSignature(*CI, Intrinsic::log2);
case LibFunc::fabs:
case LibFunc::fabsf:
case LibFunc::fabsl:
- return Intrinsic::fabs;
+ return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ case LibFunc::copysignl:
+ return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
case LibFunc::floor:
case LibFunc::floorf:
case LibFunc::floorl:
- return Intrinsic::floor;
+ return checkUnaryFloatSignature(*CI, Intrinsic::floor);
case LibFunc::ceil:
case LibFunc::ceilf:
case LibFunc::ceill:
- return Intrinsic::ceil;
+ return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
case LibFunc::trunc:
case LibFunc::truncf:
case LibFunc::truncl:
- return Intrinsic::trunc;
+ return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
case LibFunc::rint:
case LibFunc::rintf:
case LibFunc::rintl:
- return Intrinsic::rint;
+ return checkUnaryFloatSignature(*CI, Intrinsic::rint);
case LibFunc::nearbyint:
case LibFunc::nearbyintf:
case LibFunc::nearbyintl:
- return Intrinsic::nearbyint;
+ return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ return checkUnaryFloatSignature(*CI, Intrinsic::round);
case LibFunc::pow:
case LibFunc::powf:
case LibFunc::powl:
- return Intrinsic::pow;
+ return checkBinaryFloatSignature(*CI, Intrinsic::pow);
}
return Intrinsic::not_intrinsic;
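
A schematic of the gating that checkUnaryFloatSignature/checkBinaryFloatSignature add (hypothetical enum and field names, not the LLVM API): a libcall only maps to an intrinsic when arity, types, and read-only-ness line up.

#include <string>

enum class FPIntrinsic { None, Sin, Pow, CopySign };

struct CallShape {
  std::string Name;       // callee name, e.g. "sinf"
  unsigned NumArgs;       // number of arguments
  bool AllFloatSameType;  // all args and the result share one FP type
  bool ReadOnly;          // call only reads memory
};

static FPIntrinsic classify(const CallShape &C) {
  auto Unary = [&](FPIntrinsic ID) {
    return (C.NumArgs == 1 && C.AllFloatSameType && C.ReadOnly)
               ? ID : FPIntrinsic::None;
  };
  auto Binary = [&](FPIntrinsic ID) {
    return (C.NumArgs == 2 && C.AllFloatSameType && C.ReadOnly)
               ? ID : FPIntrinsic::None;
  };
  if (C.Name == "sin" || C.Name == "sinf" || C.Name == "sinl")
    return Unary(FPIntrinsic::Sin);
  if (C.Name == "pow" || C.Name == "powf" || C.Name == "powl")
    return Binary(FPIntrinsic::Pow);
  if (C.Name == "copysign" || C.Name == "copysignf" || C.Name == "copysignl")
    return Binary(FPIntrinsic::CopySign);
  return FPIntrinsic::None;
}
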
@@ -1925,6 +2067,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
return Select;
}
+namespace {
+struct CSEDenseMapInfo {
+ static bool canHandle(Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+ static unsigned getHashValue(Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+ static bool isEqual(Instruction *LHS, Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+}
+
+///\brief Perform cse of induction variable instructions.
+static void cse(BasicBlock *BB) {
+ // Perform simple cse.
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+
+ CSEMap[In] = In;
+ }
+}
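
A self-contained model (hypothetical types, plain C++) of the cse() helper above: key each handled instruction by opcode plus operand list, and point later duplicates at the first occurrence.

#include <cstddef>
#include <unordered_map>
#include <vector>

struct Inst {
  unsigned Opcode;
  std::vector<int> Operands; // operand identities, stand-ins for Value*
};

struct InstHash {
  size_t operator()(const Inst &I) const {
    size_t H = I.Opcode;
    for (int Op : I.Operands)
      H = H * 31 + static_cast<size_t>(Op); // combine opcode and operands
    return H;
  }
};
struct InstEq {
  bool operator()(const Inst &A, const Inst &B) const {
    return A.Opcode == B.Opcode && A.Operands == B.Operands;
  }
};

// For each instruction, return the index of its canonical copy; a duplicate
// maps to the first identical instruction seen, mirroring replaceAllUsesWith.
static std::vector<size_t> cseIndices(const std::vector<Inst> &Block) {
  std::unordered_map<Inst, size_t, InstHash, InstEq> Seen;
  std::vector<size_t> Canon(Block.size());
  for (size_t i = 0; i != Block.size(); ++i) {
    auto It = Seen.find(Block[i]);
    Canon[i] = (It != Seen.end()) ? It->second : i;
    if (It == Seen.end())
      Seen.emplace(Block[i], i);
  }
  return Canon;
}
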
+
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
@@ -1995,18 +2185,31 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
// MinMax reductions have the start value as their identity.
- VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
- "minmax.ident");
+ if (VF == 1) {
+ VectorStart = Identity = RdxDesc.StartValue;
+ } else {
+ VectorStart = Identity = Builder.CreateVectorSplat(VF,
+ RdxDesc.StartValue,
+ "minmax.ident");
+ }
} else {
+ // Handle other reduction kinds:
Constant *Iden =
- LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
- VecTy->getScalarType());
- Identity = ConstantVector::getSplat(VF, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart = Builder.CreateInsertElement(Identity,
- RdxDesc.StartValue, Zero);
+ LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+ VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = RdxDesc.StartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = Builder.CreateInsertElement(Identity,
+ RdxDesc.StartValue, Zero);
+ }
}
// Fix the vector-loop phi.
@@ -2062,37 +2265,40 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
ReducedPartRdx, RdxParts[part]);
}
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant*, 32> ShuffleMask(VF, 0);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i/2; ++j)
- ShuffleMask[j] = Builder.getInt32(i/2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf =
+ if (VF > 1) {
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = ReducedPartRdx;
+ SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i/2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf =
Builder.CreateShuffleVector(TmpVec,
UndefValue::get(TmpVec->getType()),
ConstantVector::get(ShuffleMask),
"rdx.shuf");
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
- "bin.rdx");
- else
- TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
- }
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+ "bin.rdx");
+ else
+ TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
+ }
- // The result is in the first element of the vector.
- Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+ // The result is in the first element of the vector.
+ ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
+ Builder.getInt32(0));
+ }
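
A scalar model (plain C++, add reduction, hypothetical values) of the log2(VF) shuffle reduction above: each round folds the upper half onto the lower half, leaving the result in element 0.

#include <cassert>
#include <vector>

static int reducePow2(std::vector<int> V) {
  assert(!V.empty() && (V.size() & (V.size() - 1)) == 0 &&
         "emission only supports power-of-two VF");
  for (size_t Width = V.size(); Width != 1; Width /= 2)
    for (size_t j = 0; j != Width / 2; ++j)
      V[j] += V[j + Width / 2]; // the 'rdx.shuf' + 'bin.rdx' step
  return V[0];                  // same as the final extractelement at index 0
}
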
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
@@ -2101,7 +2307,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi) continue;
+ if (!LCSSAPhi) break;
// All PHINodes need to have a single entry edge, or two if
// we already fixed them.
@@ -2111,7 +2317,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// incoming bypass edge.
if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
// Add an edge coming from the bypass.
- LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+ LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
break;
}
}// end of the LCSSA phi scan.
@@ -2123,23 +2329,26 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
// Pick the other block.
int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+ (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
- // The Loop exit block may have single value PHI nodes where the incoming
- // value is 'undef'. While vectorizing we only handled real values that
- // were defined inside the loop. Here we handle the 'undef case'.
- // See PR14725.
+ fixLCSSAPHIs();
+
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi) continue;
+ if (!LCSSAPhi) break;
if (LCSSAPhi->getNumIncomingValues() == 1)
LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
LoopMiddleBlock);
}
-}
+}
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
@@ -2200,161 +2409,185 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
return BlockMask;
}
-void
-InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
- BasicBlock *BB, PhiVector *PV) {
- // For each instruction in the old loop.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- VectorParts &Entry = WidenMap.get(it);
- switch (it->getOpcode()) {
- case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- continue;
- case Instruction::PHI:{
- PHINode* P = cast<PHINode>(it);
- // Handle reduction variables:
- if (Legal->getReductionVars()->count(P)) {
- for (unsigned part = 0; part < UF; ++part) {
- // This is phase one of vectorizing PHIs.
- Type *VecTy = VectorType::get(it->getType(), VF);
- Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
- LoopVectorBody-> getFirstInsertionPt());
- }
- PV->push_back(P);
- continue;
- }
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
+ InnerLoopVectorizer::VectorParts &Entry,
+ LoopVectorizationLegality *Legal,
+ unsigned UF, unsigned VF, PhiVector *PV) {
+ PHINode* P = cast<PHINode>(PN);
+ // Handle reduction variables:
+ if (Legal->getReductionVars()->count(P)) {
+ for (unsigned part = 0; part < UF; ++part) {
+ // This is phase one of vectorizing PHIs.
+ Type *VecTy = (VF == 1) ? PN->getType() :
+ VectorType::get(PN->getType(), VF);
+ Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
+ LoopVectorBody-> getFirstInsertionPt());
+ }
+ PV->push_back(P);
+ return;
+ }
- setDebugLocFromInst(Builder, P);
- // Check for PHI nodes that are lowered to vector selects.
- if (P->getParent() != OrigLoop->getHeader()) {
- // We know that all PHIs in non header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = P->getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // ( ...)))
- for (unsigned In = 0; In < NumIncoming; In++) {
- VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
- P->getParent());
- VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
-
- for (unsigned part = 0; part < UF; ++part) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- if (In == 0)
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
- In0[part]);
- else
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
- Entry[part], "predphi");
- }
- }
- continue;
+ setDebugLocFromInst(Builder, P);
+ // Check for PHI nodes that are lowered to vector selects.
+ if (P->getParent() != OrigLoop->getHeader()) {
+ // We know that all PHIs in non header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = P->getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // ( ...)))
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+ P->getParent());
+ VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+ for (unsigned part = 0; part < UF; ++part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ if (In == 0)
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ In0[part]);
+ else
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+ Entry[part], "predphi");
}
+ }
+ return;
+ }
- // This PHINode must be an induction variable.
- // Make sure that we know about it.
- assert(Legal->getInductionVars()->count(P) &&
- "Not an induction variable");
-
- LoopVectorizationLegality::InductionInfo II =
- Legal->getInductionVars()->lookup(P);
-
- switch (II.IK) {
- case LoopVectorizationLegality::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case LoopVectorizationLegality::IK_IntInduction: {
- assert(P->getType() == II.StartValue->getType() && "Types must match");
- Type *PhiTy = P->getType();
- Value *Broadcasted;
- if (P == OldInduction) {
- // Handle the canonical induction variable. We might have had to
- // extend the type.
- Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
- } else {
- // Handle other induction variables that are now based on the
- // canonical one.
- Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
- "normalized.idx");
- NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
- Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
- "offset.idx");
- }
- Broadcasted = getBroadcastInstrs(Broadcasted);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding 0, 1, 2, etc.
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) &&
+ "Not an induction variable");
+
+ LoopVectorizationLegality::InductionInfo II =
+ Legal->getInductionVars()->lookup(P);
+
+ switch (II.IK) {
+ case LoopVectorizationLegality::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case LoopVectorizationLegality::IK_IntInduction: {
+ assert(P->getType() == II.StartValue->getType() && "Types must match");
+ Type *PhiTy = P->getType();
+ Value *Broadcasted;
+ if (P == OldInduction) {
+ // Handle the canonical induction variable. We might have had to
+ // extend the type.
+ Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
+ } else {
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
+ "normalized.idx");
+ NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
+ Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
+ "offset.idx");
+ }
+ Broadcasted = getBroadcastInstrs(Broadcasted);
+ // After broadcasting the induction variable we need to make the vector
+ // consecutive by adding 0, 1, 2, etc.
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
+ return;
+ }
+ case LoopVectorizationLegality::IK_ReverseIntInduction:
+ case LoopVectorizationLegality::IK_PtrInduction:
+ case LoopVectorizationLegality::IK_ReversePtrInduction:
+ // Handle reverse integer and pointer inductions.
+ Value *StartIdx = ExtendedIdx;
+ // This is the normalized GEP that starts counting at zero.
+ Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+ "normalized.idx");
+
+ // Handle the reverse integer induction variable case.
+ if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
+ IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+ Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+ "resize.norm.idx");
+ Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
+ "reverse.idx");
+
+ // This is a new value so do not hoist it out.
+ Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+ // After broadcasting the induction variable we need to make the
+ // vector consecutive by adding ... -3, -2, -1, 0.
for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
- continue;
+ Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
+ true);
+ return;
}
- case LoopVectorizationLegality::IK_ReverseIntInduction:
- case LoopVectorizationLegality::IK_PtrInduction:
- case LoopVectorizationLegality::IK_ReversePtrInduction:
- // Handle reverse integer and pointer inductions.
- Value *StartIdx = ExtendedIdx;
- // This is the normalized GEP that starts counting at zero.
- Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
- "normalized.idx");
- // Handle the reverse integer induction variable case.
- if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
- IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
- Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
- "resize.norm.idx");
- Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
- "reverse.idx");
-
- // This is a new value so do not hoist it out.
- Value *Broadcasted = getBroadcastInstrs(ReverseInd);
- // After broadcasting the induction variable we need to make the
- // vector consecutive by adding ... -3, -2, -1, 0.
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
- true);
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+    // Is this a reverse induction ptr or a consecutive induction ptr?
+ bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
+ II.IK);
+
+ // This is the vector of results. Notice that we don't generate
+ // vector geps because scalar geps result in better code.
+ for (unsigned part = 0; part < UF; ++part) {
+ if (VF == 1) {
+ int EltIndex = (part) * (Reverse ? -1 : 1);
+ Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+ Value *GlobalIdx;
+ if (Reverse)
+ GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+ else
+ GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+
+ Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+ "next.gep");
+ Entry[part] = SclrGep;
continue;
}
- // Handle the pointer induction variable case.
- assert(P->getType()->isPointerTy() && "Unexpected type.");
-
- // Is this a reverse induction ptr or a consecutive induction ptr.
- bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
- II.IK);
-
- // This is the vector of results. Notice that we don't generate
- // vector geps because scalar geps result in better code.
- for (unsigned part = 0; part < UF; ++part) {
- Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
- for (unsigned int i = 0; i < VF; ++i) {
- int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
- Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
- Value *GlobalIdx;
- if (!Reverse)
- GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
- else
- GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
-
- Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
- "next.gep");
- VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
- Builder.getInt32(i),
- "insert.gep");
- }
- Entry[part] = VecVal;
+ Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+ for (unsigned int i = 0; i < VF; ++i) {
+ int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+ Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+ Value *GlobalIdx;
+ if (!Reverse)
+ GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+ else
+ GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+
+ Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+ "next.gep");
+ VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+ Builder.getInt32(i),
+ "insert.gep");
}
- continue;
+ Entry[part] = VecVal;
}
+ return;
+ }
+}
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+ BasicBlock *BB, PhiVector *PV) {
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ VectorParts &Entry = WidenMap.get(it);
+ switch (it->getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ continue;
+ case Instruction::PHI:{
+ // Vectorize PHINodes.
+ widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
+ continue;
}// End of PHI.
case Instruction::Add:
@@ -2413,8 +2646,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));
- Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
- Builder.getInt32(0));
+
+ Value *ScalarCond = (VF == 1) ? Cond[0] :
+ Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+
for (unsigned Part = 0; Part < UF; ++Part) {
Entry[Part] = Builder.CreateSelect(
InvariantCond ? ScalarCond : Cond[Part],
@@ -2475,7 +2710,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
break;
}
/// Vectorize casts.
- Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+ Type *DestTy = (VF == 1) ? CI->getType() :
+ VectorType::get(CI->getType(), VF);
VectorParts &A = getVectorValue(it->getOperand(0));
for (unsigned Part = 0; Part < UF; ++Part)
@@ -2505,7 +2741,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
Args.push_back(Arg[Part]);
}
- Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
+ Type *Tys[] = {CI->getType()};
+ if (VF > 1)
+ Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+
Function *F = Intrinsic::getDeclaration(M, ID, Tys);
Entry[Part] = Builder.CreateCall(F, Args);
}
@@ -2542,19 +2781,36 @@ void InnerLoopVectorizer::updateAnalysis() {
DEBUG(DT->verifyAnalysis());
}
+/// \brief Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ return true;
+ for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
+ if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
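
As a concrete (illustrative, not from the patch) case of what canIfConvertPHINodes rejects: a phi whose incoming value is the constant expression sdiv (i32 1, i32 0) can trap when evaluated, and if-conversion would lower the phi to an unconditionally executed select, so blocks containing such phis are left unpredicated.
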
+
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
if (!EnableIfConversion)
return false;
assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
- std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
// A list of pointers that we can safely read and write to.
SmallPtrSet<Value *, 8> SafePointes;
// Collect safe addresses.
- for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
- BasicBlock *BB = LoopBlocks[i];
+ for (Loop::block_iterator BI = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
if (blockNeedsPredication(BB))
continue;
@@ -2568,16 +2824,22 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
}
// Collect the blocks that need predication.
- for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
- BasicBlock *BB = LoopBlocks[i];
+ BasicBlock *Header = TheLoop->getHeader();
+ for (Loop::block_iterator BI = TheLoop->block_begin(),
+ BE = TheLoop->block_end(); BI != BE; ++BI) {
+ BasicBlock *BB = *BI;
// We don't support switch statements inside loops.
if (!isa<BranchInst>(BB->getTerminator()))
return false;
// We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB) && !blockCanBePredicated(BB, SafePointes))
+ if (blockNeedsPredication(BB)) {
+ if (!blockCanBePredicated(BB, SafePointes))
+ return false;
+ } else if (BB != Header && !canIfConvertPHINodes(BB))
return false;
+
}
// We can if-convert this loop.
@@ -2602,19 +2864,17 @@ bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getExitingBlock())
return false;
- unsigned NumBlocks = TheLoop->getNumBlocks();
+ // We need to have a loop header.
+ DEBUG(dbgs() << "LV: Found a loop: " <<
+ TheLoop->getHeader()->getName() << '\n');
// Check if we can if-convert non single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
- // We need to have a loop header.
- BasicBlock *Latch = TheLoop->getLoopLatch();
- DEBUG(dbgs() << "LV: Found a loop: " <<
- TheLoop->getHeader()->getName() << "\n");
-
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
if (ExitCount == SE->getCouldNotCompute()) {
@@ -2623,6 +2883,7 @@ bool LoopVectorizationLegality::canVectorize() {
}
// Do not loop-vectorize loops with a tiny trip count.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
if (TC > 0u && TC < TinyTripCountVectorThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
@@ -2657,7 +2918,13 @@ bool LoopVectorizationLegality::canVectorize() {
static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
if (Ty->isPointerTy())
- return DL.getIntPtrType(Ty->getContext());
+ return DL.getIntPtrType(Ty);
+
+  // It is possible that chars or shorts overflow when we ask for the loop's
+  // trip count; work around this by changing the type size.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
return Ty;
}
@@ -2682,7 +2949,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
Instruction *U = cast<Instruction>(*I);
// This user may be a reduction exit value.
if (!TheLoop->contains(U)) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
return true;
}
}
@@ -2758,6 +3025,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
DEBUG(dbgs() << "LV: Found an induction variable.\n");
Inductions[Phi] = InductionInfo(StartValue, IK);
+
+ // Until we explicitly handle the case of an induction variable with
+ // an outside loop user we have to give up vectorizing this loop.
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ return false;
+
continue;
}
@@ -2812,9 +3085,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
// Check that the instruction return type is vectorizable.
- if (!VectorType::isValidElementType(it->getType()) &&
- !it->getType()->isVoidTy()) {
- DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(it->getType()) &&
+ !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+ DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
return false;
}
@@ -2904,7 +3178,7 @@ public:
/// non-intersection.
bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
unsigned &NumComparisons, ScalarEvolution *SE,
- Loop *TheLoop);
+ Loop *TheLoop, bool ShouldCheckStride = false);
/// \brief Goes over all memory accesses, checks whether a RT check is needed
/// and builds sets of dependent accesses.
@@ -2918,6 +3192,7 @@ public:
bool isRTCheckNeeded() { return IsRTCheckNeeded; }
bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+ void resetDepChecks() { CheckDeps.clear(); }
MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
@@ -2972,10 +3247,15 @@ static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
return AR->isAffine();
}
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+ const Loop *Lp);
+
bool AccessAnalysis::canCheckPtrAtRT(
LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
unsigned &NumComparisons, ScalarEvolution *SE,
- Loop *TheLoop) {
+ Loop *TheLoop, bool ShouldCheckStride) {
// Find pointers with computable bounds. We are going to use this information
// to place a runtime bound check.
unsigned NumReadPtrChecks = 0;
@@ -3003,7 +3283,10 @@ bool AccessAnalysis::canCheckPtrAtRT(
else
++NumReadPtrChecks;
- if (hasComputableBounds(SE, Ptr)) {
+ if (hasComputableBounds(SE, Ptr) &&
+ // When we run after a failing dependency check we have to make sure we
+ // don't have wrapping pointers.
+ (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
// The id of the dependence set.
unsigned DepId;
@@ -3019,7 +3302,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
- DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
} else {
CanDoRT = false;
}
@@ -3027,9 +3310,36 @@ bool AccessAnalysis::canCheckPtrAtRT(
if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
NumComparisons = 0; // Only one dependence set.
- else
+ else {
NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
NumWritePtrChecks - 1));
+ }
+
+ // If the pointers that we would use for the bounds comparison have different
+ // address spaces, assume the values aren't directly comparable, so we can't
+ // use them for the runtime check. We also have to assume they could
+ // overlap. In the future there should be metadata for whether address spaces
+ // are disjoint.
+ unsigned NumPointers = RtCheck.Pointers.size();
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ for (unsigned j = i + 1; j < NumPointers; ++j) {
+ // Only need to check pointers between two different dependency sets.
+ if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+ continue;
+
+ Value *PtrI = RtCheck.Pointers[i];
+ Value *PtrJ = RtCheck.Pointers[j];
+
+ unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+ unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+ if (ASi != ASj) {
+ DEBUG(dbgs() << "LV: Runtime check would require comparison between"
+ " different address spaces\n");
+ return false;
+ }
+ }
+ }
+
return CanDoRT;
}
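
A hypothetical worked example for the comparison count computed above: with 2 write pointers and 3 read pointers spread across more than one dependency set, NumComparisons = 2 * (3 + 2 - 1) = 8 pairwise bound checks, while a single dependency set short-circuits it to 0. The new address-space scan then gives up on the runtime check entirely if any cross-set pair of pointers lives in different address spaces, since their bounds cannot be compared directly.
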
@@ -3084,7 +3394,7 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) {
!isa<Argument>(UnderlyingObj)) &&
!isIdentifiedObject(UnderlyingObj))) {
DEBUG(dbgs() << "LV: Found an unidentified " <<
- (IsWrite ? "write" : "read" ) << " ptr:" << *UnderlyingObj <<
+ (IsWrite ? "write" : "read" ) << " ptr: " << *UnderlyingObj <<
"\n");
IsRTCheckNeeded = (IsRTCheckNeeded ||
!isIdentifiedObject(UnderlyingObj) ||
@@ -3158,8 +3468,9 @@ public:
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
- MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
- SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
+ MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+ : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+ ShouldRetryWithRuntimeCheck(false) {}
/// \brief Register the location (instructions are given increasing numbers)
/// of a write access.
@@ -3189,6 +3500,10 @@ public:
/// the accesses safely with.
unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+  /// \brief In some cases when the dependency check fails we can still
+ /// vectorize the loop with a dynamic array access check.
+ bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
private:
ScalarEvolution *SE;
DataLayout *DL;
@@ -3206,6 +3521,10 @@ private:
// We can access this many bytes in parallel safely.
unsigned MaxSafeDepDistBytes;
+ /// \brief If we see a non constant dependence distance we can still try to
+ /// vectorize this loop with runtime checks.
+ bool ShouldRetryWithRuntimeCheck;
+
/// \brief Check whether there is a plausible dependence between the two
/// accesses.
///
@@ -3403,6 +3722,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
if (!C) {
DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+ ShouldRetryWithRuntimeCheck = true;
return true;
}
@@ -3428,7 +3748,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
if (Val == 0) {
if (ATy == BTy)
return false;
- DEBUG(dbgs() << "LV: Zero dependence difference but different types");
+ DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
return true;
}
@@ -3437,7 +3757,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
// Positive distance bigger than max vectorization factor.
if (ATy != BTy) {
DEBUG(dbgs() <<
- "LV: ReadWrite-Write positive dependency with different types");
+ "LV: ReadWrite-Write positive dependency with different types\n");
return false;
}
@@ -3454,7 +3774,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2*TypeByteSize > MaxSafeDepDistBytes ||
Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
DEBUG(dbgs() << "LV: Failure because of Positive distance "
- << Val.getSExtValue() << "\n");
+ << Val.getSExtValue() << '\n');
return true;
}
@@ -3467,7 +3787,7 @@ bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return true;
DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
- " with max VF=" << MaxSafeDepDistBytes/TypeByteSize << "\n");
+ " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
return false;
}
@@ -3571,8 +3891,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
Stores.push_back(St);
DepChecker.addAccess(St);
}
- } // next instr.
- } // next block.
+ } // Next instr.
+ } // Next block.
// Now we have two lists that hold the loads and the stores.
// Next, we find the pointers that they use.
@@ -3619,7 +3939,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
return true;
}
- SmallPtrSet<Value *, 16> ReadOnlyPtr;
for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
LoadInst *LD = cast<LoadInst>(*I);
Value* Ptr = LD->getPointerOperand();
@@ -3667,7 +3986,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
if (NumComparisons == 0 && NeedRTCheck)
NeedRTCheck = false;
- // Check that we did not collect too many pointers or found a unsizeable
+ // Check that we did not collect too many pointers or found an unsizeable
// pointer.
if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
PtrRtCheck.reset();
@@ -3693,9 +4012,32 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
Accesses.getDependenciesToCheck());
MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+ if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+ DEBUG(dbgs() << "LV: Retrying with memory checks\n");
+ NeedRTCheck = true;
+
+ // Clear the dependency checks. We assume they are not needed.
+ Accesses.resetDepChecks();
+
+ PtrRtCheck.reset();
+ PtrRtCheck.Need = true;
+
+ CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+ TheLoop, true);
+ // Check that we did not collect too many pointers or found an unsizeable
+ // pointer.
+ if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+ DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
+ PtrRtCheck.reset();
+ return false;
+ }
+
+ CanVecMem = true;
+ }
}
- DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+ DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
" need a runtime memory check.\n");
return CanVecMem;
@@ -3839,6 +4181,12 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
if (ExitInstruction != 0 || Cur == Phi)
return false;
+ // The instruction used by an outside user must be the last instruction
+      // before we feed back to the reduction phi. Otherwise, we lose VF-1
+ // operations on the value.
+ if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+ return false;
+
ExitInstruction = Cur;
continue;
}
@@ -4045,6 +4393,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
if (it->mayWriteToMemory() || it->mayThrow())
return false;
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+ OI != OE; ++OI) {
+ if (Constant *C = dyn_cast<Constant>(*OI))
+ if (C->canTrap())
+ return false;
+ }
+
// The instructions below can trap.
switch (it->getOpcode()) {
default: continue;
@@ -4071,7 +4427,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
// Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
- DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
unsigned WidestType = getWidestType();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
@@ -4082,7 +4438,8 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
WidestRegister : MaxSafeDepDist);
unsigned MaxVectorSize = WidestRegister / WidestType;
DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
- DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+ DEBUG(dbgs() << "LV: The Widest register is: "
+ << WidestRegister << " bits.\n");
if (MaxVectorSize == 0) {
DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -4118,7 +4475,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
if (UserVF != 0) {
assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
- DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
Factor.Width = UserVF;
return Factor;
@@ -4126,13 +4483,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
float Cost = expectedCost(1);
unsigned Width = 1;
- DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+ DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
for (unsigned i=2; i <= VF; i*=2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
float VectorCost = expectedCost(i) / (float)i;
- DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+ DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
(int)VectorCost << ".\n");
if (VectorCost < Cost) {
Cost = VectorCost;
@@ -4256,8 +4613,20 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
else if (UF < 1)
UF = 1;
- if (Legal->getReductionVars()->size()) {
- DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+ bool HasReductions = Legal->getReductionVars()->size();
+
+ // Decide if we want to unroll if we decided that it is legal to vectorize
+ // but not profitable.
+ if (VF == 1) {
+ if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
+ LoopCost > SmallLoopCost)
+ return 1;
+
+ return UF;
+ }
+
+ if (HasReductions) {
+ DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
return UF;
}
@@ -4265,14 +4634,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and unroll until the cost of the
// loop overhead is about 5% of the cost of the loop.
- DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
- if (LoopCost < 20) {
- DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
- unsigned NewUF = 20/LoopCost + 1;
+ DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+ if (LoopCost < SmallLoopCost) {
+ DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
+ unsigned NewUF = SmallLoopCost / (LoopCost + 1);
return std::min(NewUF, UF);
}
- DEBUG(dbgs() << "LV: Not Unrolling. \n");
+ DEBUG(dbgs() << "LV: Not Unrolling.\n");
return 1;
}
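
Hypothetical numbers to make the unrolling heuristic concrete (assuming SmallLoopCost keeps the old literal threshold of 20): a vectorized loop with LoopCost = 4 gets NewUF = 20 / (4 + 1) = 4, clamped by std::min(NewUF, UF); with VF == 1 the same UF is returned only for single-block reduction loops whose cost is at or below the threshold.
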
@@ -4373,16 +4742,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
MaxUsage = std::max(MaxUsage, OpenIntervals.size());
DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
- OpenIntervals.size() <<"\n");
+ OpenIntervals.size() << '\n');
// Add the current instruction to the list of open intervals.
OpenIntervals.insert(I);
}
unsigned Invariant = LoopInvariants.size();
- DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
- DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
- DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+ DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
+ DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+ DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
R.LoopInvariantRegs = Invariant;
R.MaxLocalUsers = MaxUsage;
@@ -4406,8 +4775,8 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
unsigned C = getInstructionCost(it, VF);
BlockCost += C;
- DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
- VF << " For instruction: "<< *it << "\n");
+ DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
+ VF << " For instruction: " << *it << '\n');
}
// We assume that if-converted blocks have a 50% chance of being executed.
@@ -4669,13 +5038,16 @@ char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
- Pass *createLoopVectorizePass() {
- return new LoopVectorize();
+ Pass *createLoopVectorizePass(bool NoUnrolling) {
+ return new LoopVectorize(NoUnrolling);
}
}
@@ -4690,3 +5062,96 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
return false;
}
+
+
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ // Holds vector parameters or scalars, in case of uniform vals.
+ SmallVector<VectorParts, 4> Params;
+
+ setDebugLocFromInst(Builder, Instr);
+
+ // Find all of the vectorized parameters.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *SrcOp = Instr->getOperand(op);
+
+ // If we are accessing the old induction variable, use the new one.
+ if (SrcOp == OldInduction) {
+ Params.push_back(getVectorValue(SrcOp));
+ continue;
+ }
+
+ // Try using previously calculated values.
+ Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+ // If the src is an instruction that appeared earlier in the basic block
+ // then it should already be vectorized.
+ if (SrcInst && OrigLoop->contains(SrcInst)) {
+ assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+ // The parameter is a vector value from earlier.
+ Params.push_back(WidenMap.get(SrcInst));
+ } else {
+ // The parameter is a scalar from outside the loop. Maybe even a constant.
+ VectorParts Scalars;
+ Scalars.append(UF, SrcOp);
+ Params.push_back(Scalars);
+ }
+ }
+
+ assert(Params.size() == Instr->getNumOperands() &&
+ "Invalid number of operands");
+
+  // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ Value *UndefVec = IsVoidRetTy ? 0 :
+ UndefValue::get(Instr->getType());
+ // Create a new entry in the WidenMap and initialize it to Undef or Null.
+ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // For each scalar that we create:
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instructions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op][Part];
+ Cloned->setOperand(op, Op);
+ }
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // If the original scalar returns a value we need to place it in a vector
+ // so that future users will be able to use it.
+ if (!IsVoidRetTy)
+ VecResults[Part] = Cloned;
+ }
+}
+
+void
+InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality*) {
+ return scalarizeInstruction(Instr);
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) {
+ return Vec;
+}
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
+ return V;
+}
+
+Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
+ bool Negate) {
+ // When unrolling and the VF is 1, we only need to add a simple scalar.
+ Type *ITy = Val->getType();
+ assert(!ITy->isVectorTy() && "Val must be a scalar");
+ Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
+ return Builder.CreateAdd(Val, C, "induction");
+}
+
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9312b4b..c72b51f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -25,8 +25,8 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -49,34 +49,24 @@ static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
cl::desc("Only vectorize if you gain more than this "
"number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+ "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Attempt to vectorize horizontal reductions feeding into a store"));
+
namespace {
static const unsigned MinVecRegSize = 128;
static const unsigned RecursionMaxDepth = 12;
-/// RAII pattern to save the insertion point of the IR builder.
-class BuilderLocGuard {
-public:
- BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()),
- DbgLoc(B.getCurrentDebugLocation()) {}
- ~BuilderLocGuard() {
- Builder.SetCurrentDebugLocation(DbgLoc);
- if (Loc)
- Builder.SetInsertPoint(Loc);
- }
-
-private:
- // Prevent copying.
- BuilderLocGuard(const BuilderLocGuard &);
- BuilderLocGuard &operator=(const BuilderLocGuard &);
- IRBuilder<> &Builder;
- AssertingVH<Instruction> Loc;
- DebugLoc DbgLoc;
-};
-
-/// A helper class for numbering instructions in multible blocks.
-/// Numbers starts at zero for each basic block.
+/// A helper class for numbering instructions in multiple blocks.
+/// Numbers start at zero for each basic block.
struct BlockNumbering {
BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
@@ -173,6 +163,37 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
+/// \returns \p I after propagating metadata from \p VL.
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
+ Instruction *I0 = cast<Instruction>(VL[0]);
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ I0->getAllMetadataOtherThanDebugLoc(Metadata);
+
+ for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
+ unsigned Kind = Metadata[i].first;
+ MDNode *MD = Metadata[i].second;
+
+ for (int i = 1, e = VL.size(); MD && i != e; i++) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ MDNode *IMD = I->getMetadata(Kind);
+
+ switch (Kind) {
+ default:
+ MD = 0; // Remove unknown metadata
+ break;
+ case LLVMContext::MD_tbaa:
+ MD = MDNode::getMostGenericTBAA(MD, IMD);
+ break;
+ case LLVMContext::MD_fpmath:
+ MD = MDNode::getMostGenericFPMath(MD, IMD);
+ break;
+ }
+ }
+ I->setMetadata(Kind, MD);
+ }
+ return I;
+}
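
A schematic (plain C++, string-keyed stand-in for MDNode kinds) of propagateMetadata above: a kind survives on the combined instruction only when every scalar agrees on it; real LLVM additionally merges the known kinds (TBAA, fpmath) via getMostGeneric* rather than requiring exact equality.

#include <map>
#include <string>
#include <vector>

using MDMap = std::map<std::string, std::string>; // kind -> payload

static MDMap propagate(const std::vector<MDMap> &Scalars) {
  if (Scalars.empty())
    return MDMap();
  MDMap Result = Scalars.front();          // start from the first scalar
  for (size_t i = 1; i != Scalars.size(); ++i)
    for (auto It = Result.begin(); It != Result.end();) {
      auto Found = Scalars[i].find(It->first);
      if (Found == Scalars[i].end() || Found->second != It->second)
        It = Result.erase(It);             // disagreement: drop the kind
      else
        ++It;
    }
  return Result;
}
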
+
/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
@@ -216,6 +237,104 @@ static bool CanReuseExtract(ArrayRef<Value *> VL) {
return true;
}
+static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right) {
+
+ SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+ bool AllSameOpcodeLeft = true;
+ bool AllSameOpcodeRight = true;
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ Instruction *I = cast<Instruction>(VL[i]);
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+
+ OrigLeft.push_back(V0);
+ OrigRight.push_back(V1);
+
+ Instruction *I0 = dyn_cast<Instruction>(V0);
+ Instruction *I1 = dyn_cast<Instruction>(V1);
+
+ // Check whether all operands on one side have the same opcode. In this case
+ // we want to preserve the original order and not make things worse by
+ // reordering.
+ AllSameOpcodeLeft = I0;
+ AllSameOpcodeRight = I1;
+
+ if (i && AllSameOpcodeLeft) {
+ if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
+ if(P0->getOpcode() != I0->getOpcode())
+ AllSameOpcodeLeft = false;
+ } else
+ AllSameOpcodeLeft = false;
+ }
+ if (i && AllSameOpcodeRight) {
+ if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
+ if(P1->getOpcode() != I1->getOpcode())
+ AllSameOpcodeRight = false;
+ } else
+ AllSameOpcodeRight = false;
+ }
+
+ // Sort two opcodes. In the code below we try to preserve the ability to use
+ // broadcast of values instead of individual inserts.
+ // vl1 = load
+ // vl2 = phi
+ // vr1 = load
+ // vr2 = vr2
+ // = vl1 x vr1
+ // = vl2 x vr2
+    // If we just sorted according to opcode we would leave the first line
+    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+ // = vl1 x vr1
+ // = vr2 x vl2
+    // Because vr2 and vr1 are from the same load we lose the opportunity of a
+ // broadcast for the packed right side in the backend: we have [vr1, vl2]
+ // instead of [vr1, vr2=vr1].
+ if (I0 && I1) {
+ if(!i && I0->getOpcode() > I1->getOpcode()) {
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
+        // Try not to destroy a broadcast for no apparent benefit.
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
+ // Try preserve broadcasts.
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
+ // Try preserve broadcasts.
+ Left.push_back(I1);
+ Right.push_back(I0);
+ } else {
+ Left.push_back(I0);
+ Right.push_back(I1);
+ }
+ continue;
+ }
+ // One opcode, put the instruction on the right.
+ if (I0) {
+ Left.push_back(V1);
+ Right.push_back(I0);
+ continue;
+ }
+ Left.push_back(V0);
+ Right.push_back(V1);
+ }
+
+ bool LeftBroadcast = isSplat(Left);
+ bool RightBroadcast = isSplat(Right);
+
+  // Don't reorder if the operands were good to begin with.
+ if (!(LeftBroadcast || RightBroadcast) &&
+ (AllSameOpcodeRight || AllSameOpcodeLeft)) {
+ Left = OrigLeft;
+ Right = OrigRight;
+ }
+}
+
/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
@@ -238,17 +357,20 @@ public:
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
- void vectorizeTree();
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
int getTreeCost();
- /// Construct a vectorizable tree that starts at \p Roots.
- void buildTree(ArrayRef<Value *> Roots);
+ /// Construct a vectorizable tree that starts at \p Roots and is possibly
+ /// used by a reduction of \p RdxOps.
+ void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
+ RdxOps = 0;
VectorizableTree.clear();
ScalarToTreeEntry.clear();
MustGather.clear();
@@ -278,7 +400,7 @@ private:
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
- Value *alreadyVectorized(ArrayRef<Value *> VL);
+ Value *alreadyVectorized(ArrayRef<Value *> VL) const;
/// \brief Take the pointer operand from the Load/Store instruction.
/// \returns NULL if this is not a valid Load/Store instruction.
@@ -305,26 +427,31 @@ private:
/// \returns the pointer to the barrier instruction if we can't sink.
Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
- /// \returns the index of the last instrucion in the BB from \p VL.
+ /// \returns the index of the last instruction in the BB from \p VL.
int getLastIndex(ArrayRef<Value *> VL);
- /// \returns the Instrucion in the bundle \p VL.
+ /// \returns the Instruction in the bundle \p VL.
Instruction *getLastInstruction(ArrayRef<Value *> VL);
+ /// \brief Set the Builder insert point to one after the last instruction in
+ /// the bundle.
+ void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+ /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree();
+
struct TreeEntry {
TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
NeedToGather(0) {}
/// \returns true if the scalars in VL are equal to this entry.
- bool isSame(ArrayRef<Value *> VL) {
+ bool isSame(ArrayRef<Value *> VL) const {
assert(VL.size() == Scalars.size() && "Invalid size");
- for (int i = 0, e = VL.size(); i != e; ++i)
- if (VL[i] != Scalars[i])
- return false;
- return true;
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
}
/// A vector of scalars.
@@ -393,10 +520,15 @@ private:
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
+ /// A list of blocks that we are going to CSE.
+ SmallSet<BasicBlock *, 8> CSEBlocks;
/// Numbers instructions in different blocks.
DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
+ /// Reduction operators.
+ ValueSet *RdxOps;
+
// Analysis and block reference.
Function *F;
ScalarEvolution *SE;
@@ -409,8 +541,9 @@ private:
IRBuilder<> Builder;
};
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
deleteTree();
+ RdxOps = Rdx;
if (!getSameType(Roots))
return;
buildTree_rec(Roots, 0);
@@ -431,18 +564,20 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
UE = Scalar->use_end(); User != UE; ++User) {
DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n");
- bool Gathered = MustGather.count(*User);
-
// Skip in-tree scalars that become vectors.
- if (ScalarToTreeEntry.count(*User) && !Gathered) {
+ if (ScalarToTreeEntry.count(*User)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
**User << ".\n");
int Idx = ScalarToTreeEntry[*User]; (void) Idx;
assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
continue;
}
+ Instruction *UserInst = dyn_cast<Instruction>(*User);
+ if (!UserInst)
+ continue;
- if (!isa<Instruction>(*User))
+ // Ignore uses that are part of the reduction.
+ if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
continue;
DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
@@ -574,6 +709,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
continue;
}
+ // This user is part of the reduction.
+ if (RdxOps && RdxOps->count(User))
+ continue;
+
// Make sure that we can schedule this unknown user.
BlockNumbering &BN = BlocksNumbers[BB];
int UserIndex = BN.getIndex(User);
@@ -635,6 +774,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
switch (Opcode) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
+
+ // Check for terminator values (e.g. invoke).
+ for (unsigned j = 0; j < VL.size(); ++j)
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i));
+ if (Term) {
+ DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
@@ -658,13 +809,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
case Instruction::Load: {
// Check if the loads are consecutive or if we need to swizzle them.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+ LoadInst *L = cast<LoadInst>(VL[i]);
+ if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
newTreeEntry(VL, false);
DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
return;
}
-
+ }
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
@@ -753,6 +905,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
newTreeEntry(VL, true);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right);
+ buildTree_rec(Left, Depth + 1);
+ buildTree_rec(Right, Depth + 1);
+ return;
+ }
+
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -874,9 +1036,24 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
} else {
- ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
+ // Check whether all second operands are constant.
+ for (unsigned i = 0; i < VL.size(); ++i)
+ if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ break;
+ }
+
+ ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
+ VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
}
return VecCost - ScalarCost;
}
@@ -884,14 +1061,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
return VecStCost - ScalarStCost;
}
default:
@@ -899,19 +1076,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
}
+bool BoUpSLP::isFullyVectorizableTinyTree() {
+ DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
+ VectorizableTree.size() << " is fully vectorizable.\n");
+
+ // We only handle trees of height 2.
+ if (VectorizableTree.size() != 2)
+ return false;
+
+ // Gathering cost would be too much for tiny trees.
+ if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
+ return false;
+
+ return true;
+}
+
int BoUpSLP::getTreeCost() {
int Cost = 0;
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
VectorizableTree.size() << ".\n");
- // Don't vectorize tiny trees. Small load/store chains or consecutive stores
- // of constants will be vectoried in SelectionDAG in MergeConsecutiveStores.
- // The SelectionDAG vectorizer can only handle pairs (trees of height = 2).
- if (VectorizableTree.size() < 3) {
+ // We only vectorize tiny trees if they are fully vectorizable.
+ if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
if (!VectorizableTree.size()) {
assert(!ExternalUses.size() && "We should not have any external users");
}
- return 0;
+ return INT_MAX;
}
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
@@ -992,63 +1182,29 @@ bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
return false;
- // Calculate a constant offset from the base pointer without using SCEV
- // in the supported cases.
- // TODO: Add support for the case where one of the pointers is a GEP that
- // uses the other pointer.
- GetElementPtrInst *GepA = dyn_cast<GetElementPtrInst>(PtrA);
- GetElementPtrInst *GepB = dyn_cast<GetElementPtrInst>(PtrB);
-
- unsigned BW = DL->getPointerSizeInBits(ASA);
+ unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
- int64_t Sz = DL->getTypeStoreSize(Ty);
+ APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
- // Check if PtrA is the base and PtrB is a constant offset.
- if (GepB && GepB->getPointerOperand() == PtrA) {
- APInt Offset(BW, 0);
- if (GepB->accumulateConstantOffset(*DL, Offset))
- return Offset.getSExtValue() == Sz;
- return false;
- }
+ APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
- // Check if PtrB is the base and PtrA is a constant offset.
- if (GepA && GepA->getPointerOperand() == PtrB) {
- APInt Offset(BW, 0);
- if (GepA->accumulateConstantOffset(*DL, Offset))
- return Offset.getSExtValue() == -Sz;
- return false;
- }
+ APInt OffsetDelta = OffsetB - OffsetA;
- // If both pointers are GEPs:
- if (GepA && GepB) {
- // Check that they have the same base pointer and number of indices.
- if (GepA->getPointerOperand() != GepB->getPointerOperand() ||
- GepA->getNumIndices() != GepB->getNumIndices())
- return false;
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == Size;
- // Try to strip the geps. This makes SCEV faster.
- // Make sure that all of the indices except for the last are identical.
- int LastIdx = GepA->getNumIndices();
- for (int i = 0; i < LastIdx - 1; i++) {
- if (GepA->getOperand(i+1) != GepB->getOperand(i+1))
- return false;
- }
-
- PtrA = GepA->getOperand(LastIdx);
- PtrB = GepB->getOperand(LastIdx);
- Sz = 1;
- }
-
- ConstantInt *CA = dyn_cast<ConstantInt>(PtrA);
- ConstantInt *CB = dyn_cast<ConstantInt>(PtrB);
- if (CA && CB) {
- return (CA->getSExtValue() + Sz == CB->getSExtValue());
- }
+ // Compute the base pointer delta needed to make the final offset delta
+ // equal to the size.
+ APInt BaseDelta = Size - OffsetDelta;
- // Calculate the distance.
+ // Otherwise compute the distance with SCEV between the base pointers.
const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
- const SCEV *C = SE->getConstant(PtrSCEVA->getType(), Sz);
+ const SCEV *C = SE->getConstant(BaseDelta);
const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
return X == PtrSCEVB;
}
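// The APInt arithmetic above boils down to simple pointer algebra. With
// hypothetical byte offsets and a 4-byte element (Size = 4): PtrA = Base + 4
// and PtrB = Base + 8 give OffsetDelta = 4 == Size, i.e. consecutive. When
// the stripped base pointers differ, SCEV must prove the bases are exactly
// BaseDelta bytes apart. A standalone sketch of that arithmetic:
static bool consecutiveSketch(int64_t OffsetA, int64_t OffsetB, int64_t Size,
                              bool SameBase, int64_t BaseDistance) {
  int64_t OffsetDelta = OffsetB - OffsetA;
  if (SameBase)
    return OffsetDelta == Size;       // same base: offsets alone decide
  int64_t BaseDelta = Size - OffsetDelta;
  return BaseDistance == BaseDelta;   // distance SCEV would have to prove
}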
@@ -1102,6 +1258,15 @@ Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
return I;
}
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ Instruction *LastInst = getLastInstruction(VL);
+ BasicBlock::iterator NextInst = LastInst;
+ ++NextInst;
+ Builder.SetInsertPoint(VL0->getParent(), NextInst);
+ Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+}
+
Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Value *Vec = UndefValue::get(Ty);
// Generate the 'InsertElement' instruction.
@@ -1109,6 +1274,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(Insrt);
+ CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
if (ScalarToTreeEntry.count(VL[i])) {
@@ -1132,10 +1298,12 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) {
- if (ScalarToTreeEntry.count(VL[0])) {
- int Idx = ScalarToTreeEntry[VL[0]];
- TreeEntry *En = &VectorizableTree[Idx];
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
+ SmallDenseMap<Value*, int>::const_iterator Entry
+ = ScalarToTreeEntry.find(VL[0]);
+ if (Entry != ScalarToTreeEntry.end()) {
+ int Idx = Entry->second;
+ const TreeEntry *En = &VectorizableTree[Idx];
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
@@ -1159,38 +1327,48 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
- BuilderLocGuard Guard(Builder);
+ IRBuilder<>::InsertPointGuard Guard(Builder);
if (E->VectorizedValue) {
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
return E->VectorizedValue;
}
- Type *ScalarTy = E->Scalars[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0]))
+ Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ Type *ScalarTy = VL0->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
if (E->NeedToGather) {
+ setInsertPointAfterBundle(E->Scalars);
return Gather(E->Scalars, VecTy);
}
- Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
unsigned Opcode = VL0->getOpcode();
assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
switch (Opcode) {
case Instruction::PHI: {
PHINode *PH = dyn_cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt());
+ Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
E->VectorizedValue = NewPhi;
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallSet<BasicBlock*, 4> VisitedBBs;
+
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
ValueList Operands;
BasicBlock *IBB = PH->getIncomingBlock(i);
+ if (!VisitedBBs.insert(IBB)) {
+ NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+ continue;
+ }
+
// Prepare the operand vector.
for (unsigned j = 0; j < E->Scalars.size(); ++j)
Operands.push_back(cast<PHINode>(E->Scalars[j])->
@@ -1231,8 +1409,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (int i = 0, e = E->Scalars.size(); i < e; ++i)
INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- Builder.SetInsertPoint(getLastInstruction(E->Scalars));
- Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+ setInsertPointAfterBundle(E->Scalars);
Value *InVec = vectorizeTree(INVL);
@@ -1252,8 +1429,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
}
- Builder.SetInsertPoint(getLastInstruction(E->Scalars));
- Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+ setInsertPointAfterBundle(E->Scalars);
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
@@ -1279,8 +1455,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
}
- Builder.SetInsertPoint(getLastInstruction(E->Scalars));
- Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+ setInsertPointAfterBundle(E->Scalars);
Value *Cond = vectorizeTree(CondVec);
Value *True = vectorizeTree(TrueVec);
@@ -1288,7 +1463,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Value *V = alreadyVectorized(E->Scalars))
return V;
-
+
Value *V = Builder.CreateSelect(Cond, True, False);
E->VectorizedValue = V;
return V;
@@ -1312,13 +1487,15 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::Or:
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
- for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
- }
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+ reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+ else
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
- Builder.SetInsertPoint(getLastInstruction(E->Scalars));
- Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+ setInsertPointAfterBundle(E->Scalars);
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
@@ -1333,41 +1510,46 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
E->VectorizedValue = V;
+
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return propagateMetadata(I, E->Scalars);
+
return V;
}
case Instruction::Load: {
// Loads are inserted at the head of the tree because we don't want to
// sink them all the way down past store instructions.
- Builder.SetInsertPoint(getLastInstruction(E->Scalars));
- Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+ setInsertPointAfterBundle(E->Scalars);
LoadInst *LI = cast<LoadInst>(VL0);
- Value *VecPtr =
- Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+ unsigned AS = LI->getPointerAddressSpace();
+
+ Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
LI->setAlignment(Alignment);
E->VectorizedValue = LI;
- return LI;
+ return propagateMetadata(LI, E->Scalars);
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(VL0);
unsigned Alignment = SI->getAlignment();
+ unsigned AS = SI->getPointerAddressSpace();
ValueList ValueOp;
for (int i = 0, e = E->Scalars.size(); i < e; ++i)
ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
- Builder.SetInsertPoint(getLastInstruction(E->Scalars));
- Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+ setInsertPointAfterBundle(E->Scalars);
Value *VecValue = vectorizeTree(ValueOp);
- Value *VecPtr =
- Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+ Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+ VecTy->getPointerTo(AS));
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
S->setAlignment(Alignment);
E->VectorizedValue = S;
- return S;
+ return propagateMetadata(S, E->Scalars);
}
default:
llvm_unreachable("unknown inst");
@@ -1375,7 +1557,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return 0;
}
-void BoUpSLP::vectorizeTree() {
+Value *BoUpSLP::vectorizeTree() {
Builder.SetInsertPoint(F->getEntryBlock().begin());
vectorizeTree(&VectorizableTree[0]);
@@ -1407,6 +1589,7 @@ void BoUpSLP::vectorizeTree() {
if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PN->getParent());
User->replaceUsesOfWith(Scalar, Ex);
} else if (isa<Instruction>(Vec)){
if (PHINode *PH = dyn_cast<PHINode>(User)) {
@@ -1414,17 +1597,20 @@ void BoUpSLP::vectorizeTree() {
if (PH->getIncomingValue(i) == Scalar) {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(PH->getIncomingBlock(i));
PH->setOperand(i, Ex);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
User->replaceUsesOfWith(Scalar, Ex);
}
} else {
Builder.SetInsertPoint(F->getEntryBlock().begin());
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ CSEBlocks.insert(&F->getEntryBlock());
User->replaceUsesOfWith(Scalar, Ex);
}
@@ -1450,9 +1636,10 @@ void BoUpSLP::vectorizeTree() {
for (Value::use_iterator User = Scalar->use_begin(),
UE = Scalar->use_end(); User != UE; ++User) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
- assert(!MustGather.count(*User) &&
- "Replacing gathered value with undef");
- assert(ScalarToTreeEntry.count(*User) &&
+
+ assert((ScalarToTreeEntry.count(*User) ||
+ // It is legal to replace the reduction users by undef.
+ (RdxOps && RdxOps->count(*User))) &&
"Replacing out-of-tree value with undef");
}
Value *Undef = UndefValue::get(Ty);
@@ -1467,8 +1654,20 @@ void BoUpSLP::vectorizeTree() {
BlocksNumbers[it].forget();
}
Builder.ClearInsertionPoint();
+
+ return VectorizableTree[0].VectorizedValue;
}
+class DTCmp {
+ const DominatorTree *DT;
+
+public:
+ DTCmp(const DominatorTree *DT) : DT(DT) {}
+ bool operator()(const BasicBlock *A, const BasicBlock *B) const {
+ return DT->properlyDominates(A, B);
+ }
+};
+
void BoUpSLP::optimizeGatherSequence() {
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
<< " gather sequences instructions.\n");
@@ -1504,45 +1703,48 @@ void BoUpSLP::optimizeGatherSequence() {
Insert->moveBefore(PreHeader->getTerminator());
}
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
+ std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT));
+
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
// instructions into different buckets based on the insert lane.
- SmallPtrSet<Instruction*, 16> Visited;
- SmallVector<Instruction*, 16> ToRemove;
- ReversePostOrderTraversal<Function*> RPOT(F);
- for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
- E = RPOT.end(); I != E; ++I) {
+ SmallVector<Instruction *, 16> Visited;
+ for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(),
+ E = CSEWorkList.end();
+ I != E; ++I) {
+ assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) &&
+ "Worklist not sorted properly!");
BasicBlock *BB = *I;
- // For all instructions in the function:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- Instruction *In = it;
- if ((!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) ||
- !GatherSeq.count(In))
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = it++;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
continue;
// Check if we can replace this instruction with any of the
// visited instructions.
- for (SmallPtrSet<Instruction*, 16>::iterator v = Visited.begin(),
- ve = Visited.end(); v != ve; ++v) {
+ for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
+ ve = Visited.end();
+ v != ve; ++v) {
if (In->isIdenticalTo(*v) &&
DT->dominates((*v)->getParent(), In->getParent())) {
In->replaceAllUsesWith(*v);
- ToRemove.push_back(In);
+ In->eraseFromParent();
In = 0;
break;
}
}
- if (In)
- Visited.insert(In);
+ if (In) {
+ assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+ Visited.push_back(In);
+ }
}
}
-
- // Erase all of the instructions that we RAUWed.
- for (SmallVectorImpl<Instruction *>::iterator v = ToRemove.begin(),
- ve = ToRemove.end(); v != ve; ++v) {
- assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
- (*v)->eraseFromParent();
- }
+ CSEBlocks.clear();
+ GatherSeq.clear();
}
/// The SLPVectorizer Pass.
@@ -1575,14 +1777,18 @@ struct SLPVectorizer : public FunctionPass {
StoreRefs.clear();
bool Changed = false;
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(true))
+ return false;
+
// Must have DataLayout. We can't require it because some tests run w/o
// triple.
if (!DL)
return false;
// Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
- Attribute::NoImplicitFloat))
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
@@ -1661,6 +1867,21 @@ private:
StoreListMap StoreRefs;
};
+/// \brief Check that the Values in the slice of the VL array still exist in
+/// the WeakVH array.
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
+ SmallVectorImpl<WeakVH> &VH,
+ unsigned SliceBegin,
+ unsigned SliceSize) {
+ for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
+ if (VH[i] != VL[i])
+ return true;
+
+ return false;
+}
+
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
int CostThreshold, BoUpSLP &R) {
unsigned ChainLen = Chain.size();
@@ -1673,11 +1894,19 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
if (!isPowerOf2_32(Sz) || VF < 2)
return false;
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
for (unsigned i = 0, e = ChainLen; i < e; ++i) {
if (i + VF > e)
break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+ continue;
+
DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
<< "\n");
ArrayRef<Value *> Operands = Chain.slice(i, VF);
@@ -1697,7 +1926,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
}
}
- return Changed;
+ return Changed;
}
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
@@ -1764,15 +1993,17 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
if (!SI)
continue;
+ // Don't touch volatile stores.
+ if (!SI->isSimple())
+ continue;
+
// Check that the pointer points to scalars.
Type *Ty = SI->getValueOperand()->getType();
if (Ty->isAggregateType() || Ty->isVectorTy())
return 0;
- // Find the base of the GEP.
- Value *Ptr = SI->getPointerOperand();
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
- Ptr = GEP->getPointerOperand();
+ // Find the base pointer.
+ Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
// Save the store locations.
StoreRefs[Ptr].push_back(SI);
@@ -1797,28 +2028,61 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
// Check that all of the parts are scalar instructions of the same type.
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return 0;
+ return false;
unsigned Opcode0 = I0->getOpcode();
+ Type *Ty0 = I0->getType();
+ unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned VF = MinVecRegSize / Sz;
+
for (int i = 0, e = VL.size(); i < e; ++i) {
Type *Ty = VL[i]->getType();
if (Ty->isAggregateType() || Ty->isVectorTy())
- return 0;
+ return false;
Instruction *Inst = dyn_cast<Instruction>(VL[i]);
if (!Inst || Inst->getOpcode() != Opcode0)
- return 0;
+ return false;
}
- R.buildTree(VL);
- int Cost = R.getTreeCost();
+ bool Changed = false;
- if (Cost >= -SLPCostThreshold)
- return false;
+ // Keep track of values that were deleted by vectorizing in the loop below.
+ SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
- DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
- R.vectorizeTree();
- return true;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ unsigned OpsWidth = 0;
+
+ if (i + VF > e)
+ OpsWidth = e - i;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+ continue;
+
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+ ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+
+ R.buildTree(Ops);
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
}
bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
@@ -1861,30 +2125,405 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
return 0;
}
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+/// vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+/// reduction. A pairwise reduction will generate a mask of
+/// <0,2,...> or <1,3,..> while a splitting reduction will generate
+/// <2,3,undef,undef> for a vector of 4 and NumEltsToRdx = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft,
+ IRBuilder<> &Builder) {
+ assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+ SmallVector<Constant *, 32> ShuffleMask(
+ VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+ if (IsPairwise)
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ else
+ // Move the upper half of the vector to the lower half.
+ for (unsigned i = 0; i != NumEltsToRdx; ++i)
+ ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+ return ConstantVector::get(ShuffleMask);
+}
+
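// For a concrete view of the masks createRdxShuffleMask produces, here is a
// standalone sketch using -1 in place of undef. With VecLen = 8 and
// NumEltsToRdx = 4 it yields <0,2,4,6,-1,...> (pairwise/left),
// <1,3,5,7,-1,...> (pairwise/right) and <4,5,6,7,-1,...> (splitting):
static SmallVector<int, 8> rdxMaskSketch(unsigned VecLen, unsigned NumEltsToRdx,
                                         bool IsPairwise, bool IsLeft) {
  SmallVector<int, 8> Mask(VecLen, -1);
  for (unsigned i = 0; i != NumEltsToRdx; ++i)
    // Pairwise picks the even (left) or odd (right) lanes; splitting picks
    // the upper half of the vector.
    Mask[i] = IsPairwise ? int(2 * i + !IsLeft) : int(NumEltsToRdx + i);
  return Mask;
}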
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) whose leaves are operations that can be put into a vector.
+/// For example, this tree:
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ SmallPtrSet<Value *, 16> ReductionOps;
+ SmallVector<Value *, 32> ReducedVals;
+
+ BinaryOperator *ReductionRoot;
+ PHINode *ReductionPHI;
+
+ /// The opcode of the reduction.
+ unsigned ReductionOpcode;
+ /// The opcode of the values we perform a reduction on.
+ unsigned ReducedValueOpcode;
+ /// The width of one full horizontal reduction operation.
+ unsigned ReduxWidth;
+ /// Should we model this reduction as a pairwise reduction tree or a tree that
+ /// splits the vector in halves and adds those halves.
+ bool IsPairwiseReduction;
+
+public:
+ HorizontalReduction()
+ : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+ ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+ /// \brief Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+ DataLayout *DL) {
+ assert((!Phi ||
+ std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+ "Thi phi needs to use the binary operator");
+
+ // We could have an initial reduction that is not an add.
+ // r *= v1 + v2 + v3 + v4
+ // In such a case start looking for a tree rooted in the first '+'.
+ if (Phi) {
+ if (B->getOperand(0) == Phi) {
+ Phi = 0;
+ B = dyn_cast<BinaryOperator>(B->getOperand(1));
+ } else if (B->getOperand(1) == Phi) {
+ Phi = 0;
+ B = dyn_cast<BinaryOperator>(B->getOperand(0));
+ }
+ }
+
+ if (!B)
+ return false;
+
+ Type *Ty = B->getType();
+ if (Ty->isVectorTy())
+ return false;
+
+ ReductionOpcode = B->getOpcode();
+ ReducedValueOpcode = 0;
+ ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+ ReductionRoot = B;
+ ReductionPHI = Phi;
+
+ if (ReduxWidth < 4)
+ return false;
+
+ // We currently only support adds.
+ if (ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd)
+ return false;
+
+ // Post-order traverse the reduction tree starting at B. We only handle true
+ // trees containing only binary operators.
+ SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+ Stack.push_back(std::make_pair(B, 0));
+ while (!Stack.empty()) {
+ BinaryOperator *TreeN = Stack.back().first;
+ unsigned EdgeToVisit = Stack.back().second++;
+ bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+ // Only handle trees in the current basic block.
+ if (TreeN->getParent() != B->getParent())
+ return false;
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!TreeN->hasOneUse() && TreeN != B)
+ return false;
+
+ // Post-order visit.
+ if (EdgeToVisit == 2 || IsReducedValue) {
+ if (IsReducedValue) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ if (!ReducedValueOpcode)
+ ReducedValueOpcode = TreeN->getOpcode();
+ else if (ReducedValueOpcode != TreeN->getOpcode())
+ return false;
+ ReducedVals.push_back(TreeN);
+ } else {
+ // We need to be able to reassociate the adds.
+ if (!TreeN->isAssociative())
+ return false;
+ ReductionOps.insert(TreeN);
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
+ Value *NextV = TreeN->getOperand(EdgeToVisit);
+ BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+ if (Next)
+ Stack.push_back(std::make_pair(Next, 0));
+ else if (NextV != Phi)
+ return false;
+ }
+ return true;
+ }
+
+ /// \brief Attempt to vectorize the tree found by
+ /// matchAssociativeReduction.
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ if (ReducedVals.empty())
+ return false;
+
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < ReduxWidth)
+ return false;
+
+ Value *VectorizedTree = 0;
+ IRBuilder<> Builder(ReductionRoot);
+ FastMathFlags Unsafe;
+ Unsafe.setUnsafeAlgebra();
+ Builder.SetFastMathFlags(Unsafe);
+ unsigned i = 0;
+
+ for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+ V.buildTree(ValsToReduce, &ReductionOps);
+
+ // Estimate cost.
+ int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ if (Cost >= -SLPCostThreshold)
+ break;
+
+ DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+ << ". (HorRdx)\n");
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree();
+
+ // Emit a reduction.
+ Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ if (VectorizedTree) {
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ } else
+ VectorizedTree = ReducedSubTree;
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
+ for (; i < NumReducedVals; ++i) {
+ Builder.SetCurrentDebugLocation(
+ cast<Instruction>(ReducedVals[i])->getDebugLoc());
+ VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+ ReducedVals[i]);
+ }
+ // Update users.
+ if (ReductionPHI) {
+ assert(ReductionRoot != NULL && "Need a reduction operation");
+ ReductionRoot->setOperand(0, VectorizedTree);
+ ReductionRoot->setOperand(1, ReductionPHI);
+ } else
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ }
+ return VectorizedTree != 0;
+ }
+
+private:
+
+ /// \brief Calculate the cost of a reduction.
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ Type *ScalarTy = FirstReducedVal->getType();
+ Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+ int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+ int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+ IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+ int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+ int ScalarReduxCost =
+ ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+ DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ << " for reduction that starts with " << *FirstReducedVal
+ << " (It is a "
+ << (IsPairwiseReduction ? "pairwise" : "splitting")
+ << " reduction)\n");
+
+ return VecReduxCost - ScalarReduxCost;
+ }
+
+ static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+ Value *R, const Twine &Name = "") {
+ if (Opcode == Instruction::FAdd)
+ return Builder.CreateFAdd(L, R, Name);
+ return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+ }
+
+ /// \brief Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
+ Value *TmpVec = ValToReduce;
+ for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+ if (IsPairwiseReduction) {
+ Value *LeftMask =
+ createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+ Value *RightMask =
+ createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+ Value *LeftShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+ Value *RightShuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+ "rdx.shuf.r");
+ TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+ "bin.rdx");
+ } else {
+ Value *UpperHalf =
+ createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+ TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+ }
+ }
+
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+ }
+};
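// emitReduction() above performs a log2(ReduxWidth) shuffle-and-add schedule.
// A scalar simulation of the splitting variant, assuming a power-of-two
// width, shows why the final value ends up in lane 0:
static double splittingReduceSketch(double Lanes[], unsigned Width) {
  for (unsigned Half = Width / 2; Half != 0; Half /= 2)
    for (unsigned i = 0; i != Half; ++i)
      Lanes[i] += Lanes[i + Half]; // fold the upper half onto the lower half
  return Lanes[0];
}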
+
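// matchAssociativeReduction() above walks the reduction tree with an explicit
// stack and a per-node edge counter; this is a standard iterative post-order
// traversal. A generic sketch over a hypothetical binary node type:
struct BinNodeSketch { BinNodeSketch *Kids[2]; };
template <typename Fn>
static void postorderSketch(BinNodeSketch *Root, Fn Visit) {
  if (!Root)
    return;
  SmallVector<std::pair<BinNodeSketch *, unsigned>, 32> Stack;
  Stack.push_back(std::make_pair(Root, 0u));
  while (!Stack.empty()) {
    BinNodeSketch *N = Stack.back().first;
    unsigned Edge = Stack.back().second++;
    if (Edge == 2) {     // both children visited: emit the node and retract
      Visit(N);
      Stack.pop_back();
      continue;
    }
    if (BinNodeSketch *Kid = N->Kids[Edge])
      Stack.push_back(std::make_pair(Kid, 0u));
  }
}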
+/// \brief Recognize construction of vectors like
+/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches
+///
+static bool findBuildVector(InsertElementInst *IE,
+ SmallVectorImpl<Value *> &Ops) {
+ if (!isa<UndefValue>(IE->getOperand(0)))
+ return false;
+
+ while (true) {
+ Ops.push_back(IE->getOperand(1));
+
+ if (IE->use_empty())
+ return false;
+
+ InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
+ if (!NextUse)
+ return true;
+
+ // If this isn't the final use, make sure the next insertelement is the only
+ // use. It's OK if the final constructed vector is used multiple times.
+ if (!IE->hasOneUse())
+ return false;
+
+ IE = NextUse;
+ }
+
+ return false;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
- // Collect the incoming values from the PHIs.
- for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
- ++instr) {
- PHINode *P = dyn_cast<PHINode>(instr);
-
- if (!P)
- break;
+ SmallSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+ ++instr) {
+ PHINode *P = dyn_cast<PHINode>(instr);
+ if (!P)
+ break;
- // Stop constructing the list when you reach a different type.
- if (Incoming.size() && P->getType() != Incoming[0]->getType()) {
- Changed |= tryToVectorizeList(Incoming, R);
- Incoming.clear();
+ if (!VisitedInstrs.count(P))
+ Incoming.push_back(P);
}
- Incoming.push_back(P);
+ // Sort by type.
+ std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+ // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+ if (NumElts > 1 &&
+ tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+ // Success: start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
}
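// The loop above uses a common sort-then-group idiom: stable-sort the PHIs by
// type, then hand each maximal run of equal-typed values to the vectorizer.
// A generic sketch of that run-walking pattern (names are illustrative only):
template <typename Iter, typename SamePred, typename VisitFn>
static void forEachRunSketch(Iter Begin, Iter End, SamePred Same, VisitFn Visit) {
  while (Begin != End) {
    Iter RunEnd = Begin;
    while (RunEnd != End && Same(*Begin, *RunEnd))
      ++RunEnd;            // extend the run while the grouping key matches
    Visit(Begin, RunEnd);  // process one maximal run, e.g. tryToVectorizeList
    Begin = RunEnd;        // continue with the next group
  }
}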
- if (Incoming.size() > 1)
- Changed |= tryToVectorizeList(Incoming, R);
+ VisitedInstrs.clear();
+
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+ // We may go through BB multiple times so skip the ones we have already checked.
+ if (!VisitedInstrs.insert(it))
+ continue;
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
if (isa<DbgInfoIntrinsic>(it))
continue;
@@ -1902,24 +2541,86 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (!BI)
continue;
- Value *Inst = BI->getOperand(0);
+ // Try to match and vectorize a horizontal reduction.
+ HorizontalReduction HorRdx;
+ if (ShouldVectorizeHor &&
+ HorRdx.matchAssociativeReduction(P, BI, DL) &&
+ HorRdx.tryToReduce(R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ Value *Inst = BI->getOperand(0);
if (Inst == P)
Inst = BI->getOperand(1);
- Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+ if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
continue;
}
+ // Try to vectorize horizontal reductions feeding into a store.
+ if (ShouldStartVectorizeHorAtStore)
+ if (StoreInst *SI = dyn_cast<StoreInst>(it))
+ if (BinaryOperator *BinOp =
+ dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+ HorizontalReduction HorRdx;
+ if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+ HorRdx.tryToReduce(R, TTI)) ||
+ tryToVectorize(BinOp, R))) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
- Changed |= true;
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+ if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ }
+ }
+ }
+ continue;
+ }
+
+ // Try to vectorize trees that start at insertelement instructions.
+ if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
+ SmallVector<Value *, 8> Ops;
+ if (!findBuildVector(IE, Ops))
continue;
+
+ if (tryToVectorizeList(Ops, R)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
}
- for (int i = 0; i < 2; ++i)
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
- Changed |=
- tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+
continue;
}
}